]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
f67539c2 26#ifndef _WIN32
7c673cae 27#include <sys/utsname.h>
f67539c2 28#endif
7c673cae
FG
29#include <sys/uio.h>
30
31#include <boost/lexical_cast.hpp>
32#include <boost/fusion/include/std_pair.hpp>
33
f67539c2
TL
34#include "common/async/waiter.h"
35
36#if defined(__FreeBSD__) || defined(_WIN32)
7c673cae
FG
37#define XATTR_CREATE 0x1
38#define XATTR_REPLACE 0x2
39#else
40#include <sys/xattr.h>
41#endif
42
43#if defined(__linux__)
44#include <linux/falloc.h>
45#endif
46
47#include <sys/statvfs.h>
48
49#include "common/config.h"
50#include "common/version.h"
f67539c2 51#include "common/async/blocked_completion.h"
7c673cae 52
11fdf7f2
TL
53#include "mon/MonClient.h"
54
55#include "messages/MClientCaps.h"
56#include "messages/MClientLease.h"
57#include "messages/MClientQuota.h"
58#include "messages/MClientReclaim.h"
59#include "messages/MClientReclaimReply.h"
7c673cae 60#include "messages/MClientReconnect.h"
11fdf7f2 61#include "messages/MClientReply.h"
7c673cae
FG
62#include "messages/MClientRequest.h"
63#include "messages/MClientRequestForward.h"
11fdf7f2 64#include "messages/MClientSession.h"
7c673cae 65#include "messages/MClientSnap.h"
f67539c2 66#include "messages/MClientMetrics.h"
7c673cae 67#include "messages/MCommandReply.h"
7c673cae
FG
68#include "messages/MFSMap.h"
69#include "messages/MFSMapUser.h"
11fdf7f2
TL
70#include "messages/MMDSMap.h"
71#include "messages/MOSDMap.h"
7c673cae
FG
72
73#include "mds/flock.h"
11fdf7f2 74#include "mds/cephfs_features.h"
7c673cae
FG
75#include "osd/OSDMap.h"
76#include "osdc/Filer.h"
77
78#include "common/Cond.h"
7c673cae
FG
79#include "common/perf_counters.h"
80#include "common/admin_socket.h"
81#include "common/errno.h"
82#include "include/str_list.h"
83
84#define dout_subsys ceph_subsys_client
85
86#include "include/lru.h"
87#include "include/compat.h"
88#include "include/stringify.h"
f67539c2 89#include "include/random.h"
7c673cae
FG
90
91#include "Client.h"
92#include "Inode.h"
93#include "Dentry.h"
b32b8144 94#include "Delegation.h"
7c673cae
FG
95#include "Dir.h"
96#include "ClientSnapRealm.h"
97#include "Fh.h"
98#include "MetaSession.h"
99#include "MetaRequest.h"
100#include "ObjecterWriteback.h"
101#include "posix_acl.h"
102
11fdf7f2 103#include "include/ceph_assert.h"
7c673cae
FG
104#include "include/stat.h"
105
e306af50 106#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
107
108#if HAVE_GETGROUPLIST
109#include <grp.h>
110#include <pwd.h>
111#include <unistd.h>
112#endif
113
114#undef dout_prefix
115#define dout_prefix *_dout << "client." << whoami << " "
116
117#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119// FreeBSD fails to define this
120#ifndef O_DSYNC
121#define O_DSYNC 0x0
122#endif
123// Darwin fails to define this
124#ifndef O_RSYNC
125#define O_RSYNC 0x0
126#endif
127
128#ifndef O_DIRECT
129#define O_DIRECT 0x0
130#endif
131
f67539c2
TL
132// Windows doesn't define those values. While the Posix compatibilty layer
133// doesn't support those values, the Windows native functions do provide
134// similar flags. Special care should be taken if we're going to use those
135// flags in ceph-dokan. The current values are no-ops, while propagating
136// them to the rest of the code might cause the Windows functions to reject
137// them as invalid.
138#ifndef O_NOFOLLOW
139#define O_NOFOLLOW 0x0
140#endif
141
142#ifndef O_SYNC
143#define O_SYNC 0x0
144#endif
145
7c673cae
FG
146#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
b3b6e05e
TL
148#ifndef S_IXUGO
149#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150#endif
151
20effc67
TL
152using std::dec;
153using std::hex;
154using std::list;
155using std::oct;
156using std::pair;
157using std::string;
158using std::vector;
159
adb31ebb
TL
160using namespace TOPNSPC::common;
161
f67539c2
TL
162namespace bs = boost::system;
163namespace ca = ceph::async;
164
7c673cae
FG
165void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
166{
167 Client *client = static_cast<Client*>(p);
168 client->flush_set_callback(oset);
169}
170
b3b6e05e
TL
171bool Client::is_reserved_vino(vinodeno_t &vino) {
172 if (MDS_IS_PRIVATE_INO(vino.ino)) {
173 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
174 return true;
175 }
176 return false;
177}
178
7c673cae
FG
179
180// -------------
181
182Client::CommandHook::CommandHook(Client *client) :
183 m_client(client)
184{
185}
186
9f95a23c
TL
/**
 * Dispatch an admin-socket command to the owning Client.
 *
 * @param command  command name previously registered in _finish_init()
 * @param cmdmap   parsed command arguments
 * @param f        formatter the handlers dump into (wrapped in "result")
 * @param errss    error stream (unused by current handlers)
 * @param out      raw output buffer (unused by current handlers)
 * @return 0 always; unknown commands abort, since registration and
 *         dispatch are expected to stay in sync.
 */
int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    // All dump/kick handlers touch client state, so hold client_lock
    // for the duration of the dispatch.
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      // optional bool flag: also dump per-session caps
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
215
216
217// -------------
218
b3b6e05e
TL
219int Client::get_fd_inode(int fd, InodeRef *in) {
220 int r = 0;
221 if (fd == CEPHFS_AT_FDCWD) {
222 *in = cwd;
223 } else {
224 Fh *f = get_filehandle(fd);
225 if (!f) {
226 r = -CEPHFS_EBADF;
227 } else {
228 *in = f->inode;
229 }
230 }
231 return r;
232}
233
7c673cae
FG
// Start a fresh directory listing over 'in' with the caller's credentials.
// next_offset begins at 2 — presumably offsets 0/1 are reserved for the
// synthetic "." and ".." entries; confirm against readdir implementation.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
239
// Reinitialize the faked-inode allocator: all inos from 1024 up to the
// 32-bit limit become free, and allocation cursors are rewound.
// Faked inos exist so 64-bit ceph inos can be presented through a
// platform ino_t that may be narrower than 8 bytes.
void Client::_reset_faked_inos()
{
  ino_t start = 1024;  // 0..1023 are never handed out
  free_faked_inos.clear();
  // one interval covering [start, 2^32-1]
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  #ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, are using 64B identifiers.
  _use_faked_inos = false;
  #else
  // Fake inos are mandatory when ino_t can't hold 64 bits, and optional
  // via config otherwise.
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
  #endif
}
255
// Allocate the next free faked ino for 'in' and record the mapping.
// Allocation scans forward from the last handed-out value and wraps
// back to 2048 when the free set is exhausted past the cursor.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  // find the first free interval at or beyond the cursor
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped: restart just above the root-reserved range
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell in a gap; jump to the start of the free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor is inside the interval; take the next value
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
277
11fdf7f2
TL
278/*
279 * In the faked mode, if you export multiple subdirectories,
280 * you will see that the inode numbers of the exported subdirectories
281 * are the same. so we distinguish the mount point by reserving
282 * the "fake ids" between "1024~2048" and combining the last
283 * 10bits(0x3ff) of the "root inodes".
284*/
285void Client::_assign_faked_root(Inode *in)
286{
287 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
288 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
289 last_used_faked_root = 0;
290 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
291 }
20effc67 292 ceph_assert(it != free_faked_inos.end());
11fdf7f2
TL
293 vinodeno_t inode_info = in->vino();
294 uint64_t inode_num = (uint64_t)inode_info.ino;
295 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
296 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
20effc67 297 ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);
11fdf7f2
TL
298
299 in->faked_ino = last_used_faked_root;
300 free_faked_inos.erase(in->faked_ino);
301 faked_ino_map[in->faked_ino] = in->vino();
302}
303
7c673cae
FG
304void Client::_release_faked_ino(Inode *in)
305{
306 free_faked_inos.insert(in->faked_ino);
307 faked_ino_map.erase(in->faked_ino);
308}
309
310vinodeno_t Client::_map_faked_ino(ino_t ino)
311{
312 vinodeno_t vino;
313 if (ino == 1)
314 vino = root->vino();
315 else if (faked_ino_map.count(ino))
316 vino = faked_ino_map[ino];
317 else
318 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 319 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
320 return vino;
321}
322
323vinodeno_t Client::map_faked_ino(ino_t ino)
324{
f67539c2 325 std::scoped_lock lock(client_lock);
7c673cae
FG
326 return _map_faked_ino(ino);
327}
328
329// cons/des
330
/**
 * Construct an unmounted Client bound to the given messenger, monitor
 * client and objecter. Takes a ref on the CephContext (released by
 * cct_deleter) and wires up the object cacher with the writeback
 * handler; no network activity happens here — see init()/mount().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: writeback goes through the objecter, all cache
  // commits are reported back via client_flush_set_callback
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
}
380
381
// Destructor: stop the upkeep (tick) thread, then tear down the cache.
// Must be entered without client_lock held — we take it twice ourselves.
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or was aborted and never got a chance to run
  // umount and shutdown, make sure the tick thread still stops.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
403
// Forcibly drop all cached state: open file handles, open directories,
// the dentry LRU and finally the root inode. Caller holds client_lock.
void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // _closedir erases from opened_dirs, hence the begin()-while loop
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino; after trimming, only root (and any root_parents
  // created while mounting subtrees) may remain in inode_map
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
434
435inodeno_t Client::get_root_ino()
436{
f67539c2 437 std::scoped_lock l(client_lock);
7c673cae
FG
438 if (use_faked_inos())
439 return root->faked_ino;
440 else
441 return root->ino;
442}
443
444Inode *Client::get_root()
445{
f67539c2 446 std::scoped_lock l(client_lock);
7c673cae 447 root->ll_get();
b3b6e05e 448 return root.get();
7c673cae
FG
449}
450
451
452// debug crapola
453
454void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
455{
456 filepath path;
457 in->make_long_path(path);
458 ldout(cct, 1) << "dump_inode: "
459 << (disconnected ? "DISCONNECTED ":"")
460 << "inode " << in->ino
461 << " " << path
b3b6e05e 462 << " ref " << in->get_nref()
f67539c2 463 << " " << *in << dendl;
7c673cae
FG
464
465 if (f) {
466 f->open_object_section("inode");
467 f->dump_stream("path") << path;
468 if (disconnected)
469 f->dump_int("disconnected", 1);
470 in->dump(f);
471 f->close_section();
472 }
473
474 did.insert(in);
475 if (in->dir) {
476 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
477 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
478 it != in->dir->dentries.end();
479 ++it) {
480 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
481 if (f) {
482 f->open_object_section("dentry");
483 it->second->dump(f);
484 f->close_section();
485 }
486 if (it->second->inode)
487 dump_inode(f, it->second->inode.get(), did, false);
488 }
489 }
490}
491
492void Client::dump_cache(Formatter *f)
493{
494 set<Inode*> did;
495
11fdf7f2 496 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
497
498 if (f)
499 f->open_array_section("cache");
500
501 if (root)
b3b6e05e 502 dump_inode(f, root.get(), did, true);
7c673cae
FG
503
504 // make a second pass to catch anything disconnected
505 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
506 it != inode_map.end();
507 ++it) {
508 if (did.count(it->second))
509 continue;
510 dump_inode(f, it->second, did, true);
511 }
512
513 if (f)
514 f->close_section();
515}
516
// Dump overall client status (identity, cache counters, epochs,
// blocklist state) for the "status" admin-socket command.
// Caller must already hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // session metadata sent to the MDS at open time
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
548
e306af50 549void Client::_pre_init()
7c673cae
FG
550{
551 timer.init();
e306af50
TL
552
553 objecter_finisher.start();
554 filer.reset(new Filer(objecter, &objecter_finisher));
f67539c2 555 objecter->enable_blocklist_events();
e306af50 556
7c673cae 557 objectcacher->start();
e306af50
TL
558}
559
// Full client initialization: transition initialize_state
// NEW -> INITIALIZING -> INITIALIZED, hook up the dispatcher and
// register perf counters / admin commands. Returns 0.
int Client::init()
{
  // must be the first (only) writer moving the state machine forward
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
574
// Late initialization: create the perf counters, register as a config
// observer, and register all admin-socket commands handled by
// CommandHook::call(). Registration failures are logged, not fatal.
void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
					   &m_command_hook,
					   "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  // optional cap_dump flag consumed in CommandHook::call()
  ret = admin_socket->register_command("mds_sessions "
				       "name=cap_dump,type=CephBool,req=false",
				       &m_command_hook,
				       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
				       &m_command_hook,
				       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
				       &m_command_hook,
				       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
				       &m_command_hook,
				       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
}
630
// Orderly shutdown: stop the tick thread, close sessions, tear down
// finishers and the object cacher, drain in-flight readers of
// initialize_state, then stop the timer and objecter finisher.
// Ordering here is deliberate — do not reorder without care.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // To make sure the tick thread will be stopped before
    // destructing the Client, just in case the _mount()
    // failed but didn't get a chance to stop the tick
    // thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // stop each optional callback finisher only if its callback was set
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new incoming "reader" and then try to wait for all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
710
711
712// ===================
713// metadata cache stuff
714
// Trim the dentry LRU down to client_cache_size (or to zero while
// unmounting). Optionally asks the kernel to drop its dcache too, and
// releases the root inode once nothing else references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until the LRU stops shrinking (trim_dentry may pin entries)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // while unmounting we trim unconditionally, ignoring 'max'
    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: the kernel dcache is pinning us, invalidate it
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
742
743void Client::trim_cache_for_reconnect(MetaSession *s)
744{
745 mds_rank_t mds = s->mds_num;
11fdf7f2 746 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
747
748 int trimmed = 0;
749 list<Dentry*> skipped;
750 while (lru.lru_get_size() > 0) {
751 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
752 if (!dn)
753 break;
754
755 if ((dn->inode && dn->inode->caps.count(mds)) ||
756 dn->dir->parent_inode->caps.count(mds)) {
757 trim_dentry(dn);
758 trimmed++;
759 } else
760 skipped.push_back(dn);
761 }
762
763 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
764 lru.lru_insert_mid(*p);
765
11fdf7f2 766 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
767 << " trimmed " << trimmed << " dentries" << dendl;
768
769 if (s->caps.size() > 0)
770 _invalidate_kernel_dcache();
771}
772
// Unlink a single dentry from the cache. If it still points at an
// inode, the parent directory can no longer be considered complete or
// ordered, so clear those flags first.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
785
786
1adf2230
AA
// Apply MDS-reported size/truncation state to an inode. Newer
// truncate_seq (or same seq with a larger size) wins; on an actual
// truncation the cached and inline file data are trimmed to match.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      // NOTE(review): invalidation starts at truncate_size but the length
      // is prior_size - truncate_size while the guard compares prior_size
      // to 'size' — confirm this asymmetry is intentional upstream.
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size may change independently of the file size
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
828
// Merge MDS-reported ctime/mtime/atime into the inode, honoring
// time_warp_seq ordering and which caps the client currently holds:
// with exclusive-ish caps our local times may be newer than the MDS's,
// so only strictly newer warp sequences override them.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // a lower warp seq without exclusive caps indicates inconsistency
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
880
881void Client::_fragmap_remove_non_leaves(Inode *in)
882{
883 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
884 if (!in->dirfragtree.is_leaf(p->first))
885 in->fragmap.erase(p++);
886 else
887 ++p;
888}
889
890void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
891{
892 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
893 if (p->second == mds)
894 in->fragmap.erase(p++);
895 else
896 ++p;
897}
898
/**
 * Insert or refresh an inode from an MDS-supplied InodeStat.
 *
 * Creates the Inode on first sight (wiring up root/cwd/faked inos as
 * needed), then selectively merges fields depending on which caps are
 * newly issued and whether the stat is strictly newer than our cached
 * version. Finally registers/updates the cap itself.
 *
 * @param st             decoded inode stat from the MDS reply
 * @param from           timestamp the stat was received (cap freshness)
 * @param session        MDS session the stat arrived on
 * @param request_perms  credentials to associate with the new cap
 * @return the cached (possibly newly created) Inode
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever seen becomes the mount root
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      // while mounting, remember ancestors discovered above the root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold (issued or dirty); fields covered by an
  // exclusive cap we hold must NOT be clobbered by the MDS copy
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes don't get real caps; just accumulate snap caps
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}
1056
1057
1058/*
1059 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1060 */
1061Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
1062 Inode *in, utime_t from, MetaSession *session,
1063 Dentry *old_dentry)
1064{
1065 Dentry *dn = NULL;
1066 if (dir->dentries.count(dname))
1067 dn = dir->dentries[dname];
1068
11fdf7f2 1069 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
1070 << " in dir " << dir->parent_inode->vino() << " dn " << dn
1071 << dendl;
1072
1073 if (dn && dn->inode) {
1074 if (dn->inode->vino() == in->vino()) {
1075 touch_dn(dn);
1076 ldout(cct, 12) << " had dentry " << dname
1077 << " with correct vino " << dn->inode->vino()
1078 << dendl;
1079 } else {
1080 ldout(cct, 12) << " had dentry " << dname
1081 << " with WRONG vino " << dn->inode->vino()
1082 << dendl;
1083 unlink(dn, true, true); // keep dir, keep dentry
1084 }
1085 }
1086
1087 if (!dn || !dn->inode) {
1088 InodeRef tmp_ref(in);
1089 if (old_dentry) {
1090 if (old_dentry->dir != dir) {
1091 Inode *old_diri = old_dentry->dir->parent_inode;
7c673cae
FG
1092 clear_dir_complete_and_ordered(old_diri, false);
1093 }
1094 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
1095 }
1096 Inode *diri = dir->parent_inode;
7c673cae
FG
1097 clear_dir_complete_and_ordered(diri, false);
1098 dn = link(dir, dname, in, dn);
1099 }
1100
1101 update_dentry_lease(dn, dlease, from, session);
1102 return dn;
1103}
1104
1105void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1106{
1107 utime_t dttl = from;
1108 dttl += (float)dlease->duration_ms / 1000.0;
f67539c2
TL
1109
1110 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
7c673cae 1111
11fdf7f2 1112 ceph_assert(dn);
7c673cae 1113
9f95a23c 1114 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
1115 if (dttl > dn->lease_ttl) {
1116 ldout(cct, 10) << "got dentry lease on " << dn->name
1117 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1118 dn->lease_ttl = dttl;
1119 dn->lease_mds = session->mds_num;
1120 dn->lease_seq = dlease->seq;
1121 dn->lease_gen = session->cap_gen;
1122 }
1123 }
1124 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
1125 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1126 dn->mark_primary();
f67539c2 1127 dn->alternate_name = std::move(dlease->alternate_name);
7c673cae
FG
1128}
1129
1130
1131/*
1132 * update MDS location cache for a single inode
1133 */
522d829b 1134void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
7c673cae
FG
1135{
1136 // auth
1137 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1138 if (dst->auth >= 0) {
1139 in->fragmap[dst->frag] = dst->auth;
1140 } else {
1141 in->fragmap.erase(dst->frag);
1142 }
1143 if (!in->dirfragtree.is_leaf(dst->frag)) {
1144 in->dirfragtree.force_to_leaf(cct, dst->frag);
1145 _fragmap_remove_non_leaves(in);
1146 }
1147
522d829b
TL
1148 // replicated, only update from auth mds reply
1149 if (from == dst->auth) {
1150 in->dir_replicated = !dst->dist.empty();
1151 if (!dst->dist.empty())
1152 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1153 else
1154 in->frag_repmap.erase(dst->frag);
1155 }
7c673cae
FG
1156}
1157
1158void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1159{
f91f0fd5
TL
1160 if (complete)
1161 diri->dir_release_count++;
1162 else
1163 diri->dir_ordered_count++;
7c673cae
FG
1164 if (diri->flags & I_COMPLETE) {
1165 if (complete) {
1166 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1167 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1168 } else {
1169 if (diri->flags & I_DIR_ORDERED) {
1170 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1171 diri->flags &= ~I_DIR_ORDERED;
1172 }
1173 }
1174 if (diri->dir)
1175 diri->dir->readdir_cache.clear();
1176 }
1177}
1178
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // Newer MDSes (REPLY_ENCODING feature) use the versioned encoding;
  // signal that with an all-ones feature mask, else fall back to the
  // connection's negotiated feature bits.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // Resume state carried in the dir_result_t: which frag we asked for,
    // the next offset to assign, and the last name returned.
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0/1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // MDS may have answered for a different (e.g. refragmented) frag
    // than the one we requested; restart offsets in that case.
    // (log prefix says "insert_trace" for historical reasons)
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // If this is the very start of a full listing, snapshot the dir's
    // generation counters so we can later decide whether the readdir
    // cache stayed valid for the whole traversal.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // Each entry is (name, lease, inode stat) in the extra bl.
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }
      // consumes dlease.alternate_name for this entry
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// In hash order, the per-entry offset restarts at 2 whenever the
	// name hash changes.
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      // (only while the generation snapshot taken above is still current)
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // Remember where to resume: last name seen, and the next offset
    // (reset to 2 if this frag is exhausted).
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1341
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the (dir inode, dir stat, dentry name, dentry lease, target
 * inode) trace attached to an MDS reply and applies it to the client
 * metadata cache.  Returns the target inode of the operation (possibly
 * NULL for traceless or unsafe replies).
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // The unsafe reply already carried (and applied) the trace; the safe
    // reply that follows it is expected to be traceless.
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // Traceless reply: we cannot trust our cached dentry state for the
    // affected directory anymore, so invalidate completeness and undo
    // the obvious effects of successful rename/unlink/rmdir ops.
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  // Pick the decode feature set: versioned encoding for new MDSes,
  // otherwise the connection's negotiated features.
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // Decode order matters and must match the MDS encoding:
  // dir inode stat, dir stat, dentry name, dentry lease, then (below)
  // the target inode stat.
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // Debug mode: a getattr/lookup/open/create that wanted xattrs must
      // get them back, else abort loudly.
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // Dentry with no target inode: a negative dentry.  Drop any stale
      // positive dentry, then cache the negative lease if one was given.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  // drop the reference update_snap_trace took
  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1516
1517// -------
1518
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend target; the MDS
 * responsible for the hashed dentry fragment (or a random replica for
 * read requests); the auth cap holder; any cap holder; and finally a
 * random up MDS.  If the choice came from a dirfrag hash, *phash_diri
 * is set to the directory inode so the caller can invalidate its
 * fragmap if the target turns out to be stopped.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // 1. honor an explicit resend target (set by forward/kick), one shot only
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // 2. derive an inode (and possibly a dentry-name hash) from the request
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // 3. snapped inodes have no caps of their own; walk up to the first
    //    non-snap ancestor and give up on hash-based placement
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    // 4. hash-based placement: map the name hash to a dirfrag, then the
    //    frag to an MDS (random replica for reads, auth map otherwise)
    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
	auto repmapit = in->frag_repmap.find(fg);
	if (repmapit != in->frag_repmap.end()) {
	  auto& repmap = repmapit->second;
	  auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
	  mds = repmap.at(r);
	}
      } else if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	req->send_to_auth = true;
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // 5. cap-based placement: prefer the auth cap holder, else any cap
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  // 6. last resort: any up MDS
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1629
7c673cae
FG
1630void Client::connect_mds_targets(mds_rank_t mds)
1631{
11fdf7f2
TL
1632 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1633 ceph_assert(mds_sessions.count(mds));
7c673cae 1634 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
f67539c2
TL
1635 for (const auto &rank : info.export_targets) {
1636 if (mds_sessions.count(rank) == 0 &&
1637 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
7c673cae 1638 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
f67539c2
TL
1639 << " export target mds." << rank << dendl;
1640 _open_mds_session(rank);
7c673cae
FG
1641 }
1642 }
1643}
1644
adb31ebb 1645void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1646{
1647 f->dump_int("id", get_nodeid().v);
11fdf7f2 1648 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1649 f->dump_object("inst", inst);
1650 f->dump_stream("inst_str") << inst;
1651 f->dump_stream("addr_str") << inst.addr;
7c673cae 1652 f->open_array_section("sessions");
11fdf7f2 1653 for (const auto &p : mds_sessions) {
7c673cae 1654 f->open_object_section("session");
20effc67 1655 p.second->dump(f, cap_dump);
7c673cae
FG
1656 f->close_section();
1657 }
1658 f->close_section();
1659 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1660}
f67539c2 1661
7c673cae
FG
1662void Client::dump_mds_requests(Formatter *f)
1663{
1664 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1665 p != mds_requests.end();
1666 ++p) {
1667 f->open_object_section("request");
1668 p->second->dump(f);
1669 f->close_section();
1670 }
1671}
1672
/*
 * Post-process a successful reply when the caller asked for the target
 * inode (ptarget): determine whether this request performed the create
 * (*pcreated), and — for traceless replies — recover the target inode by
 * looking it up again.  Returns the (possibly updated) result code.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // Two encodings of the created ino, depending on MDS features.
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target.
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // Traceless reply: try the created ino in our inode map first.
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen?  i want logs!");
	}
      } else {
	// No dentry: refresh the request's inode directly.
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	// (a mismatch means someone else won the create race)
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -CEPHFS_EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1757
1758
1759/**
1760 * make a request
1761 *
1762 * Blocking helper to make an MDS request.
1763 *
1764 * If the ptarget flag is set, behavior changes slightly: the caller
1765 * expects to get a pointer to the inode we are creating or operating
1766 * on. As a result, we will follow up any traceless mutation reply
1767 * with a getattr or lookup to transparently handle a traceless reply
1768 * from the MDS (as when the MDS restarts and the client has to replay
1769 * a request).
1770 *
1771 * @param request the MetaRequest to execute
1772 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1773 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1774 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1775 * @param use_mds [optional] prefer a specific mds (-1 for default)
1776 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1777 */
1778int Client::make_request(MetaRequest *request,
1779 const UserPerm& perms,
1780 InodeRef *ptarget, bool *pcreated,
1781 mds_rank_t use_mds,
1782 bufferlist *pdirbl)
1783{
1784 int r = 0;
1785
1786 // assign a unique tid
1787 ceph_tid_t tid = ++last_tid;
1788 request->set_tid(tid);
1789
1790 // and timestamp
1791 request->op_stamp = ceph_clock_now();
1792
1793 // make note
1794 mds_requests[tid] = request->get();
1795 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1796 oldest_tid = tid;
1797
1798 request->set_caller_perms(perms);
1799
1800 if (cct->_conf->client_inject_fixed_oldest_tid) {
1801 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1802 request->set_oldest_client_tid(1);
1803 } else {
1804 request->set_oldest_client_tid(oldest_tid);
1805 }
1806
1807 // hack target mds?
1808 if (use_mds >= 0)
1809 request->resend_mds = use_mds;
1810
20effc67 1811 MetaSessionRef session = NULL;
7c673cae
FG
1812 while (1) {
1813 if (request->aborted())
1814 break;
1815
f67539c2
TL
1816 if (blocklisted) {
1817 request->abort(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
1818 break;
1819 }
1820
7c673cae 1821 // set up wait cond
9f95a23c 1822 ceph::condition_variable caller_cond;
7c673cae
FG
1823 request->caller_cond = &caller_cond;
1824
1825 // choose mds
1826 Inode *hash_diri = NULL;
1827 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1828 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1829 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1830 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1831 if (hash_diri) {
1832 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1833 _fragmap_remove_stopped_mds(hash_diri, mds);
1834 } else {
1835 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1836 request->resend_mds = _get_random_up_mds();
1837 }
1838 } else {
1839 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1840 wait_on_list(waiting_for_mdsmap);
1841 }
1842 continue;
1843 }
1844
1845 // open a session?
7c673cae
FG
1846 if (!have_open_session(mds)) {
1847 session = _get_or_open_mds_session(mds);
f6b5b4d7 1848 if (session->state == MetaSession::STATE_REJECTED) {
f67539c2 1849 request->abort(-CEPHFS_EPERM);
f6b5b4d7
TL
1850 break;
1851 }
7c673cae
FG
1852 // wait
1853 if (session->state == MetaSession::STATE_OPENING) {
1854 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1855 wait_on_context_list(session->waiting_for_open);
7c673cae
FG
1856 continue;
1857 }
1858
1859 if (!have_open_session(mds))
1860 continue;
1861 } else {
20effc67 1862 session = mds_sessions.at(mds);
7c673cae
FG
1863 }
1864
1865 // send request.
20effc67 1866 send_request(request, session.get());
7c673cae
FG
1867
1868 // wait for signal
1869 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1870 request->kick = false;
9f95a23c
TL
1871 std::unique_lock l{client_lock, std::adopt_lock};
1872 caller_cond.wait(l, [request] {
1873 return (request->reply || // reply
1874 request->resend_mds >= 0 || // forward
1875 request->kick);
1876 });
1877 l.release();
1878 request->caller_cond = nullptr;
7c673cae
FG
1879
1880 // did we get a reply?
1881 if (request->reply)
1882 break;
1883 }
1884
1885 if (!request->reply) {
11fdf7f2
TL
1886 ceph_assert(request->aborted());
1887 ceph_assert(!request->got_unsafe);
7c673cae
FG
1888 r = request->get_abort_code();
1889 request->item.remove_myself();
1890 unregister_request(request);
11fdf7f2 1891 put_request(request);
7c673cae
FG
1892 return r;
1893 }
1894
1895 // got it!
11fdf7f2 1896 auto reply = std::move(request->reply);
7c673cae
FG
1897 r = reply->get_result();
1898 if (r >= 0)
1899 request->success = true;
1900
1901 // kick dispatcher (we've got it!)
11fdf7f2 1902 ceph_assert(request->dispatch_cond);
9f95a23c 1903 request->dispatch_cond->notify_all();
7c673cae
FG
1904 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1905 request->dispatch_cond = 0;
1906
1907 if (r >= 0 && ptarget)
20effc67 1908 r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);
7c673cae
FG
1909
1910 if (pdirbl)
11fdf7f2 1911 *pdirbl = reply->get_extra_bl();
7c673cae
FG
1912
1913 // -- log times --
1914 utime_t lat = ceph_clock_now();
1915 lat -= request->sent_stamp;
1916 ldout(cct, 20) << "lat " << lat << dendl;
1917 logger->tinc(l_c_lat, lat);
1918 logger->tinc(l_c_reply, lat);
1919
1920 put_request(request);
7c673cae
FG
1921 return r;
1922}
1923
1924void Client::unregister_request(MetaRequest *req)
1925{
1926 mds_requests.erase(req->tid);
1927 if (req->tid == oldest_tid) {
1928 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1929 while (true) {
1930 if (p == mds_requests.end()) {
1931 oldest_tid = 0;
1932 break;
1933 }
1934 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1935 oldest_tid = p->first;
1936 break;
1937 }
1938 ++p;
1939 }
1940 }
1941 put_request(req);
1942}
1943
1944void Client::put_request(MetaRequest *request)
1945{
1946 if (request->_put()) {
1947 int op = -1;
1948 if (request->success)
1949 op = request->get_op();
1950 InodeRef other_in;
1951 request->take_other_inode(&other_in);
1952 delete request;
1953
1954 if (other_in &&
1955 (op == CEPH_MDS_OP_RMDIR ||
1956 op == CEPH_MDS_OP_RENAME ||
1957 op == CEPH_MDS_OP_RMSNAP)) {
1958 _try_to_trim_inode(other_in.get(), false);
1959 }
1960 }
1961}
1962
1963int Client::encode_inode_release(Inode *in, MetaRequest *req,
1964 mds_rank_t mds, int drop,
1965 int unless, int force)
1966{
11fdf7f2 1967 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
f67539c2 1968 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1911f103 1969 << ", force:" << force << ")" << dendl;
7c673cae 1970 int released = 0;
11fdf7f2
TL
1971 auto it = in->caps.find(mds);
1972 if (it != in->caps.end()) {
1973 Cap &cap = it->second;
7c673cae 1974 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1975 if ((drop & cap.issued) &&
1976 !(unless & cap.issued)) {
1911f103 1977 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
11fdf7f2
TL
1978 cap.issued &= ~drop;
1979 cap.implemented &= ~drop;
7c673cae 1980 released = 1;
7c673cae
FG
1981 } else {
1982 released = force;
1983 }
1984 if (released) {
1911f103
TL
1985 cap.wanted = in->caps_wanted();
1986 if (&cap == in->auth_cap &&
1987 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1988 in->requested_max_size = 0;
1989 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1990 }
7c673cae
FG
1991 ceph_mds_request_release rel;
1992 rel.ino = in->ino;
11fdf7f2
TL
1993 rel.cap_id = cap.cap_id;
1994 rel.seq = cap.seq;
1995 rel.issue_seq = cap.issue_seq;
1996 rel.mseq = cap.mseq;
1997 rel.caps = cap.implemented;
1998 rel.wanted = cap.wanted;
7c673cae
FG
1999 rel.dname_len = 0;
2000 rel.dname_seq = 0;
2001 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2002 }
2003 }
11fdf7f2 2004 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
2005 << released << dendl;
2006 return released;
2007}
2008
2009void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2010 mds_rank_t mds, int drop, int unless)
2011{
11fdf7f2 2012 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
2013 << dn << ")" << dendl;
2014 int released = 0;
2015 if (dn->dir)
2016 released = encode_inode_release(dn->dir->parent_inode, req,
2017 mds, drop, unless, 1);
2018 if (released && dn->lease_mds == mds) {
2019 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
11fdf7f2 2020 auto& rel = req->cap_releases.back();
7c673cae
FG
2021 rel.item.dname_len = dn->name.length();
2022 rel.item.dname_seq = dn->lease_seq;
2023 rel.dname = dn->name;
adb31ebb 2024 dn->lease_mds = -1;
7c673cae 2025 }
11fdf7f2 2026 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
2027 << dn << ")" << dendl;
2028}
2029
2030
2031/*
2032 * This requires the MClientRequest *request member to be set.
2033 * It will error out horribly without one.
2034 * Additionally, if you set any *drop member, you'd better have
2035 * set the corresponding dentry!
2036 */
2037void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2038{
11fdf7f2 2039 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
2040 << req << ", mds: " << mds << ")" << dendl;
2041 if (req->inode_drop && req->inode())
2042 encode_inode_release(req->inode(), req,
2043 mds, req->inode_drop,
2044 req->inode_unless);
2045
2046 if (req->old_inode_drop && req->old_inode())
2047 encode_inode_release(req->old_inode(), req,
2048 mds, req->old_inode_drop,
2049 req->old_inode_unless);
2050 if (req->other_inode_drop && req->other_inode())
2051 encode_inode_release(req->other_inode(), req,
2052 mds, req->other_inode_drop,
2053 req->other_inode_unless);
2054
2055 if (req->dentry_drop && req->dentry())
2056 encode_dentry_release(req->dentry(), req,
2057 mds, req->dentry_drop,
2058 req->dentry_unless);
2059
2060 if (req->old_dentry_drop && req->old_dentry())
2061 encode_dentry_release(req->old_dentry(), req,
2062 mds, req->old_dentry_drop,
2063 req->old_dentry_unless);
11fdf7f2 2064 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
2065 << req << ", mds " << mds <<dendl;
2066}
2067
2068bool Client::have_open_session(mds_rank_t mds)
2069{
11fdf7f2
TL
2070 const auto &it = mds_sessions.find(mds);
2071 return it != mds_sessions.end() &&
20effc67
TL
2072 (it->second->state == MetaSession::STATE_OPEN ||
2073 it->second->state == MetaSession::STATE_STALE);
7c673cae
FG
2074}
2075
20effc67 2076MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
7c673cae 2077{
11fdf7f2 2078 const auto &it = mds_sessions.find(mds);
20effc67 2079 if (it == mds_sessions.end() || it->second->con != con) {
7c673cae 2080 return NULL;
11fdf7f2 2081 } else {
20effc67 2082 return it->second;
11fdf7f2 2083 }
7c673cae
FG
2084}
2085
20effc67 2086MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
7c673cae 2087{
11fdf7f2 2088 auto it = mds_sessions.find(mds);
20effc67 2089 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
7c673cae
FG
2090}
2091
2092/**
2093 * Populate a map of strings with client-identifying metadata,
2094 * such as the hostname. Call this once at initialization.
2095 */
2096void Client::populate_metadata(const std::string &mount_root)
2097{
2098 // Hostname
f67539c2
TL
2099#ifdef _WIN32
2100 // TODO: move this to compat.h
2101 char hostname[64];
2102 DWORD hostname_sz = 64;
2103 GetComputerNameA(hostname, &hostname_sz);
2104 metadata["hostname"] = hostname;
2105#else
7c673cae
FG
2106 struct utsname u;
2107 int r = uname(&u);
2108 if (r >= 0) {
2109 metadata["hostname"] = u.nodename;
2110 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2111 } else {
2112 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2113 }
f67539c2 2114#endif
7c673cae
FG
2115
2116 metadata["pid"] = stringify(getpid());
2117
2118 // Ceph entity id (the '0' in "client.0")
2119 metadata["entity_id"] = cct->_conf->name.get_id();
2120
2121 // Our mount position
2122 if (!mount_root.empty()) {
2123 metadata["root"] = mount_root;
2124 }
2125
2126 // Ceph version
2127 metadata["ceph_version"] = pretty_version_to_str();
2128 metadata["ceph_sha1"] = git_version_to_str();
2129
2130 // Apply any metadata from the user's configured overrides
2131 std::vector<std::string> tokens;
2132 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2133 for (const auto &i : tokens) {
2134 auto eqpos = i.find("=");
2135 // Throw out anything that isn't of the form "<str>=<str>"
2136 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2137 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2138 continue;
2139 }
2140 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2141 }
2142}
2143
2144/**
2145 * Optionally add or override client metadata fields.
2146 */
2147void Client::update_metadata(std::string const &k, std::string const &v)
2148{
f67539c2
TL
2149 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2150 ceph_assert(iref_reader.is_state_satisfied());
2151
2152 std::scoped_lock l(client_lock);
7c673cae 2153
11fdf7f2
TL
2154 auto it = metadata.find(k);
2155 if (it != metadata.end()) {
7c673cae 2156 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2157 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2158 }
2159
2160 metadata[k] = v;
2161}
2162
20effc67 2163MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
7c673cae 2164{
11fdf7f2
TL
2165 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2166 auto addrs = mdsmap->get_addrs(mds);
2167 auto em = mds_sessions.emplace(std::piecewise_construct,
2168 std::forward_as_tuple(mds),
20effc67 2169 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
11fdf7f2 2170 ceph_assert(em.second); /* not already present */
20effc67 2171 auto session = em.first->second;
7c673cae 2172
9f95a23c 2173 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
11fdf7f2
TL
2174 m->metadata = metadata;
2175 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
f67539c2 2176 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
11fdf7f2 2177 session->con->send_message2(std::move(m));
7c673cae
FG
2178 return session;
2179}
2180
2181void Client::_close_mds_session(MetaSession *s)
2182{
11fdf7f2 2183 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2184 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2185 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2186}
2187
f6b5b4d7 2188void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
7c673cae 2189{
11fdf7f2 2190 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
f6b5b4d7
TL
2191 if (rejected && s->state != MetaSession::STATE_CLOSING)
2192 s->state = MetaSession::STATE_REJECTED;
2193 else
2194 s->state = MetaSession::STATE_CLOSED;
7c673cae
FG
2195 s->con->mark_down();
2196 signal_context_list(s->waiting_for_open);
9f95a23c 2197 mount_cond.notify_all();
f6b5b4d7 2198 remove_session_caps(s, err);
7c673cae 2199 kick_requests_closed(s);
f6b5b4d7
TL
2200 mds_ranks_closing.erase(s->mds_num);
2201 if (s->state == MetaSession::STATE_CLOSED)
2202 mds_sessions.erase(s->mds_num);
7c673cae
FG
2203}
2204
11fdf7f2 2205void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2206{
2207 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2208 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 2209
f67539c2 2210 std::scoped_lock cl(client_lock);
20effc67 2211 auto session = _get_mds_session(from, m->get_connection().get());
7c673cae
FG
2212 if (!session) {
2213 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2214 return;
2215 }
2216
2217 switch (m->get_op()) {
2218 case CEPH_SESSION_OPEN:
11fdf7f2
TL
2219 {
2220 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2221 missing_features -= m->supported_features;
2222 if (!missing_features.empty()) {
2223 lderr(cct) << "mds." << from << " lacks required features '"
2224 << missing_features << "', closing session " << dendl;
20effc67
TL
2225 _close_mds_session(session.get());
2226 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
11fdf7f2
TL
2227 break;
2228 }
2229 session->mds_features = std::move(m->supported_features);
2230
20effc67 2231 renew_caps(session.get());
11fdf7f2 2232 session->state = MetaSession::STATE_OPEN;
f67539c2 2233 if (is_unmounting())
9f95a23c 2234 mount_cond.notify_all();
11fdf7f2
TL
2235 else
2236 connect_mds_targets(from);
2237 signal_context_list(session->waiting_for_open);
2238 break;
2239 }
7c673cae
FG
2240
2241 case CEPH_SESSION_CLOSE:
20effc67 2242 _closed_mds_session(session.get());
7c673cae
FG
2243 break;
2244
2245 case CEPH_SESSION_RENEWCAPS:
2246 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2247 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2248 session->cap_ttl =
2249 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298 2250 if (was_stale)
20effc67 2251 wake_up_session_caps(session.get(), false);
7c673cae
FG
2252 }
2253 break;
2254
2255 case CEPH_SESSION_STALE:
28e407b8
AA
2256 // invalidate session caps/leases
2257 session->cap_gen++;
2258 session->cap_ttl = ceph_clock_now();
2259 session->cap_ttl -= 1;
20effc67 2260 renew_caps(session.get());
7c673cae
FG
2261 break;
2262
2263 case CEPH_SESSION_RECALL_STATE:
f67539c2
TL
2264 /*
2265 * Call the renew caps and flush cap releases just before
2266 * triming the caps in case the tick() won't get a chance
2267 * to run them, which could cause the client to be blocklisted
2268 * and MDS daemons trying to recall the caps again and
2269 * again.
2270 *
2271 * In most cases it will do nothing, and the new cap releases
2272 * added by trim_caps() followed will be deferred flushing
2273 * by tick().
2274 */
2275 renew_and_flush_cap_releases();
20effc67 2276 trim_caps(session.get(), m->get_max_caps());
7c673cae
FG
2277 break;
2278
2279 case CEPH_SESSION_FLUSHMSG:
a8e16298 2280 /* flush cap release */
11fdf7f2
TL
2281 if (auto& m = session->release; m) {
2282 session->con->send_message2(std::move(m));
a8e16298 2283 }
9f95a23c 2284 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2285 break;
2286
2287 case CEPH_SESSION_FORCE_RO:
20effc67 2288 force_session_readonly(session.get());
7c673cae
FG
2289 break;
2290
2291 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2292 {
2293 std::string_view error_str;
2294 auto it = m->metadata.find("error_string");
2295 if (it != m->metadata.end())
2296 error_str = it->second;
2297 else
2298 error_str = "unknown error";
2299 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2300
20effc67 2301 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
11fdf7f2 2302 }
7c673cae
FG
2303 break;
2304
2305 default:
2306 ceph_abort();
2307 }
7c673cae
FG
2308}
2309
2310bool Client::_any_stale_sessions() const
2311{
9f95a23c 2312 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2313
11fdf7f2 2314 for (const auto &p : mds_sessions) {
20effc67 2315 if (p.second->state == MetaSession::STATE_STALE) {
7c673cae
FG
2316 return true;
2317 }
2318 }
2319
2320 return false;
2321}
2322
2323void Client::_kick_stale_sessions()
2324{
11fdf7f2 2325 ldout(cct, 1) << __func__ << dendl;
7c673cae 2326
11fdf7f2 2327 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
20effc67
TL
2328 auto s = it->second;
2329 if (s->state == MetaSession::STATE_REJECTED) {
2330 mds_sessions.erase(it->first);
f6b5b4d7
TL
2331 continue;
2332 }
20effc67
TL
2333 if (s->state == MetaSession::STATE_STALE)
2334 _closed_mds_session(s.get());
7c673cae
FG
2335 }
2336}
2337
2338void Client::send_request(MetaRequest *request, MetaSession *session,
2339 bool drop_cap_releases)
2340{
2341 // make the request
2342 mds_rank_t mds = session->mds_num;
11fdf7f2 2343 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
7c673cae 2344 << " for mds." << mds << dendl;
11fdf7f2 2345 auto r = build_client_request(request);
7c673cae
FG
2346 if (request->dentry()) {
2347 r->set_dentry_wanted();
2348 }
2349 if (request->got_unsafe) {
2350 r->set_replayed_op();
2351 if (request->target)
2352 r->head.ino = request->target->ino;
2353 } else {
2354 encode_cap_releases(request, mds);
2355 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2356 request->cap_releases.clear();
2357 else
2358 r->releases.swap(request->cap_releases);
2359 }
2360 r->set_mdsmap_epoch(mdsmap->get_epoch());
2361 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2362 objecter->with_osdmap([r](const OSDMap& o) {
2363 r->set_osdmap_epoch(o.get_epoch());
2364 });
2365 }
2366
2367 if (request->mds == -1) {
2368 request->sent_stamp = ceph_clock_now();
11fdf7f2 2369 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
7c673cae
FG
2370 }
2371 request->mds = mds;
2372
2373 Inode *in = request->inode();
11fdf7f2
TL
2374 if (in) {
2375 auto it = in->caps.find(mds);
2376 if (it != in->caps.end()) {
2377 request->sent_on_mseq = it->second.mseq;
2378 }
2379 }
7c673cae
FG
2380
2381 session->requests.push_back(&request->item);
2382
11fdf7f2
TL
2383 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2384 session->con->send_message2(std::move(r));
7c673cae
FG
2385}
2386
9f95a23c 2387ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
7c673cae 2388{
9f95a23c 2389 auto req = make_message<MClientRequest>(request->get_op());
7c673cae
FG
2390 req->set_tid(request->tid);
2391 req->set_stamp(request->op_stamp);
2392 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2393
2394 // if the filepath's haven't been set, set them!
2395 if (request->path.empty()) {
2396 Inode *in = request->inode();
2397 Dentry *de = request->dentry();
2398 if (in)
2399 in->make_nosnap_relative_path(request->path);
2400 else if (de) {
2401 if (de->inode)
2402 de->inode->make_nosnap_relative_path(request->path);
2403 else if (de->dir) {
2404 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2405 request->path.push_dentry(de->name);
2406 }
2407 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2408 << " No path, inode, or appropriately-endowed dentry given!"
2409 << dendl;
2410 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2411 << " No path, inode, or dentry given!"
2412 << dendl;
2413 }
2414 req->set_filepath(request->get_filepath());
2415 req->set_filepath2(request->get_filepath2());
f67539c2 2416 req->set_alternate_name(request->alternate_name);
7c673cae
FG
2417 req->set_data(request->data);
2418 req->set_retry_attempt(request->retry_attempt++);
2419 req->head.num_fwd = request->num_fwd;
2420 const gid_t *_gids;
2421 int gid_count = request->perms.get_gids(&_gids);
2422 req->set_gid_list(gid_count, _gids);
2423 return req;
2424}
2425
2426
2427
11fdf7f2 2428void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2429{
2430 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
f67539c2
TL
2431
2432 std::scoped_lock cl(client_lock);
20effc67 2433 auto session = _get_mds_session(mds, fwd->get_connection().get());
7c673cae 2434 if (!session) {
7c673cae
FG
2435 return;
2436 }
2437 ceph_tid_t tid = fwd->get_tid();
2438
2439 if (mds_requests.count(tid) == 0) {
11fdf7f2 2440 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2441 return;
2442 }
2443
2444 MetaRequest *request = mds_requests[tid];
11fdf7f2 2445 ceph_assert(request);
7c673cae
FG
2446
2447 // reset retry counter
2448 request->retry_attempt = 0;
2449
2450 // request not forwarded, or dest mds has no session.
2451 // resend.
11fdf7f2 2452 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2453 << " fwd " << fwd->get_num_fwd()
2454 << " to mds." << fwd->get_dest_mds()
2455 << ", resending to " << fwd->get_dest_mds()
2456 << dendl;
2457
2458 request->mds = -1;
2459 request->item.remove_myself();
2460 request->num_fwd = fwd->get_num_fwd();
2461 request->resend_mds = fwd->get_dest_mds();
9f95a23c 2462 request->caller_cond->notify_all();
7c673cae
FG
2463}
2464
2465bool Client::is_dir_operation(MetaRequest *req)
2466{
2467 int op = req->get_op();
2468 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2469 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2470 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2471 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2472 return true;
2473 return false;
2474}
2475
11fdf7f2 2476void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2477{
2478 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
f67539c2
TL
2479
2480 std::scoped_lock cl(client_lock);
20effc67 2481 auto session = _get_mds_session(mds_num, reply->get_connection().get());
7c673cae 2482 if (!session) {
7c673cae
FG
2483 return;
2484 }
2485
2486 ceph_tid_t tid = reply->get_tid();
2487 bool is_safe = reply->is_safe();
2488
2489 if (mds_requests.count(tid) == 0) {
11fdf7f2 2490 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2491 << " safe is:" << is_safe << dendl;
7c673cae
FG
2492 return;
2493 }
2494 MetaRequest *request = mds_requests.at(tid);
2495
11fdf7f2 2496 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2497 << " tid " << tid << dendl;
2498
2499 if (request->got_unsafe && !is_safe) {
2500 //duplicate response
2501 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2502 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2503 return;
2504 }
2505
f67539c2 2506 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
7c673cae
FG
2507 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2508 << " from mds." << request->mds << dendl;
2509 request->send_to_auth = true;
2510 request->resend_mds = choose_target_mds(request);
2511 Inode *in = request->inode();
11fdf7f2 2512 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2513 if (request->resend_mds >= 0 &&
2514 request->resend_mds == request->mds &&
2515 (in == NULL ||
11fdf7f2
TL
2516 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2517 request->sent_on_mseq == it->second.mseq)) {
2518 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae 2519 } else {
9f95a23c 2520 request->caller_cond->notify_all();
7c673cae
FG
2521 return;
2522 }
7c673cae
FG
2523 }
2524
11fdf7f2 2525 ceph_assert(!request->reply);
7c673cae 2526 request->reply = reply;
20effc67 2527 insert_trace(request, session.get());
7c673cae
FG
2528
2529 // Handle unsafe reply
2530 if (!is_safe) {
2531 request->got_unsafe = true;
2532 session->unsafe_requests.push_back(&request->unsafe_item);
2533 if (is_dir_operation(request)) {
2534 Inode *dir = request->inode();
11fdf7f2 2535 ceph_assert(dir);
7c673cae
FG
2536 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2537 }
2538 if (request->target) {
2539 InodeRef &in = request->target;
2540 in->unsafe_ops.push_back(&request->unsafe_target_item);
2541 }
2542 }
2543
2544 // Only signal the caller once (on the first reply):
2545 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2546 if (!is_safe || !request->got_unsafe) {
9f95a23c 2547 ceph::condition_variable cond;
7c673cae
FG
2548 request->dispatch_cond = &cond;
2549
2550 // wake up waiter
11fdf7f2 2551 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2552 request->caller_cond->notify_all();
7c673cae
FG
2553
2554 // wake for kick back
9f95a23c
TL
2555 std::unique_lock l{client_lock, std::adopt_lock};
2556 cond.wait(l, [tid, request, &cond, this] {
2557 if (request->dispatch_cond) {
2558 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2559 << tid << " " << &cond << dendl;
2560 }
2561 return !request->dispatch_cond;
2562 });
2563 l.release();
7c673cae
FG
2564 }
2565
2566 if (is_safe) {
2567 // the filesystem change is committed to disk
2568 // we're done, clean up
2569 if (request->got_unsafe) {
2570 request->unsafe_item.remove_myself();
2571 request->unsafe_dir_item.remove_myself();
2572 request->unsafe_target_item.remove_myself();
2573 signal_cond_list(request->waitfor_safe);
2574 }
2575 request->item.remove_myself();
2576 unregister_request(request);
2577 }
f67539c2 2578 if (is_unmounting())
9f95a23c 2579 mount_cond.notify_all();
7c673cae
FG
2580}
2581
2582void Client::_handle_full_flag(int64_t pool)
2583{
2584 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2585 << "on " << pool << dendl;
f67539c2 2586 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
7c673cae
FG
2587 // to do this rather than blocking, because otherwise when we fill up we
2588 // potentially lock caps forever on files with dirty pages, and we need
2589 // to be able to release those caps to the MDS so that it can delete files
2590 // and free up space.
f67539c2 2591 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
7c673cae
FG
2592
2593 // For all inodes with layouts in this pool and a pending flush write op
2594 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2595 // from ObjectCacher so that it doesn't re-issue the write in response to
2596 // the ENOSPC error.
2597 // Fortunately since we're cancelling everything in a given pool, we don't
2598 // need to know which ops belong to which ObjectSet, we can just blow all
2599 // the un-flushed cached data away and mark any dirty inodes' async_err
f67539c2 2600 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
7c673cae
FG
2601 // affecting this pool, and all the objectsets we're purging were also
2602 // in this pool.
2603 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2604 i != inode_map.end(); ++i)
2605 {
2606 Inode *inode = i->second;
2607 if (inode->oset.dirty_or_tx
2608 && (pool == -1 || inode->layout.pool_id == pool)) {
2609 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2610 << " has dirty objects, purging and setting ENOSPC" << dendl;
2611 objectcacher->purge_set(&inode->oset);
f67539c2 2612 inode->set_async_err(-CEPHFS_ENOSPC);
7c673cae
FG
2613 }
2614 }
2615
2616 if (cancelled_epoch != (epoch_t)-1) {
2617 set_cap_epoch_barrier(cancelled_epoch);
2618 }
2619}
2620
11fdf7f2 2621void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2622{
f67539c2
TL
2623 std::set<entity_addr_t> new_blocklists;
2624
2625 std::scoped_lock cl(client_lock);
2626 objecter->consume_blocklist_events(&new_blocklists);
31f18b77 2627
11fdf7f2 2628 const auto myaddrs = messenger->get_myaddrs();
f67539c2 2629 bool new_blocklist = false;
11fdf7f2
TL
2630 bool prenautilus = objecter->with_osdmap(
2631 [&](const OSDMap& o) {
9f95a23c 2632 return o.require_osd_release < ceph_release_t::nautilus;
11fdf7f2 2633 });
f67539c2 2634 if (!blocklisted) {
11fdf7f2 2635 for (auto a : myaddrs.v) {
f67539c2 2636 // blocklist entries are always TYPE_ANY for nautilus+
11fdf7f2 2637 a.set_type(entity_addr_t::TYPE_ANY);
f67539c2
TL
2638 if (new_blocklists.count(a)) {
2639 new_blocklist = true;
11fdf7f2
TL
2640 break;
2641 }
2642 if (prenautilus) {
2643 // ...except pre-nautilus, they were TYPE_LEGACY
2644 a.set_type(entity_addr_t::TYPE_LEGACY);
f67539c2
TL
2645 if (new_blocklists.count(a)) {
2646 new_blocklist = true;
11fdf7f2
TL
2647 break;
2648 }
2649 }
2650 }
2651 }
f67539c2 2652 if (new_blocklist) {
31f18b77
FG
2653 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2654 return o.get_epoch();
2655 });
f67539c2
TL
2656 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2657 blocklisted = true;
31f18b77 2658
f67539c2 2659 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
2660
2661 // Since we know all our OSD ops will fail, cancel them all preemtively,
2662 // so that on an unhealthy cluster we can umount promptly even if e.g.
2663 // some PGs were inaccessible.
f67539c2
TL
2664 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2665
2666 }
31f18b77 2667
f67539c2
TL
2668 if (blocklisted) {
2669 // Handle case where we were blocklisted but no longer are
2670 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2671 return o.is_blocklisted(myaddrs);});
31f18b77
FG
2672 }
2673
f67539c2
TL
2674 // Always subscribe to next osdmap for blocklisted client
2675 // until this client is not blocklisted.
2676 if (blocklisted) {
f64942e4
AA
2677 objecter->maybe_request_map();
2678 }
2679
7c673cae
FG
2680 if (objecter->osdmap_full_flag()) {
2681 _handle_full_flag(-1);
2682 } else {
2683 // Accumulate local list of full pools so that I can drop
2684 // the objecter lock before re-entering objecter in
2685 // cancel_writes
2686 std::vector<int64_t> full_pools;
2687
2688 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2689 for (const auto& kv : o.get_pools()) {
2690 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2691 full_pools.push_back(kv.first);
2692 }
2693 }
2694 });
2695
2696 for (auto p : full_pools)
2697 _handle_full_flag(p);
2698
2699 // Subscribe to subsequent maps to watch for the full flag going
2700 // away. For the global full flag objecter does this for us, but
2701 // it pays no attention to the per-pool full flag so in this branch
2702 // we do it ourselves.
2703 if (!full_pools.empty()) {
2704 objecter->maybe_request_map();
2705 }
2706 }
7c673cae
FG
2707}
2708
2709
2710// ------------------------
2711// incoming messages
2712
2713
11fdf7f2 2714bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2715{
f67539c2
TL
2716 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2717 if (!iref_reader.is_state_satisfied()) {
7c673cae 2718 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2719 return true;
2720 }
2721
2722 switch (m->get_type()) {
2723 // mounting and mds sessions
2724 case CEPH_MSG_MDS_MAP:
9f95a23c 2725 handle_mds_map(ref_cast<MMDSMap>(m));
7c673cae
FG
2726 break;
2727 case CEPH_MSG_FS_MAP:
9f95a23c 2728 handle_fs_map(ref_cast<MFSMap>(m));
7c673cae
FG
2729 break;
2730 case CEPH_MSG_FS_MAP_USER:
9f95a23c 2731 handle_fs_map_user(ref_cast<MFSMapUser>(m));
7c673cae
FG
2732 break;
2733 case CEPH_MSG_CLIENT_SESSION:
9f95a23c 2734 handle_client_session(ref_cast<MClientSession>(m));
7c673cae
FG
2735 break;
2736
2737 case CEPH_MSG_OSD_MAP:
9f95a23c 2738 handle_osd_map(ref_cast<MOSDMap>(m));
7c673cae
FG
2739 break;
2740
2741 // requests
2742 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
9f95a23c 2743 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
7c673cae
FG
2744 break;
2745 case CEPH_MSG_CLIENT_REPLY:
9f95a23c 2746 handle_client_reply(ref_cast<MClientReply>(m));
11fdf7f2
TL
2747 break;
2748
2749 // reclaim reply
2750 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
9f95a23c 2751 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
7c673cae
FG
2752 break;
2753
2754 case CEPH_MSG_CLIENT_SNAP:
9f95a23c 2755 handle_snap(ref_cast<MClientSnap>(m));
7c673cae
FG
2756 break;
2757 case CEPH_MSG_CLIENT_CAPS:
9f95a23c 2758 handle_caps(ref_cast<MClientCaps>(m));
7c673cae
FG
2759 break;
2760 case CEPH_MSG_CLIENT_LEASE:
9f95a23c 2761 handle_lease(ref_cast<MClientLease>(m));
7c673cae
FG
2762 break;
2763 case MSG_COMMAND_REPLY:
2764 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
9f95a23c 2765 handle_command_reply(ref_cast<MCommandReply>(m));
7c673cae
FG
2766 } else {
2767 return false;
2768 }
2769 break;
2770 case CEPH_MSG_CLIENT_QUOTA:
9f95a23c 2771 handle_quota(ref_cast<MClientQuota>(m));
7c673cae
FG
2772 break;
2773
2774 default:
2775 return false;
2776 }
2777
2778 // unmounting?
f67539c2
TL
2779 std::scoped_lock cl(client_lock);
2780 if (is_unmounting()) {
7c673cae
FG
2781 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2782 << "+" << inode_map.size() << dendl;
f67539c2 2783 uint64_t size = lru.lru_get_size() + inode_map.size();
7c673cae 2784 trim_cache();
f67539c2 2785 if (size > lru.lru_get_size() + inode_map.size()) {
7c673cae 2786 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
9f95a23c 2787 mount_cond.notify_all();
7c673cae
FG
2788 } else {
2789 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2790 << "+" << inode_map.size() << dendl;
2791 }
2792 }
2793
2794 return true;
2795}
2796
11fdf7f2 2797void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae 2798{
f67539c2 2799 std::scoped_lock cl(client_lock);
7c673cae 2800 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2801
2802 signal_cond_list(waiting_for_fsmap);
2803
2804 monclient->sub_got("fsmap", fsmap->get_epoch());
2805}
2806
11fdf7f2 2807void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae 2808{
f67539c2 2809 std::scoped_lock cl(client_lock);
7c673cae
FG
2810 fsmap_user.reset(new FSMapUser);
2811 *fsmap_user = m->get_fsmap();
7c673cae
FG
2812
2813 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2814 signal_cond_list(waiting_for_fsmap);
2815}
2816
f67539c2
TL
2817// Cancel all the commands for missing or laggy GIDs
2818void Client::cancel_commands(const MDSMap& newmap)
7c673cae 2819{
f67539c2 2820 std::vector<ceph_tid_t> cancel_ops;
7c673cae 2821
f67539c2 2822 std::scoped_lock cmd_lock(command_lock);
7c673cae 2823 auto &commands = command_table.get_commands();
f67539c2 2824 for (const auto &[tid, op] : commands) {
7c673cae 2825 const mds_gid_t op_mds_gid = op.mds_gid;
f67539c2
TL
2826 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2827 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2828 cancel_ops.push_back(tid);
7c673cae
FG
2829 if (op.outs) {
2830 std::ostringstream ss;
2831 ss << "MDS " << op_mds_gid << " went away";
2832 *(op.outs) = ss.str();
2833 }
f67539c2
TL
2834 /*
2835 * No need to make the con->mark_down under
2836 * client_lock here, because the con will
2837 * has its own lock.
2838 */
7c673cae 2839 op.con->mark_down();
f67539c2
TL
2840 if (op.on_finish)
2841 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
7c673cae
FG
2842 }
2843 }
2844
f67539c2
TL
2845 for (const auto &tid : cancel_ops)
2846 command_table.erase(tid);
2847}
2848
2849void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2850{
2851 std::unique_lock cl(client_lock);
2852 if (m->get_epoch() <= mdsmap->get_epoch()) {
2853 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2854 << " is identical to or older than our "
2855 << mdsmap->get_epoch() << dendl;
2856 return;
7c673cae
FG
2857 }
2858
f67539c2
TL
2859 cl.unlock();
2860 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2861 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2862 _mdsmap->decode(m->get_encoded());
2863 cancel_commands(*_mdsmap.get());
2864 cl.lock();
2865
2866 _mdsmap.swap(mdsmap);
2867
7c673cae 2868 // reset session
11fdf7f2 2869 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 2870 mds_rank_t mds = p->first;
20effc67 2871 MetaSessionRef session = p->second;
7c673cae
FG
2872 ++p;
2873
f67539c2 2874 int oldstate = _mdsmap->get_state(mds);
7c673cae
FG
2875 int newstate = mdsmap->get_state(mds);
2876 if (!mdsmap->is_up(mds)) {
2877 session->con->mark_down();
11fdf7f2 2878 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f67539c2
TL
2879 auto old_inc = _mdsmap->get_incarnation(mds);
2880 auto new_inc = mdsmap->get_incarnation(mds);
f64942e4
AA
2881 if (old_inc != new_inc) {
2882 ldout(cct, 1) << "mds incarnation changed from "
2883 << old_inc << " to " << new_inc << dendl;
2884 oldstate = MDSMap::STATE_NULL;
2885 }
7c673cae 2886 session->con->mark_down();
11fdf7f2 2887 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
2888 // When new MDS starts to take over, notify kernel to trim unused entries
2889 // in its dcache/icache. Hopefully, the kernel will release some unused
2890 // inodes before the new MDS enters reconnect state.
20effc67 2891 trim_cache_for_reconnect(session.get());
7c673cae
FG
2892 } else if (oldstate == newstate)
2893 continue; // no change
f67539c2 2894
7c673cae
FG
2895 session->mds_state = newstate;
2896 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 2897 session->con = messenger->connect_to_mds(session->addrs);
20effc67 2898 send_reconnect(session.get());
81eedcae
TL
2899 } else if (newstate > MDSMap::STATE_RECONNECT) {
2900 if (oldstate < MDSMap::STATE_RECONNECT) {
2901 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
20effc67 2902 _closed_mds_session(session.get());
81eedcae
TL
2903 continue;
2904 }
2905 if (newstate >= MDSMap::STATE_ACTIVE) {
2906 if (oldstate < MDSMap::STATE_ACTIVE) {
2907 // kick new requests
20effc67
TL
2908 kick_requests(session.get());
2909 kick_flushing_caps(session.get());
81eedcae 2910 signal_context_list(session->waiting_for_open);
20effc67 2911 wake_up_session_caps(session.get(), true);
81eedcae
TL
2912 }
2913 connect_mds_targets(mds);
7c673cae 2914 }
7c673cae
FG
2915 } else if (newstate == MDSMap::STATE_NULL &&
2916 mds >= mdsmap->get_max_mds()) {
20effc67 2917 _closed_mds_session(session.get());
7c673cae
FG
2918 }
2919 }
2920
2921 // kick any waiting threads
2922 signal_cond_list(waiting_for_mdsmap);
2923
7c673cae
FG
2924 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2925}
2926
2927void Client::send_reconnect(MetaSession *session)
2928{
2929 mds_rank_t mds = session->mds_num;
11fdf7f2 2930 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
7c673cae
FG
2931
2932 // trim unused caps to reduce MDS's cache rejoin time
2933 trim_cache_for_reconnect(session);
2934
2935 session->readonly = false;
2936
11fdf7f2 2937 session->release.reset();
7c673cae
FG
2938
2939 // reset my cap seq number
2940 session->seq = 0;
2941 //connect to the mds' offload targets
2942 connect_mds_targets(mds);
2943 //make sure unsafe requests get saved
2944 resend_unsafe_requests(session);
2945
11fdf7f2
TL
2946 early_kick_flushing_caps(session);
2947
9f95a23c 2948 auto m = make_message<MClientReconnect>();
11fdf7f2 2949 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
7c673cae
FG
2950
2951 // i have an open session.
2952 ceph::unordered_set<inodeno_t> did_snaprealm;
2953 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2954 p != inode_map.end();
2955 ++p) {
2956 Inode *in = p->second;
11fdf7f2
TL
2957 auto it = in->caps.find(mds);
2958 if (it != in->caps.end()) {
2959 if (allow_multi &&
9f95a23c
TL
2960 m->get_approx_size() >=
2961 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
11fdf7f2
TL
2962 m->mark_more();
2963 session->con->send_message2(std::move(m));
2964
9f95a23c 2965 m = make_message<MClientReconnect>();
11fdf7f2
TL
2966 }
2967
2968 Cap &cap = it->second;
7c673cae 2969 ldout(cct, 10) << " caps on " << p->first
11fdf7f2 2970 << " " << ccap_string(cap.issued)
7c673cae
FG
2971 << " wants " << ccap_string(in->caps_wanted())
2972 << dendl;
2973 filepath path;
f91f0fd5 2974 in->make_short_path(path);
7c673cae
FG
2975 ldout(cct, 10) << " path " << path << dendl;
2976
2977 bufferlist flockbl;
2978 _encode_filelocks(in, flockbl);
2979
11fdf7f2
TL
2980 cap.seq = 0; // reset seq.
2981 cap.issue_seq = 0; // reset seq.
2982 cap.mseq = 0; // reset seq.
2983 // cap gen should catch up with session cap_gen
2984 if (cap.gen < session->cap_gen) {
2985 cap.gen = session->cap_gen;
2986 cap.issued = cap.implemented = CEPH_CAP_PIN;
2987 } else {
2988 cap.issued = cap.implemented;
2989 }
7c673cae
FG
2990 snapid_t snap_follows = 0;
2991 if (!in->cap_snaps.empty())
2992 snap_follows = in->cap_snaps.begin()->first;
2993
2994 m->add_cap(p->first.ino,
11fdf7f2 2995 cap.cap_id,
7c673cae
FG
2996 path.get_ino(), path.get_path(), // ino
2997 in->caps_wanted(), // wanted
11fdf7f2 2998 cap.issued, // issued
7c673cae
FG
2999 in->snaprealm->ino,
3000 snap_follows,
3001 flockbl);
3002
3003 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
3004 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
3005 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
3006 did_snaprealm.insert(in->snaprealm->ino);
3007 }
3008 }
3009 }
3010
11fdf7f2
TL
3011 if (!allow_multi)
3012 m->set_encoding_version(0); // use connection features to choose encoding
3013 session->con->send_message2(std::move(m));
7c673cae 3014
9f95a23c 3015 mount_cond.notify_all();
11fdf7f2
TL
3016
3017 if (session->reclaim_state == MetaSession::RECLAIMING)
3018 signal_cond_list(waiting_for_reclaim);
7c673cae
FG
3019}
3020
3021
3022void Client::kick_requests(MetaSession *session)
3023{
11fdf7f2 3024 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3025 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3026 p != mds_requests.end();
3027 ++p) {
31f18b77
FG
3028 MetaRequest *req = p->second;
3029 if (req->got_unsafe)
3030 continue;
3031 if (req->aborted()) {
3032 if (req->caller_cond) {
3033 req->kick = true;
9f95a23c 3034 req->caller_cond->notify_all();
31f18b77 3035 }
7c673cae 3036 continue;
31f18b77
FG
3037 }
3038 if (req->retry_attempt > 0)
7c673cae 3039 continue; // new requests only
31f18b77 3040 if (req->mds == session->mds_num) {
7c673cae
FG
3041 send_request(p->second, session);
3042 }
3043 }
3044}
3045
3046void Client::resend_unsafe_requests(MetaSession *session)
3047{
3048 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3049 !iter.end();
3050 ++iter)
3051 send_request(*iter, session);
3052
3053 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3054 // process completed requests in clientreplay stage.
3055 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3056 p != mds_requests.end();
3057 ++p) {
3058 MetaRequest *req = p->second;
3059 if (req->got_unsafe)
3060 continue;
31f18b77
FG
3061 if (req->aborted())
3062 continue;
7c673cae
FG
3063 if (req->retry_attempt == 0)
3064 continue; // old requests only
3065 if (req->mds == session->mds_num)
3066 send_request(req, session, true);
3067 }
3068}
3069
3070void Client::wait_unsafe_requests()
3071{
3072 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2 3073 for (const auto &p : mds_sessions) {
20effc67
TL
3074 const auto s = p.second;
3075 if (!s->unsafe_requests.empty()) {
3076 MetaRequest *req = s->unsafe_requests.back();
7c673cae
FG
3077 req->get();
3078 last_unsafe_reqs.push_back(req);
3079 }
3080 }
3081
3082 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3083 p != last_unsafe_reqs.end();
3084 ++p) {
3085 MetaRequest *req = *p;
3086 if (req->unsafe_item.is_on_list())
3087 wait_on_list(req->waitfor_safe);
3088 put_request(req);
3089 }
3090}
3091
3092void Client::kick_requests_closed(MetaSession *session)
3093{
11fdf7f2 3094 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3095 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3096 p != mds_requests.end(); ) {
3097 MetaRequest *req = p->second;
3098 ++p;
3099 if (req->mds == session->mds_num) {
3100 if (req->caller_cond) {
3101 req->kick = true;
9f95a23c 3102 req->caller_cond->notify_all();
7c673cae
FG
3103 }
3104 req->item.remove_myself();
3105 if (req->got_unsafe) {
11fdf7f2 3106 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 3107 req->unsafe_item.remove_myself();
eafe8130
TL
3108 if (is_dir_operation(req)) {
3109 Inode *dir = req->inode();
20effc67 3110 ceph_assert(dir);
f67539c2 3111 dir->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3112 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3113 << dir->ino << " " << req->get_tid() << dendl;
3114 req->unsafe_dir_item.remove_myself();
3115 }
3116 if (req->target) {
3117 InodeRef &in = req->target;
f67539c2 3118 in->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3119 lderr(cct) << "kick_requests_closed drop req of inode : "
3120 << in->ino << " " << req->get_tid() << dendl;
3121 req->unsafe_target_item.remove_myself();
3122 }
7c673cae
FG
3123 signal_cond_list(req->waitfor_safe);
3124 unregister_request(req);
3125 }
3126 }
3127 }
11fdf7f2
TL
3128 ceph_assert(session->requests.empty());
3129 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
3130}
3131
3132
3133
3134
3135/************
3136 * leases
3137 */
3138
3139void Client::got_mds_push(MetaSession *s)
3140{
3141 s->seq++;
3142 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3143 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 3144 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
3145 }
3146}
3147
11fdf7f2 3148void Client::handle_lease(const MConstRef<MClientLease>& m)
7c673cae 3149{
11fdf7f2 3150 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 3151
11fdf7f2 3152 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
7c673cae 3153 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
3154
3155 std::scoped_lock cl(client_lock);
20effc67 3156 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 3157 if (!session) {
7c673cae
FG
3158 return;
3159 }
3160
20effc67 3161 got_mds_push(session.get());
7c673cae
FG
3162
3163 ceph_seq_t seq = m->get_seq();
3164
3165 Inode *in;
3166 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3167 if (inode_map.count(vino) == 0) {
3168 ldout(cct, 10) << " don't have vino " << vino << dendl;
3169 goto revoke;
3170 }
3171 in = inode_map[vino];
3172
9f95a23c 3173 if (m->get_mask() & CEPH_LEASE_VALID) {
7c673cae
FG
3174 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3175 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3176 goto revoke;
3177 }
3178 Dentry *dn = in->dir->dentries[m->dname];
3179 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3180 dn->lease_mds = -1;
3181 }
3182
3183 revoke:
11fdf7f2 3184 {
9f95a23c
TL
3185 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3186 m->get_mask(), m->get_ino(),
3187 m->get_first(), m->get_last(), m->dname);
11fdf7f2
TL
3188 m->get_connection()->send_message2(std::move(reply));
3189 }
7c673cae
FG
3190}
3191
f67539c2 3192void Client::_put_inode(Inode *in, int n)
7c673cae 3193{
f67539c2
TL
3194 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3195
b3b6e05e
TL
3196 int left = in->get_nref();
3197 ceph_assert(left >= n + 1);
3198 in->iput(n);
3199 left -= n;
3200 if (left == 1) { // the last one will be held by the inode_map
7c673cae
FG
3201 // release any caps
3202 remove_all_caps(in);
3203
11fdf7f2 3204 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
7c673cae 3205 bool unclean = objectcacher->release_set(&in->oset);
11fdf7f2 3206 ceph_assert(!unclean);
7c673cae
FG
3207 inode_map.erase(in->vino());
3208 if (use_faked_inos())
3209 _release_faked_ino(in);
3210
b3b6e05e 3211 if (root == nullptr) {
7c673cae
FG
3212 root_ancestor = 0;
3213 while (!root_parents.empty())
3214 root_parents.erase(root_parents.begin());
3215 }
3216
b3b6e05e 3217 in->iput();
7c673cae
FG
3218 }
3219}
3220
f67539c2
TL
3221void Client::delay_put_inodes(bool wakeup)
3222{
3223 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3224
3225 std::map<Inode*,int> release;
3226 {
3227 std::scoped_lock dl(delay_i_lock);
3228 release.swap(delay_i_release);
3229 }
3230
3231 if (release.empty())
3232 return;
3233
3234 for (auto &[in, cnt] : release)
3235 _put_inode(in, cnt);
3236
3237 if (wakeup)
3238 mount_cond.notify_all();
3239}
3240
3241void Client::put_inode(Inode *in, int n)
3242{
3243 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3244
3245 std::scoped_lock dl(delay_i_lock);
3246 delay_i_release[in] += n;
3247}
3248
7c673cae
FG
3249void Client::close_dir(Dir *dir)
3250{
3251 Inode *in = dir->parent_inode;
11fdf7f2
TL
3252 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3253 ceph_assert(dir->is_empty());
3254 ceph_assert(in->dir == dir);
3255 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3256 if (!in->dentries.empty())
7c673cae
FG
3257 in->get_first_parent()->put(); // unpin dentry
3258
3259 delete in->dir;
3260 in->dir = 0;
3261 put_inode(in); // unpin inode
3262}
3263
3264 /**
3265 * Don't call this with in==NULL, use get_or_create for that
3266 * leave dn set to default NULL unless you're trying to add
3267 * a new inode to a pre-created Dentry
3268 */
3269Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3270{
3271 if (!dn) {
3272 // create a new Dentry
11fdf7f2
TL
3273 dn = new Dentry(dir, name);
3274
7c673cae
FG
3275 lru.lru_insert_mid(dn); // mid or top?
3276
3277 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3278 << " dn " << dn << " (new dn)" << dendl;
3279 } else {
11fdf7f2 3280 ceph_assert(!dn->inode);
7c673cae
FG
3281 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3282 << " dn " << dn << " (old dn)" << dendl;
3283 }
3284
3285 if (in) { // link to inode
11fdf7f2 3286 InodeRef tmp_ref;
7c673cae 3287 // only one parent for directories!
11fdf7f2
TL
3288 if (in->is_dir() && !in->dentries.empty()) {
3289 tmp_ref = in; // prevent unlink below from freeing the inode.
7c673cae 3290 Dentry *olddn = in->get_first_parent();
11fdf7f2 3291 ceph_assert(olddn->dir != dir || olddn->name != name);
7c673cae 3292 Inode *old_diri = olddn->dir->parent_inode;
7c673cae
FG
3293 clear_dir_complete_and_ordered(old_diri, true);
3294 unlink(olddn, true, true); // keep dir, dentry
3295 }
3296
11fdf7f2 3297 dn->link(in);
f67539c2 3298 inc_dentry_nr();
11fdf7f2 3299 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3300 }
3301
3302 return dn;
3303}
3304
3305void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3306{
11fdf7f2 3307 InodeRef in(dn->inode);
7c673cae
FG
3308 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3309 << " inode " << dn->inode << dendl;
3310
3311 // unlink from inode
11fdf7f2
TL
3312 if (dn->inode) {
3313 dn->unlink();
f67539c2 3314 dec_dentry_nr();
11fdf7f2 3315 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3316 }
3317
3318 if (keepdentry) {
3319 dn->lease_mds = -1;
3320 } else {
3321 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3322
3323 // unlink from dir
11fdf7f2
TL
3324 Dir *dir = dn->dir;
3325 dn->detach();
7c673cae
FG
3326
3327 // delete den
3328 lru.lru_remove(dn);
3329 dn->put();
11fdf7f2
TL
3330
3331 if (dir->is_empty() && !keepdir)
3332 close_dir(dir);
7c673cae
FG
3333 }
3334}
3335
3336/**
3337 * For asynchronous flushes, check for errors from the IO and
3338 * update the inode if necessary
3339 */
3340class C_Client_FlushComplete : public Context {
3341private:
3342 Client *client;
3343 InodeRef inode;
3344public:
3345 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3346 void finish(int r) override {
9f95a23c 3347 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
7c673cae
FG
3348 if (r != 0) {
3349 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3350 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3351 << " 0x" << std::hex << inode->ino << std::dec
3352 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3353 inode->set_async_err(r);
3354 }
3355 }
3356};
3357
3358
3359/****
3360 * caps
3361 */
3362
3363void Client::get_cap_ref(Inode *in, int cap)
3364{
3365 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3366 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3367 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
b3b6e05e 3368 in->iget();
7c673cae
FG
3369 }
3370 if ((cap & CEPH_CAP_FILE_CACHE) &&
3371 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3372 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
b3b6e05e 3373 in->iget();
7c673cae
FG
3374 }
3375 in->get_cap_ref(cap);
3376}
3377
3378void Client::put_cap_ref(Inode *in, int cap)
3379{
3380 int last = in->put_cap_ref(cap);
3381 if (last) {
3382 int put_nref = 0;
3383 int drop = last & ~in->caps_issued();
3384 if (in->snapid == CEPH_NOSNAP) {
f67539c2 3385 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
7c673cae
FG
3386 !in->cap_snaps.empty() &&
3387 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3388 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
7c673cae
FG
3389 in->cap_snaps.rbegin()->second.writing = 0;
3390 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3391 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3392 }
3393 if (last & CEPH_CAP_FILE_BUFFER) {
3394 for (auto &p : in->cap_snaps)
3395 p.second.dirty_data = 0;
3396 signal_cond_list(in->waitfor_commit);
11fdf7f2 3397 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3398 ++put_nref;
3399 }
3400 }
3401 if (last & CEPH_CAP_FILE_CACHE) {
11fdf7f2 3402 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3403 ++put_nref;
3404 }
3405 if (drop)
3406 check_caps(in, 0);
3407 if (put_nref)
3408 put_inode(in, put_nref);
3409 }
3410}
3411
f67539c2
TL
3412// get caps for a given file handle -- the inode should have @need caps
3413// issued by the mds and @want caps not revoked (or not under revocation).
3414// this routine blocks till the cap requirement is satisfied. also account
3415// (track) for capability hit when required (when cap requirement succeedes).
f6b5b4d7 3416int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
7c673cae 3417{
f6b5b4d7
TL
3418 Inode *in = fh->inode.get();
3419
7c673cae
FG
3420 int r = check_pool_perm(in, need);
3421 if (r < 0)
3422 return r;
3423
3424 while (1) {
3425 int file_wanted = in->caps_file_wanted();
3426 if ((file_wanted & need) != need) {
3427 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3428 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3429 << dendl;
f67539c2 3430 return -CEPHFS_EBADF;
7c673cae
FG
3431 }
3432
f6b5b4d7 3433 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
f67539c2 3434 return -CEPHFS_EBADF;
f6b5b4d7
TL
3435
3436 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
f67539c2 3437 return -CEPHFS_EIO;
f6b5b4d7 3438
7c673cae
FG
3439 int implemented;
3440 int have = in->caps_issued(&implemented);
3441
3442 bool waitfor_caps = false;
3443 bool waitfor_commit = false;
3444
3445 if (have & need & CEPH_CAP_FILE_WR) {
1911f103
TL
3446 if (endoff > 0) {
3447 if ((endoff >= (loff_t)in->max_size ||
3448 endoff > (loff_t)(in->size << 1)) &&
3449 endoff > (loff_t)in->wanted_max_size) {
3450 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3451 in->wanted_max_size = endoff;
3452 }
3453 if (in->wanted_max_size > in->max_size &&
3454 in->wanted_max_size > in->requested_max_size)
3455 check_caps(in, 0);
7c673cae
FG
3456 }
3457
3458 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3459 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3460 waitfor_caps = true;
3461 }
3462 if (!in->cap_snaps.empty()) {
3463 if (in->cap_snaps.rbegin()->second.writing) {
3464 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3465 waitfor_caps = true;
3466 }
3467 for (auto &p : in->cap_snaps) {
3468 if (p.second.dirty_data) {
3469 waitfor_commit = true;
3470 break;
3471 }
3472 }
3473 if (waitfor_commit) {
3474 _flush(in, new C_Client_FlushComplete(this, in));
3475 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3476 }
3477 }
3478 }
3479
3480 if (!waitfor_caps && !waitfor_commit) {
3481 if ((have & need) == need) {
7c673cae
FG
3482 int revoking = implemented & ~have;
3483 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3484 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3485 << " revoking " << ccap_string(revoking)
7c673cae 3486 << dendl;
c07f9fc5 3487 if ((revoking & want) == 0) {
7c673cae
FG
3488 *phave = need | (have & want);
3489 in->get_cap_ref(need);
f67539c2 3490 cap_hit();
7c673cae
FG
3491 return 0;
3492 }
3493 }
3494 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3495 waitfor_caps = true;
3496 }
3497
3498 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3499 in->auth_cap->session->readonly)
f67539c2 3500 return -CEPHFS_EROFS;
7c673cae
FG
3501
3502 if (in->flags & I_CAP_DROPPED) {
3503 int mds_wanted = in->caps_mds_wanted();
3504 if ((mds_wanted & need) != need) {
3505 int ret = _renew_caps(in);
3506 if (ret < 0)
3507 return ret;
3508 continue;
3509 }
a8e16298 3510 if (!(file_wanted & ~mds_wanted))
7c673cae 3511 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3512 }
3513
3514 if (waitfor_caps)
3515 wait_on_list(in->waitfor_caps);
3516 else if (waitfor_commit)
3517 wait_on_list(in->waitfor_commit);
3518 }
3519}
3520
3521int Client::get_caps_used(Inode *in)
3522{
3523 unsigned used = in->caps_used();
3524 if (!(used & CEPH_CAP_FILE_CACHE) &&
3525 !objectcacher->set_is_empty(&in->oset))
3526 used |= CEPH_CAP_FILE_CACHE;
3527 return used;
3528}
3529
3530void Client::cap_delay_requeue(Inode *in)
3531{
11fdf7f2 3532 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3533 in->hold_caps_until = ceph_clock_now();
3534 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3535 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3536}
3537
3538void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
eafe8130 3539 int flags, int used, int want, int retain,
7c673cae
FG
3540 int flush, ceph_tid_t flush_tid)
3541{
3542 int held = cap->issued | cap->implemented;
3543 int revoking = cap->implemented & ~cap->issued;
3544 retain &= ~revoking;
3545 int dropping = cap->issued & ~retain;
3546 int op = CEPH_CAP_OP_UPDATE;
3547
11fdf7f2 3548 ldout(cct, 10) << __func__ << " " << *in
7c673cae 3549 << " mds." << session->mds_num << " seq " << cap->seq
7c673cae
FG
3550 << " used " << ccap_string(used)
3551 << " want " << ccap_string(want)
3552 << " flush " << ccap_string(flush)
3553 << " retain " << ccap_string(retain)
3554 << " held "<< ccap_string(held)
3555 << " revoking " << ccap_string(revoking)
3556 << " dropping " << ccap_string(dropping)
3557 << dendl;
3558
3559 if (cct->_conf->client_inject_release_failure && revoking) {
3560 const int would_have_issued = cap->issued & retain;
3561 const int would_have_implemented = cap->implemented & (cap->issued | used);
3562 // Simulated bug:
3563 // - tell the server we think issued is whatever they issued plus whatever we implemented
3564 // - leave what we have implemented in place
3565 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3566 cap->issued = cap->issued | cap->implemented;
3567
3568 // Make an exception for revoking xattr caps: we are injecting
3569 // failure to release other caps, but allow xattr because client
3570 // will block on xattr ops if it can't release these to MDS (#9800)
3571 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3572 cap->issued ^= xattr_mask & revoking;
3573 cap->implemented ^= xattr_mask & revoking;
3574
3575 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3576 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3577 } else {
3578 // Normal behaviour
3579 cap->issued &= retain;
3580 cap->implemented &= cap->issued | used;
3581 }
3582
3583 snapid_t follows = 0;
3584
3585 if (flush)
3586 follows = in->snaprealm->get_snap_context().seq;
20effc67 3587
9f95a23c 3588 auto m = make_message<MClientCaps>(op,
7c673cae
FG
3589 in->ino,
3590 0,
3591 cap->cap_id, cap->seq,
3592 cap->implemented,
3593 want,
3594 flush,
3595 cap->mseq,
3596 cap_epoch_barrier);
3597 m->caller_uid = in->cap_dirtier_uid;
3598 m->caller_gid = in->cap_dirtier_gid;
3599
3600 m->head.issue_seq = cap->issue_seq;
3601 m->set_tid(flush_tid);
3602
3603 m->head.uid = in->uid;
3604 m->head.gid = in->gid;
3605 m->head.mode = in->mode;
20effc67 3606
7c673cae 3607 m->head.nlink = in->nlink;
20effc67 3608
7c673cae 3609 if (flush & CEPH_CAP_XATTR_EXCL) {
11fdf7f2 3610 encode(in->xattrs, m->xattrbl);
7c673cae
FG
3611 m->head.xattr_version = in->xattr_version;
3612 }
20effc67 3613
7c673cae
FG
3614 m->size = in->size;
3615 m->max_size = in->max_size;
3616 m->truncate_seq = in->truncate_seq;
3617 m->truncate_size = in->truncate_size;
3618 m->mtime = in->mtime;
3619 m->atime = in->atime;
3620 m->ctime = in->ctime;
3621 m->btime = in->btime;
3622 m->time_warp_seq = in->time_warp_seq;
3623 m->change_attr = in->change_attr;
eafe8130
TL
3624
3625 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3626 !in->cap_snaps.empty() &&
3627 in->cap_snaps.rbegin()->second.flush_tid == 0)
3628 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3629 m->flags = flags;
3630
7c673cae
FG
3631 if (flush & CEPH_CAP_FILE_WR) {
3632 m->inline_version = in->inline_version;
3633 m->inline_data = in->inline_data;
3634 }
3635
3636 in->reported_size = in->size;
3637 m->set_snap_follows(follows);
3638 cap->wanted = want;
3639 if (cap == in->auth_cap) {
1911f103
TL
3640 if (want & CEPH_CAP_ANY_FILE_WR) {
3641 m->set_max_size(in->wanted_max_size);
3642 in->requested_max_size = in->wanted_max_size;
3643 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3644 } else {
3645 in->requested_max_size = 0;
3646 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3647 }
7c673cae
FG
3648 }
3649
3650 if (!session->flushing_caps_tids.empty())
3651 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3652
11fdf7f2 3653 session->con->send_message2(std::move(m));
7c673cae
FG
3654}
3655
31f18b77
FG
3656static bool is_max_size_approaching(Inode *in)
3657{
3658 /* mds will adjust max size according to the reported size */
3659 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3660 return false;
3661 if (in->size >= in->max_size)
3662 return true;
3663 /* half of previous max_size increment has been used */
3664 if (in->max_size > in->reported_size &&
3665 (in->size << 1) >= in->max_size + in->reported_size)
3666 return true;
3667 return false;
3668}
7c673cae 3669
11fdf7f2
TL
3670static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3671{
3672 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3673 return used;
3674 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3675 return used;
3676
3677 if (issued & CEPH_CAP_FILE_LAZYIO) {
3678 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3679 used &= ~CEPH_CAP_FILE_CACHE;
3680 used |= CEPH_CAP_FILE_LAZYIO;
3681 }
3682 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3683 used &= ~CEPH_CAP_FILE_BUFFER;
3684 used |= CEPH_CAP_FILE_LAZYIO;
3685 }
3686 } else {
3687 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3688 used &= ~CEPH_CAP_FILE_CACHE;
3689 used |= CEPH_CAP_FILE_LAZYIO;
3690 }
3691 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3692 used &= ~CEPH_CAP_FILE_BUFFER;
3693 used |= CEPH_CAP_FILE_LAZYIO;
3694 }
3695 }
3696 return used;
3697}
3698
7c673cae
FG
3699/**
3700 * check_caps
3701 *
3702 * Examine currently used and wanted versus held caps. Release, flush or ack
3703 * revoked caps to the MDS as appropriate.
3704 *
3705 * @param in the inode to check
3706 * @param flags flags to apply to cap check
3707 */
/*
 * Re-evaluate this inode's capabilities against each MDS holding a cap
 * and, where needed, send a cap message: release caps we no longer want,
 * flush dirty metadata, report completed revocations, or ask for a
 * larger max_size.
 *
 * @param in    inode whose cap state changed
 * @param flags CHECK_CAPS_NODELAY to send updates now instead of
 *              requeueing on the delayed list; CHECK_CAPS_SYNCHRONOUS to
 *              tag flushes with MClientCaps::FLAG_SYNC
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;  // caps the MDS wants back

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Work out which caps we'd like to keep.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // If a cache-related cap is being revoked and we are not buffering,
  // try to drop our object cache so the revocation can complete.
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  for (auto &[mds, cap] : in->caps) {
    auto session = mds_sessions.at(mds);

    cap_used = used;
    // caps the auth MDS already issued don't count as "used" on replicas
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // on unmount, proactively return unused caps
    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // resend any interrupted flushes before the new cap message
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session.get());
      }
      // kick off any capsnap flush that hasn't been started yet
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    in->delay_cap_item.remove_myself();
    send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3843
3844
/*
 * Capture the inode's current (dirty) state into a CapSnap for the given
 * (old) snap context so it can later be flushed to the MDS. If nothing is
 * dirty or being written, no capsnap is queued.
 *
 * @param in        inode being snapshotted
 * @param old_snapc snap context the pending state belongs to
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the most recent capsnap is still accumulating writes; don't stack
    // another one on top of it
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // keyed by the old snap context's seq; must not already exist
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata that FLUSHSNAP will carry
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // writes still in flight: finish_cap_snap() happens when they drain
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3886
/*
 * Finalize a queued CapSnap once writes have stopped: record the final
 * size/time attributes and either flush it now or, if buffered dirty data
 * remains, defer until writeback completes.
 *
 * @param in      the inode the capsnap belongs to
 * @param capsnap the capsnap being finalized
 * @param used    caps currently in use on the inode
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers still pending writeback; flush_snaps will run later
    capsnap.writing = 1;
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3918
eafe8130
TL
3919void Client::send_flush_snap(Inode *in, MetaSession *session,
3920 snapid_t follows, CapSnap& capsnap)
3921{
9f95a23c
TL
3922 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3923 in->ino, in->snaprealm->ino, 0,
3924 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
3925 m->caller_uid = capsnap.cap_dirtier_uid;
3926 m->caller_gid = capsnap.cap_dirtier_gid;
3927
3928 m->set_client_tid(capsnap.flush_tid);
3929 m->head.snap_follows = follows;
3930
3931 m->head.caps = capsnap.issued;
3932 m->head.dirty = capsnap.dirty;
3933
3934 m->head.uid = capsnap.uid;
3935 m->head.gid = capsnap.gid;
3936 m->head.mode = capsnap.mode;
3937 m->btime = capsnap.btime;
3938
3939 m->size = capsnap.size;
3940
3941 m->head.xattr_version = capsnap.xattr_version;
3942 encode(capsnap.xattrs, m->xattrbl);
3943
3944 m->ctime = capsnap.ctime;
3945 m->btime = capsnap.btime;
3946 m->mtime = capsnap.mtime;
3947 m->atime = capsnap.atime;
3948 m->time_warp_seq = capsnap.time_warp_seq;
3949 m->change_attr = capsnap.change_attr;
3950
3951 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3952 m->inline_version = in->inline_version;
3953 m->inline_data = in->inline_data;
3954 }
3955
3956 ceph_assert(!session->flushing_caps_tids.empty());
3957 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3958
3959 session->con->send_message2(std::move(m));
3960}
3961
/*
 * Send FLUSHSNAP messages for every queued capsnap on this inode that has
 * not been flushed yet. Stops at the first capsnap that is still being
 * written, since capsnaps must be flushed in order.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // capsnaps must flush in order; a still-active one blocks the rest
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush tid on both the session and the inode so acks
    // can be matched up later
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3996
9f95a23c 3997void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 3998{
9f95a23c 3999 ceph::condition_variable cond;
7c673cae 4000 ls.push_back(&cond);
9f95a23c
TL
4001 std::unique_lock l{client_lock, std::adopt_lock};
4002 cond.wait(l);
4003 l.release();
7c673cae
FG
4004 ls.remove(&cond);
4005}
4006
9f95a23c 4007void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 4008{
9f95a23c
TL
4009 for (auto cond : ls) {
4010 cond->notify_all();
4011 }
7c673cae
FG
4012}
4013
4014void Client::wait_on_context_list(list<Context*>& ls)
4015{
9f95a23c 4016 ceph::condition_variable cond;
7c673cae
FG
4017 bool done = false;
4018 int r;
9f95a23c
TL
4019 ls.push_back(new C_Cond(cond, &done, &r));
4020 std::unique_lock l{client_lock, std::adopt_lock};
4021 cond.wait(l, [&done] { return done;});
4022 l.release();
7c673cae
FG
4023}
4024
4025void Client::signal_context_list(list<Context*>& ls)
4026{
4027 while (!ls.empty()) {
4028 ls.front()->complete(0);
4029 ls.pop_front();
4030 }
4031}
4032
/*
 * Wake all waiters on every inode with a cap in this session, after the
 * session has been re-established.
 *
 * @param s         the session that came back
 * @param reconnect true when the session was fully reconnected (reset the
 *                  max_size negotiation state); false for a stale-cap
 *                  renewal, where caps the MDS did not re-issue are
 *                  downgraded to PIN
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
4052
4053
4054// flush dirty data (from objectcache)
4055
/*
 * Finisher context that invokes the client's cache-invalidate callback
 * for a byte range of an inode. Queued on async_ino_invalidator so the
 * upcall runs outside client_lock.
 */
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction (faked ino if enabled)
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
4075
4076void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4077{
f67539c2
TL
4078 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4079 if (!mref_reader.is_state_satisfied())
7c673cae 4080 return;
f67539c2 4081
11fdf7f2 4082 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
4083 ino_invalidate_cb(callback_handle, ino, off, len);
4084}
4085
4086void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4087
4088 if (ino_invalidate_cb)
4089 // we queue the invalidate, which calls the callback and decrements the ref
4090 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4091}
4092
4093void Client::_invalidate_inode_cache(Inode *in)
4094{
11fdf7f2 4095 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
4096
4097 // invalidate our userspace inode cache
94b18763 4098 if (cct->_conf->client_oc) {
7c673cae 4099 objectcacher->release_set(&in->oset);
94b18763
FG
4100 if (!objectcacher->set_is_empty(&in->oset))
4101 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4102 }
7c673cae
FG
4103
4104 _schedule_invalidate_callback(in, 0, 0);
4105}
4106
4107void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4108{
11fdf7f2 4109 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
4110
4111 // invalidate our userspace inode cache
4112 if (cct->_conf->client_oc) {
4113 vector<ObjectExtent> ls;
4114 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 4115 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
4116 }
4117
4118 _schedule_invalidate_callback(in, off, len);
4119}
4120
4121bool Client::_release(Inode *in)
4122{
4123 ldout(cct, 20) << "_release " << *in << dendl;
4124 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4125 _invalidate_inode_cache(in);
4126 return true;
4127 }
4128 return false;
4129}
4130
4131bool Client::_flush(Inode *in, Context *onfinish)
4132{
4133 ldout(cct, 10) << "_flush " << *in << dendl;
4134
4135 if (!in->oset.dirty_or_tx) {
4136 ldout(cct, 10) << " nothing to flush" << dendl;
4137 onfinish->complete(0);
4138 return true;
4139 }
4140
4141 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 4142 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
4143 objectcacher->purge_set(&in->oset);
4144 if (onfinish) {
f67539c2 4145 onfinish->complete(-CEPHFS_ENOSPC);
7c673cae
FG
4146 }
4147 return true;
4148 }
4149
4150 return objectcacher->flush_set(&in->oset, onfinish);
4151}
4152
/*
 * Synchronously flush a byte range of this inode's buffered data.
 * client_lock must be held; it is dropped while waiting for the flush to
 * complete and reacquired before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    // drop the client lock so objectcacher callbacks can make progress
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
4171
4172void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4173{
f67539c2
TL
4174 // std::scoped_lock l(client_lock);
4175 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
7c673cae 4176 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 4177 ceph_assert(in);
7c673cae
FG
4178 _flushed(in);
4179}
4180
4181void Client::_flushed(Inode *in)
4182{
4183 ldout(cct, 10) << "_flushed " << *in << dendl;
4184
4185 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4186}
4187
4188
4189
4190// checks common to add_update_cap, handle_cap_grant
11fdf7f2 4191void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
4192{
4193 unsigned had = in->caps_issued();
4194
4195 if ((issued & CEPH_CAP_FILE_CACHE) &&
4196 !(had & CEPH_CAP_FILE_CACHE))
4197 in->cache_gen++;
4198
f91f0fd5
TL
4199 if ((issued & CEPH_CAP_FILE_SHARED) !=
4200 (had & CEPH_CAP_FILE_SHARED)) {
4201 if (issued & CEPH_CAP_FILE_SHARED)
4202 in->shared_gen++;
7c673cae
FG
4203 if (in->is_dir())
4204 clear_dir_complete_and_ordered(in, true);
4205 }
4206}
4207
/*
 * Insert or update the cap this MDS holds on the inode, as granted by a
 * cap import/grant. Also maintains the inode's snaprealm membership and
 * auth-cap designation, and wakes waiters if new caps were gained.
 *
 * @param in          inode the cap applies to
 * @param mds_session session of the granting MDS
 * @param cap_id      cap id from the MDS
 * @param issued      caps being issued
 * @param wanted      caps the MDS believes we want
 * @param seq, mseq   cap sequence / migration sequence numbers
 * @param realm       snaprealm ino (inodeno_t(-1) == no realm change)
 * @param flags       CEPH_CAP_FLAG_* (AUTH marks the auth MDS)
 * @param cap_perms   perms to remember as the cap's latest user
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  // first cap on this inode opens its snaprealm; an auth cap may also
  // move the inode to a different realm
  if (!in->is_any_caps()) {
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // pre-existing cap: if it went stale, downgrade before merging
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // brand-new cap pins the inode
    inc_pinned_icaps();
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if it is newer (by mseq) than the
    // current one, migrating any in-flight flushes to the new session
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer mseq resets "wanted"; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // anyone blocked waiting for these caps can proceed now
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4306
/*
 * Remove a cap from its inode and session.
 *
 * @param cap           the cap to remove (invalid after this call)
 * @param queue_release true to queue a cap-release message back to the
 *                      MDS; false (e.g. session teardown) just drops the
 *                      pinned-icap count
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  } else {
    dec_pinned_icaps();
  }

  if (in.auth_cap == cap) {
    // losing the auth cap: abandon the flushing bookkeeping tied to it
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys the Cap; null the dangling pointer
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    // last cap gone: leave the snaprealm
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4345
4346void Client::remove_all_caps(Inode *in)
4347{
4348 while (!in->caps.empty())
11fdf7f2 4349 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4350}
4351
/*
 * Tear down every cap belonging to a (dead) session: drop dirty/flushing
 * state, mark dropped caps, purge or release cached data depending on the
 * error, and wake all waiters.
 *
 * @param s   the session being torn down
 * @param err reason for the teardown; -CEPHFS_EBLOCKLISTED purges dirty
 *            data (and records the error on the inode) instead of
 *            releasing it cleanly
 */
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // nonzero dirty|flushing bits collapse to true here
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      // file locks can no longer be maintained without the session
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // the dirty metadata can never reach the MDS now; discard it
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    // if we lost Fc/Fb, deal with any cached file data
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -CEPHFS_EBLOCKLISTED) {
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4401
/*
 * Invoke the registered remount callback (used to make the kernel trim
 * its dentries). On failure, optionally retries up to
 * mds_max_retries_on_remount_failure times before aborting the client,
 * unless the relevant "die on failure" options are disabled or we are
 * unmounting anyway.
 *
 * @param retry_on_error whether repeated failures are tolerated before
 *                       aborting
 * @return the callback's return value (0 on success)
 */
int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;
  } else {
    int e = errno;
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
        "failed to remount (to trim kernel dentries): "
        "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
        "failed to remount (to trim kernel dentries): "
        "return code = " << r << dendl;
    }
    // abort only when configured to treat this as fatal and the retry
    // budget (if any) is exhausted
    bool should_abort =
      (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !is_unmounting()) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4433
7c673cae
FG
/*
 * Finisher context that triggers a remount (with retry-on-error) from
 * outside the client lock; queued by _invalidate_kernel_dcache().
 */
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};
4444
/*
 * Ask the kernel (or FUSE layer) to drop its cached dentries: either by
 * invalidating root's child dentries one by one, or — when only a remount
 * callback is available — by queueing a remount, which makes the kernel
 * trim all unused dentries. No-op unless at least mounting.
 */
void Client::_invalidate_kernel_dcache()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
           p != root->dir->dentries.end();
           ++p) {
        if (p->second->inode)
          _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4466
91327a77
AA
/*
 * For a directory whose children are all negative (null) dentries, unlink
 * every expireable one and close the dir if it becomes empty. Recurses
 * into an open snapdir, if any.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // only act when every dentry in the dir is null
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can invalidate the iterator
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4491
e306af50
TL
/*
 * Finisher context that invokes the client's inode-release upcall for a
 * single inode. Queued on async_ino_releasor so the callback runs
 * outside client_lock.
 */
class C_Client_CacheRelease : public Context {
private:
  Client *client;
  vinodeno_t ino;  // captured at construction (faked ino if enabled)
public:
  C_Client_CacheRelease(Client *c, Inode *in) :
    client(c) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_inode_release(ino);
  }
};
4509
4510void Client::_async_inode_release(vinodeno_t ino)
4511{
f67539c2
TL
4512 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4513 if (!mref_reader.is_state_satisfied())
e306af50 4514 return;
f67539c2 4515
e306af50
TL
4516 ldout(cct, 10) << __func__ << " " << ino << dendl;
4517 ino_release_cb(callback_handle, ino);
4518}
4519
4520void Client::_schedule_ino_release_callback(Inode *in) {
4521
4522 if (ino_release_cb)
4523 // we queue the invalidate, which calls the callback and decrements the ref
4524 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4525}
4526
/*
 * Try to shrink the number of caps held on a session down to at most
 * `max` (e.g. in response to an MDS cache-pressure request). Disposable
 * non-auth caps are released outright; otherwise we trim expireable
 * dentries so the corresponding inodes (and caps) can go. If that is
 * insufficient, fall back to invalidating the kernel dcache.
 *
 * @param s   session to trim
 * @param max target maximum cap count
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
                 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // hold a ref while we work on this inode

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // stays true only if every dentry is expireable
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before possibly queueing dn for trimming
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      if (all && in->ino != CEPH_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  // now it is safe to actually drop the queued dentries
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  // still above target: have the kernel drop its dentries too
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4597
4598void Client::force_session_readonly(MetaSession *s)
4599{
4600 s->readonly = true;
4601 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4602 auto &in = (*p)->inode;
4603 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4604 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4605 }
4606}
4607
7c673cae
FG
/*
 * Transition this inode's dirty caps to "flushing": allocate a flush tid,
 * record it on the inode and the auth session, and clear the dirty bits.
 *
 * @param in   inode with dirty caps (asserted)
 * @param ptid out: the allocated flush tid
 * @return the cap bits now being flushed
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush on this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4635
4636void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4637{
4638 for (auto &p : in->cap_snaps) {
4639 CapSnap &capsnap = p.second;
4640 if (capsnap.flush_tid > 0) {
4641 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4642 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4643 }
4644 }
4645 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4646 it != in->flushing_cap_tids.end();
4647 ++it) {
4648 old_s->flushing_caps_tids.erase(it->first);
4649 new_s->flushing_caps_tids.insert(it->first);
4650 }
4651 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4652}
4653
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance first: check_caps may remove in from the dirty list
      ++p;
      // only the last inode per session gets the synchronous flag
      if (p.end())
        flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4676
7c673cae
FG
/*
 * Block until every cap flush on this inode with tid <= want has been
 * acked by the MDS. Sleeps on the inode's waitfor_caps list (dropping
 * client_lock while waiting via wait_on_list).
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    // oldest outstanding tid is already past what we need
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4690
4691void Client::wait_sync_caps(ceph_tid_t want)
4692{
4693 retry:
11fdf7f2 4694 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
7c673cae 4695 << num_flushing_caps << " total flushing)" << dendl;
11fdf7f2 4696 for (auto &p : mds_sessions) {
20effc67 4697 auto s = p.second;
7c673cae
FG
4698 if (s->flushing_caps_tids.empty())
4699 continue;
4700 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4701 if (oldest_tid <= want) {
11fdf7f2 4702 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
7c673cae 4703 << " (want " << want << ")" << dendl;
9f95a23c
TL
4704 std::unique_lock l{client_lock, std::adopt_lock};
4705 sync_cond.wait(l);
4706 l.release();
7c673cae
FG
4707 goto retry;
4708 }
4709 }
4710}
4711
eafe8130
TL
4712void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4713{
4714 in->flags &= ~I_KICK_FLUSH;
4715
4716 Cap *cap = in->auth_cap;
4717 ceph_assert(cap->session == session);
4718
4719 ceph_tid_t last_snap_flush = 0;
4720 for (auto p = in->flushing_cap_tids.rbegin();
4721 p != in->flushing_cap_tids.rend();
4722 ++p) {
4723 if (!p->second) {
4724 last_snap_flush = p->first;
4725 break;
4726 }
4727 }
4728
4729 int wanted = in->caps_wanted();
4730 int used = get_caps_used(in) | in->caps_dirty();
4731 auto it = in->cap_snaps.begin();
4732 for (auto& p : in->flushing_cap_tids) {
4733 if (p.second) {
4734 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4735 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4736 p.second, p.first);
4737 } else {
4738 ceph_assert(it != in->cap_snaps.end());
4739 ceph_assert(it->second.flush_tid == p.first);
4740 send_flush_snap(in, session, it->first, it->second);
4741 ++it;
4742 }
4743 }
4744}
4745
7c673cae
FG
4746void Client::kick_flushing_caps(MetaSession *session)
4747{
4748 mds_rank_t mds = session->mds_num;
11fdf7f2 4749 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4750
4751 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4752 Inode *in = *p;
eafe8130
TL
4753 if (in->flags & I_KICK_FLUSH) {
4754 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4755 kick_flushing_caps(in, session);
4756 }
7c673cae 4757 }
7c673cae
FG
4758}
4759
4760void Client::early_kick_flushing_caps(MetaSession *session)
4761{
7c673cae
FG
4762 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4763 Inode *in = *p;
11fdf7f2
TL
4764 Cap *cap = in->auth_cap;
4765 ceph_assert(cap);
7c673cae
FG
4766
4767 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4768 // stage. This guarantees that MDS processes the cap flush message before issuing
4769 // the flushing caps to other client.
eafe8130
TL
4770 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4771 in->flags |= I_KICK_FLUSH;
7c673cae 4772 continue;
eafe8130 4773 }
7c673cae
FG
4774
4775 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4776 << " to mds." << session->mds_num << dendl;
11fdf7f2
TL
4777 // send_reconnect() also will reset these sequence numbers. make sure
4778 // sequence numbers in cap flush message match later reconnect message.
4779 cap->seq = 0;
4780 cap->issue_seq = 0;
4781 cap->mseq = 0;
4782 cap->issued = cap->implemented;
4783
eafe8130 4784 kick_flushing_caps(in, session);
7c673cae
FG
4785 }
4786}
4787
7c673cae
FG
4788void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4789{
4790 list<SnapRealm*> q;
4791 q.push_back(realm);
4792
4793 while (!q.empty()) {
4794 realm = q.front();
4795 q.pop_front();
4796
11fdf7f2 4797 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4798 realm->invalidate_cache();
4799
4800 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4801 p != realm->pchildren.end();
4802 ++p)
4803 q.push_back(*p);
4804 }
4805}
4806
4807SnapRealm *Client::get_snap_realm(inodeno_t r)
4808{
4809 SnapRealm *realm = snap_realms[r];
4810 if (!realm)
4811 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4812 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4813 realm->nref++;
4814 return realm;
4815}
4816
4817SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4818{
4819 if (snap_realms.count(r) == 0) {
11fdf7f2 4820 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4821 return NULL;
4822 }
4823 SnapRealm *realm = snap_realms[r];
11fdf7f2 4824 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4825 realm->nref++;
4826 return realm;
4827}
4828
4829void Client::put_snap_realm(SnapRealm *realm)
4830{
11fdf7f2 4831 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
7c673cae
FG
4832 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4833 if (--realm->nref == 0) {
4834 snap_realms.erase(realm->ino);
4835 if (realm->pparent) {
4836 realm->pparent->pchildren.erase(realm);
4837 put_snap_realm(realm->pparent);
4838 }
4839 delete realm;
4840 }
4841}
4842
4843bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4844{
4845 if (realm->parent != parent) {
11fdf7f2 4846 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4847 << " " << realm->parent << " -> " << parent << dendl;
4848 realm->parent = parent;
4849 if (realm->pparent) {
4850 realm->pparent->pchildren.erase(realm);
4851 put_snap_realm(realm->pparent);
4852 }
4853 realm->pparent = get_snap_realm(parent);
4854 realm->pparent->pchildren.insert(realm);
4855 return true;
4856 }
4857 return false;
4858}
4859
4860static bool has_new_snaps(const SnapContext& old_snapc,
4861 const SnapContext& new_snapc)
4862{
4863 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4864}
4865
4866
11fdf7f2 4867void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4868{
4869 SnapRealm *first_realm = NULL;
11fdf7f2 4870 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4871
4872 map<SnapRealm*, SnapContext> dirty_realms;
4873
11fdf7f2 4874 auto p = bl.cbegin();
7c673cae
FG
4875 while (!p.end()) {
4876 SnapRealmInfo info;
11fdf7f2 4877 decode(info, p);
7c673cae
FG
4878 SnapRealm *realm = get_snap_realm(info.ino());
4879
4880 bool invalidate = false;
4881
4882 if (info.seq() > realm->seq) {
11fdf7f2 4883 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4884 << dendl;
4885
4886 if (flush) {
4887 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4888 // flush me + children
4889 list<SnapRealm*> q;
4890 q.push_back(realm);
4891 while (!q.empty()) {
4892 SnapRealm *realm = q.front();
4893 q.pop_front();
4894
4895 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4896 p != realm->pchildren.end();
4897 ++p)
4898 q.push_back(*p);
4899
4900 if (dirty_realms.count(realm) == 0) {
4901 realm->nref++;
4902 dirty_realms[realm] = realm->get_snap_context();
4903 }
4904 }
4905 }
4906
4907 // update
4908 realm->seq = info.seq();
4909 realm->created = info.created();
4910 realm->parent_since = info.parent_since();
4911 realm->prior_parent_snaps = info.prior_parent_snaps;
4912 realm->my_snaps = info.my_snaps;
4913 invalidate = true;
4914 }
4915
4916 // _always_ verify parent
4917 if (adjust_realm_parent(realm, info.parent()))
4918 invalidate = true;
4919
4920 if (invalidate) {
4921 invalidate_snaprealm_and_children(realm);
11fdf7f2 4922 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4923 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4924 } else {
11fdf7f2 4925 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4926 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4927 }
f67539c2 4928
7c673cae
FG
4929 if (!first_realm)
4930 first_realm = realm;
4931 else
4932 put_snap_realm(realm);
4933 }
4934
f67539c2 4935 for (auto &[realm, snapc] : dirty_realms) {
7c673cae 4936 // if there are new snaps ?
f67539c2 4937 if (has_new_snaps(snapc, realm->get_snap_context())) {
7c673cae 4938 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
f67539c2
TL
4939 for (auto&& in : realm->inodes_with_caps) {
4940 queue_cap_snap(in, snapc);
7c673cae
FG
4941 }
4942 } else {
4943 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4944 }
4945 put_snap_realm(realm);
4946 }
4947
4948 if (realm_ret)
4949 *realm_ret = first_realm;
4950 else
4951 put_snap_realm(first_realm);
4952}
4953
11fdf7f2 4954void Client::handle_snap(const MConstRef<MClientSnap>& m)
7c673cae 4955{
11fdf7f2 4956 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 4957 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
4958
4959 std::scoped_lock cl(client_lock);
20effc67 4960 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 4961 if (!session) {
7c673cae
FG
4962 return;
4963 }
4964
20effc67 4965 got_mds_push(session.get());
7c673cae
FG
4966
4967 map<Inode*, SnapContext> to_move;
4968 SnapRealm *realm = 0;
4969
4970 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
11fdf7f2 4971 ceph_assert(m->head.split);
7c673cae 4972 SnapRealmInfo info;
11fdf7f2
TL
4973 auto p = m->bl.cbegin();
4974 decode(info, p);
4975 ceph_assert(info.ino() == m->head.split);
7c673cae
FG
4976
4977 // flush, then move, ino's.
4978 realm = get_snap_realm(info.ino());
4979 ldout(cct, 10) << " splitting off " << *realm << dendl;
11fdf7f2
TL
4980 for (auto& ino : m->split_inos) {
4981 vinodeno_t vino(ino, CEPH_NOSNAP);
7c673cae
FG
4982 if (inode_map.count(vino)) {
4983 Inode *in = inode_map[vino];
4984 if (!in->snaprealm || in->snaprealm == realm)
4985 continue;
4986 if (in->snaprealm->created > info.created()) {
4987 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4988 << *in->snaprealm << dendl;
4989 continue;
4990 }
4991 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4992
4993
4994 in->snaprealm_item.remove_myself();
4995 to_move[in] = in->snaprealm->get_snap_context();
4996 put_snap_realm(in->snaprealm);
4997 }
4998 }
4999
5000 // move child snaprealms, too
11fdf7f2
TL
5001 for (auto& child_realm : m->split_realms) {
5002 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5003 SnapRealm *child = get_snap_realm_maybe(child_realm);
7c673cae
FG
5004 if (!child)
5005 continue;
5006 adjust_realm_parent(child, realm->ino);
5007 put_snap_realm(child);
5008 }
5009 }
5010
5011 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5012
5013 if (realm) {
5014 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5015 Inode *in = p->first;
5016 in->snaprealm = realm;
5017 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5018 realm->nref++;
5019 // queue for snap writeback
5020 if (has_new_snaps(p->second, realm->get_snap_context()))
5021 queue_cap_snap(in, p->second);
5022 }
5023 put_snap_realm(realm);
5024 }
7c673cae
FG
5025}
5026
11fdf7f2 5027void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
5028{
5029 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5030
5031 std::scoped_lock cl(client_lock);
20effc67 5032 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5033 if (!session) {
7c673cae
FG
5034 return;
5035 }
5036
20effc67 5037 got_mds_push(session.get());
7c673cae 5038
11fdf7f2 5039 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
5040
5041 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5042 if (inode_map.count(vino)) {
5043 Inode *in = NULL;
5044 in = inode_map[vino];
5045
5046 if (in) {
5047 in->quota = m->quota;
5048 in->rstat = m->rstat;
5049 }
5050 }
7c673cae
FG
5051}
5052
11fdf7f2 5053void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
5054{
5055 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5056
5057 std::scoped_lock cl(client_lock);
20effc67 5058 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5059 if (!session) {
7c673cae
FG
5060 return;
5061 }
5062
5063 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5064 // Pause RADOS operations until we see the required epoch
5065 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5066 }
5067
5068 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5069 // Record the barrier so that we will transmit it to MDS when releasing
5070 set_cap_epoch_barrier(m->osd_epoch_barrier);
5071 }
5072
20effc67 5073 got_mds_push(session.get());
7c673cae 5074
11fdf7f2 5075 Inode *in;
7c673cae 5076 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
5077 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5078 in = it->second;
5079 } else {
7c673cae 5080 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 5081 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
5082 session->enqueue_cap_release(
5083 m->get_ino(),
5084 m->get_cap_id(),
5085 m->get_seq(),
5086 m->get_mseq(),
5087 cap_epoch_barrier);
5088 } else {
11fdf7f2 5089 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 5090 }
7c673cae
FG
5091
5092 // in case the mds is waiting on e.g. a revocation
5093 flush_cap_releases();
5094 return;
5095 }
5096
5097 switch (m->get_op()) {
20effc67
TL
5098 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5099 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5100 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
7c673cae
FG
5101 }
5102
11fdf7f2
TL
5103 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5104 Cap &cap = in->caps.at(mds);
7c673cae 5105
11fdf7f2 5106 switch (m->get_op()) {
20effc67 5107 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
11fdf7f2
TL
5108 case CEPH_CAP_OP_IMPORT:
5109 case CEPH_CAP_OP_REVOKE:
20effc67
TL
5110 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5111 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
11fdf7f2
TL
5112 }
5113 } else {
5114 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5115 return;
7c673cae
FG
5116 }
5117}
5118
11fdf7f2 5119void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5120{
5121 mds_rank_t mds = session->mds_num;
5122
11fdf7f2 5123 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5124 << " IMPORT from mds." << mds << dendl;
5125
5126 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5127 Cap *cap = NULL;
5128 UserPerm cap_perms;
11fdf7f2
TL
5129 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5130 cap = &it->second;
5131 cap_perms = cap->latest_perms;
7c673cae
FG
5132 }
5133
5134 // add/update it
5135 SnapRealm *realm = NULL;
5136 update_snap_trace(m->snapbl, &realm);
5137
1911f103
TL
5138 int issued = m->get_caps();
5139 int wanted = m->get_wanted();
7c673cae 5140 add_update_cap(in, session, m->get_cap_id(),
1911f103 5141 issued, wanted, m->get_seq(), m->get_mseq(),
a8e16298 5142 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
7c673cae
FG
5143
5144 if (cap && cap->cap_id == m->peer.cap_id) {
5145 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5146 }
5147
5148 if (realm)
5149 put_snap_realm(realm);
5150
eafe8130 5151 if (in->auth_cap && in->auth_cap->session == session) {
1911f103
TL
5152 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5153 in->requested_max_size > m->get_max_size()) {
5154 in->requested_max_size = 0;
5155 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5156 }
7c673cae 5157 // reflush any/all caps (if we are now the auth_cap)
eafe8130 5158 kick_flushing_caps(in, session);
7c673cae
FG
5159 }
5160}
5161
11fdf7f2 5162void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5163{
5164 mds_rank_t mds = session->mds_num;
5165
11fdf7f2 5166 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5167 << " EXPORT from mds." << mds << dendl;
5168
11fdf7f2
TL
5169 auto it = in->caps.find(mds);
5170 if (it != in->caps.end()) {
5171 Cap &cap = it->second;
5172 if (cap.cap_id == m->get_cap_id()) {
5173 if (m->peer.cap_id) {
5174 const auto peer_mds = mds_rank_t(m->peer.mds);
20effc67 5175 auto tsession = _get_or_open_mds_session(peer_mds);
11fdf7f2
TL
5176 auto it = in->caps.find(peer_mds);
5177 if (it != in->caps.end()) {
5178 Cap &tcap = it->second;
5179 if (tcap.cap_id == m->peer.cap_id &&
5180 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5181 tcap.cap_id = m->peer.cap_id;
5182 tcap.seq = m->peer.seq - 1;
5183 tcap.issue_seq = tcap.seq;
5184 tcap.issued |= cap.issued;
5185 tcap.implemented |= cap.issued;
5186 if (&cap == in->auth_cap)
5187 in->auth_cap = &tcap;
5188 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
20effc67 5189 adjust_session_flushing_caps(in, session, tsession.get());
11fdf7f2
TL
5190 }
5191 } else {
20effc67 5192 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
11fdf7f2
TL
5193 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5194 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5195 cap.latest_perms);
5196 }
7c673cae 5197 } else {
11fdf7f2
TL
5198 if (cap.wanted | cap.issued)
5199 in->flags |= I_CAP_DROPPED;
7c673cae 5200 }
7c673cae 5201
11fdf7f2
TL
5202 remove_cap(&cap, false);
5203 }
7c673cae 5204 }
7c673cae
FG
5205}
5206
11fdf7f2 5207void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5208{
5209 mds_rank_t mds = session->mds_num;
11fdf7f2 5210 ceph_assert(in->caps.count(mds));
7c673cae 5211
11fdf7f2 5212 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
5213 << " size " << in->size << " -> " << m->get_size()
5214 << dendl;
5215
1adf2230
AA
5216 int issued;
5217 in->caps_issued(&issued);
5218 issued |= in->caps_dirty();
5219 update_inode_file_size(in, issued, m->get_size(),
5220 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
5221}
5222
11fdf7f2 5223void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5224{
5225 ceph_tid_t flush_ack_tid = m->get_client_tid();
5226 int dirty = m->get_dirty();
5227 int cleaned = 0;
5228 int flushed = 0;
5229
11fdf7f2
TL
5230 auto it = in->flushing_cap_tids.begin();
5231 if (it->first < flush_ack_tid) {
5232 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5233 << " got unexpected flush ack tid " << flush_ack_tid
5234 << " expected is " << it->first << dendl;
5235 }
5236 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5237 if (!it->second) {
5238 // cap snap
5239 ++it;
5240 continue;
5241 }
7c673cae
FG
5242 if (it->first == flush_ack_tid)
5243 cleaned = it->second;
5244 if (it->first <= flush_ack_tid) {
5245 session->flushing_caps_tids.erase(it->first);
5246 in->flushing_cap_tids.erase(it++);
5247 ++flushed;
5248 continue;
5249 }
5250 cleaned &= ~it->second;
5251 if (!cleaned)
5252 break;
5253 ++it;
5254 }
5255
11fdf7f2 5256 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5257 << " cleaned " << ccap_string(cleaned) << " on " << *in
5258 << " with " << ccap_string(dirty) << dendl;
5259
5260 if (flushed) {
5261 signal_cond_list(in->waitfor_caps);
5262 if (session->flushing_caps_tids.empty() ||
5263 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5264 sync_cond.notify_all();
7c673cae
FG
5265 }
5266
5267 if (!dirty) {
5268 in->cap_dirtier_uid = -1;
5269 in->cap_dirtier_gid = -1;
5270 }
5271
5272 if (!cleaned) {
5273 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5274 } else {
5275 if (in->flushing_caps) {
5276 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5277 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5278 in->flushing_caps &= ~cleaned;
5279 if (in->flushing_caps == 0) {
5280 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5281 num_flushing_caps--;
eafe8130 5282 if (in->flushing_cap_tids.empty())
7c673cae
FG
5283 in->flushing_cap_item.remove_myself();
5284 }
5285 if (!in->caps_dirty())
5286 put_inode(in);
5287 }
5288 }
7c673cae
FG
5289}
5290
5291
11fdf7f2 5292void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae 5293{
eafe8130 5294 ceph_tid_t flush_ack_tid = m->get_client_tid();
7c673cae 5295 mds_rank_t mds = session->mds_num;
11fdf7f2 5296 ceph_assert(in->caps.count(mds));
7c673cae
FG
5297 snapid_t follows = m->get_snap_follows();
5298
11fdf7f2
TL
5299 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5300 auto& capsnap = it->second;
eafe8130
TL
5301 if (flush_ack_tid != capsnap.flush_tid) {
5302 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
7c673cae 5303 } else {
eafe8130 5304 InodeRef tmp_ref(in);
11fdf7f2 5305 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
7c673cae 5306 << " on " << *in << dendl;
7c673cae 5307 session->flushing_caps_tids.erase(capsnap.flush_tid);
eafe8130
TL
5308 in->flushing_cap_tids.erase(capsnap.flush_tid);
5309 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5310 in->flushing_cap_item.remove_myself();
11fdf7f2 5311 in->cap_snaps.erase(it);
eafe8130
TL
5312
5313 signal_cond_list(in->waitfor_caps);
5314 if (session->flushing_caps_tids.empty() ||
5315 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5316 sync_cond.notify_all();
7c673cae
FG
5317 }
5318 } else {
11fdf7f2 5319 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
7c673cae
FG
5320 << " on " << *in << dendl;
5321 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5322 }
7c673cae
FG
5323}
5324
5325class C_Client_DentryInvalidate : public Context {
5326private:
5327 Client *client;
5328 vinodeno_t dirino;
5329 vinodeno_t ino;
5330 string name;
5331public:
5332 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5333 client(c), name(dn->name) {
5334 if (client->use_faked_inos()) {
5335 dirino.ino = dn->dir->parent_inode->faked_ino;
5336 if (del)
5337 ino.ino = dn->inode->faked_ino;
5338 } else {
5339 dirino = dn->dir->parent_inode->vino();
5340 if (del)
5341 ino = dn->inode->vino();
5342 }
5343 if (!del)
5344 ino.ino = inodeno_t();
5345 }
5346 void finish(int r) override {
5347 // _async_dentry_invalidate is responsible for its own locking
9f95a23c 5348 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
5349 client->_async_dentry_invalidate(dirino, ino, name);
5350 }
5351};
5352
5353void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5354{
f67539c2
TL
5355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5356 if (!mref_reader.is_state_satisfied())
7c673cae 5357 return;
f67539c2 5358
11fdf7f2 5359 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae 5360 << " in dir " << dirino << dendl;
e306af50 5361 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
7c673cae
FG
5362}
5363
5364void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5365{
5366 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5367 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5368}
5369
5370void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5371{
b3b6e05e 5372 int ref = in->get_nref();
494da23a 5373 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
7c673cae
FG
5374
5375 if (in->dir && !in->dir->dentries.empty()) {
5376 for (auto p = in->dir->dentries.begin();
5377 p != in->dir->dentries.end(); ) {
5378 Dentry *dn = p->second;
5379 ++p;
5380 /* rmsnap removes whole subtree, need trim inodes recursively.
5381 * we don't need to invalidate dentries recursively. because
5382 * invalidating a directory dentry effectively invalidate
5383 * whole subtree */
5384 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5385 _try_to_trim_inode(dn->inode.get(), false);
5386
5387 if (dn->lru_is_expireable())
5388 unlink(dn, true, false); // keep dir, drop dentry
5389 }
5390 if (in->dir->dentries.empty()) {
5391 close_dir(in->dir);
5392 --ref;
5393 }
5394 }
5395
b3b6e05e 5396 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
7c673cae
FG
5397 InodeRef snapdir = open_snapdir(in);
5398 _try_to_trim_inode(snapdir.get(), false);
5399 --ref;
5400 }
5401
b3b6e05e 5402 if (ref > 1) {
11fdf7f2
TL
5403 auto q = in->dentries.begin();
5404 while (q != in->dentries.end()) {
5405 Dentry *dn = *q;
5406 ++q;
494da23a
TL
5407 if( in->ll_ref > 0 && sched_inval) {
5408 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5409 // so in->dentries doesn't always reflect the state of kernel's dcache.
5410 _schedule_invalidate_dentry_callback(dn, true);
5411 }
7c673cae
FG
5412 unlink(dn, true, true);
5413 }
5414 }
5415}
5416
11fdf7f2 5417void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5418{
5419 mds_rank_t mds = session->mds_num;
5420 int used = get_caps_used(in);
5421 int wanted = in->caps_wanted();
a4b75251 5422 int flags = 0;
7c673cae 5423
a8e16298
TL
5424 const unsigned new_caps = m->get_caps();
5425 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5426 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5427 << " mds." << mds << " seq " << m->get_seq()
5428 << " caps now " << ccap_string(new_caps)
a8e16298 5429 << " was " << ccap_string(cap->issued)
92f5a8d4 5430 << (was_stale ? " (stale)" : "") << dendl;
a8e16298
TL
5431
5432 if (was_stale)
5433 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5434 cap->seq = m->get_seq();
28e407b8 5435 cap->gen = session->cap_gen;
7c673cae 5436
11fdf7f2 5437 check_cap_issue(in, new_caps);
a8e16298 5438
7c673cae 5439 // update inode
1adf2230
AA
5440 int issued;
5441 in->caps_issued(&issued);
5442 issued |= in->caps_dirty();
7c673cae 5443
1adf2230
AA
5444 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5445 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5446 in->mode = m->head.mode;
5447 in->uid = m->head.uid;
5448 in->gid = m->head.gid;
5449 in->btime = m->btime;
5450 }
5451 bool deleted_inode = false;
1adf2230
AA
5452 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5453 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae 5454 in->nlink = m->head.nlink;
20effc67 5455 if (in->nlink == 0)
7c673cae
FG
5456 deleted_inode = true;
5457 }
1adf2230 5458 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5459 m->xattrbl.length() &&
5460 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5461 auto p = m->xattrbl.cbegin();
5462 decode(in->xattrs, p);
7c673cae
FG
5463 in->xattr_version = m->head.xattr_version;
5464 }
28e407b8
AA
5465
5466 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5467 in->dirstat.nfiles = m->get_nfiles();
5468 in->dirstat.nsubdirs = m->get_nsubdirs();
5469 }
5470
1adf2230
AA
5471 if (new_caps & CEPH_CAP_ANY_RD) {
5472 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5473 m->get_ctime(), m->get_mtime(), m->get_atime());
5474 }
5475
5476 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5477 in->layout = m->get_layout();
5478 update_inode_file_size(in, issued, m->get_size(),
5479 m->get_truncate_seq(), m->get_truncate_size());
5480 }
5481
5482 if (m->inline_version > in->inline_version) {
5483 in->inline_data = m->inline_data;
5484 in->inline_version = m->inline_version;
5485 }
5486
5487 /* always take a newer change attr */
5488 if (m->get_change_attr() > in->change_attr)
5489 in->change_attr = m->get_change_attr();
7c673cae
FG
5490
5491 // max_size
5492 if (cap == in->auth_cap &&
1adf2230
AA
5493 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5494 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5495 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5496 in->max_size = m->get_max_size();
5497 if (in->max_size > in->wanted_max_size) {
5498 in->wanted_max_size = 0;
5499 in->requested_max_size = 0;
5500 }
5501 }
5502
5503 bool check = false;
a8e16298
TL
5504 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5505 (wanted & ~(cap->wanted | new_caps))) {
5506 // If mds is importing cap, prior cap messages that update 'wanted'
5507 // may get dropped by mds (migrate seq mismatch).
5508 //
5509 // We don't send cap message to update 'wanted' if what we want are
5510 // already issued. If mds revokes caps, cap message that releases caps
5511 // also tells mds what we want. But if caps got revoked by mds forcedly
5512 // (session stale). We may haven't told mds what we want.
7c673cae 5513 check = true;
a8e16298 5514 }
7c673cae 5515
7c673cae
FG
5516
5517 // update caps
a8e16298 5518 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5519 if (revoked) {
5520 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5521 cap->issued = new_caps;
5522 cap->implemented |= new_caps;
5523
b32b8144
FG
5524 // recall delegations if we're losing caps necessary for them
5525 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5526 in->recall_deleg(false);
5527 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5528 in->recall_deleg(true);
5529
11fdf7f2
TL
5530 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5531 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5532 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5533 // waitin' for flush
11fdf7f2 5534 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
a4b75251
TL
5535 if (_release(in)) {
5536 check = true;
5537 flags = CHECK_CAPS_NODELAY;
5538 }
7c673cae
FG
5539 } else {
5540 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5541 check = true;
a4b75251 5542 flags = CHECK_CAPS_NODELAY;
7c673cae 5543 }
a8e16298
TL
5544 } else if (cap->issued == new_caps) {
5545 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5546 } else {
a8e16298 5547 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5548 cap->issued = new_caps;
5549 cap->implemented |= new_caps;
5550
5551 if (cap == in->auth_cap) {
5552 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5553 for (const auto &p : in->caps) {
5554 if (&p.second == cap)
7c673cae 5555 continue;
11fdf7f2 5556 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5557 check = true;
5558 break;
5559 }
5560 }
5561 }
5562 }
5563
5564 if (check)
a4b75251 5565 check_caps(in, flags);
7c673cae
FG
5566
5567 // wake up waiters
5568 if (new_caps)
5569 signal_cond_list(in->waitfor_caps);
5570
5571 // may drop inode's last ref
5572 if (deleted_inode)
5573 _try_to_trim_inode(in, true);
7c673cae
FG
5574}
5575
7c673cae
FG
// Check whether `perms` grants the `want` (MAY_READ/MAY_WRITE/MAY_EXEC)
// access bits on `in`, mirroring kernel generic_permission() semantics.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0) {
    // root bypasses mode checks, with one exception:
    // Executable are overridable when there is at least one exec bit set
    if((want & MAY_EXEC) && !(in->mode & S_IXUGO))
      return -CEPHFS_EACCES;
    return 0;
  }

  // Non-owner with group bits set: consult POSIX ACLs first.
  // _posix_acl_permission() returning -CEPHFS_EAGAIN means "no ACL
  // decision, fall through to the classic mode-bit check" — presumably
  // because no ACL xattr is present (TODO confirm in _posix_acl_permission).
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -CEPHFS_EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -CEPHFS_EACCES;
  return 0;
}
5596
5597int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5598 const UserPerm& perms)
5599{
5600 int r = _getattr_for_perm(in, perms);
5601 if (r < 0)
5602 goto out;
5603
5604 r = 0;
5605 if (strncmp(name, "system.", 7) == 0) {
5606 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
f67539c2 5607 r = -CEPHFS_EPERM;
7c673cae
FG
5608 } else {
5609 r = inode_permission(in, perms, want);
5610 }
5611out:
1adf2230 5612 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5613 return r;
5614}
5615
20effc67 5616std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
7c673cae
FG
5617 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5618 return out;
5619}
5620
// Validate that `perms` may apply the setattr described by (stx, mask)
// to `in`, following POSIX chown/chmod/utimes ownership rules.
// Returns 0 if allowed, -CEPHFS_EPERM/-CEPHFS_EACCES otherwise.
// May clear S_ISGID in stx->stx_mode as a side effect (see below).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // Truncate requires plain write permission.
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // From here on, any failed ownership test falls out with EPERM.
  r = -CEPHFS_EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // Only root may change uid; a non-root owner may only "change" it
    // to the value it already has.
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // Non-root owner may change gid only to a group they belong to,
    // or keep the current gid.
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod requires root or ownership.
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // POSIX: a non-root chmod by a caller not in the file's group
    // silently drops the setgid bit (mutates stx in place).
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // Non-owner: setting explicit timestamps is forbidden (EPERM),
      // but *_NOW ("touch to current time") only needs write access.
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5677
5678int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5679{
181888fb 5680 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5681 unsigned want = 0;
5682
5683 if ((flags & O_ACCMODE) == O_WRONLY)
5684 want = MAY_WRITE;
5685 else if ((flags & O_ACCMODE) == O_RDWR)
5686 want = MAY_READ | MAY_WRITE;
5687 else if ((flags & O_ACCMODE) == O_RDONLY)
5688 want = MAY_READ;
5689 if (flags & O_TRUNC)
5690 want |= MAY_WRITE;
5691
5692 int r = 0;
5693 switch (in->mode & S_IFMT) {
5694 case S_IFLNK:
f67539c2 5695 r = -CEPHFS_ELOOP;
7c673cae
FG
5696 goto out;
5697 case S_IFDIR:
5698 if (want & MAY_WRITE) {
f67539c2 5699 r = -CEPHFS_EISDIR;
7c673cae
FG
5700 goto out;
5701 }
5702 break;
5703 }
5704
5705 r = _getattr_for_perm(in, perms);
5706 if (r < 0)
5707 goto out;
5708
5709 r = inode_permission(in, perms, want);
5710out:
5711 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5712 return r;
5713}
5714
5715int Client::may_lookup(Inode *dir, const UserPerm& perms)
5716{
181888fb 5717 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5718 int r = _getattr_for_perm(dir, perms);
5719 if (r < 0)
5720 goto out;
5721
5722 r = inode_permission(dir, perms, MAY_EXEC);
5723out:
5724 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5725 return r;
5726}
5727
5728int Client::may_create(Inode *dir, const UserPerm& perms)
5729{
181888fb 5730 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5731 int r = _getattr_for_perm(dir, perms);
5732 if (r < 0)
5733 goto out;
5734
5735 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5736out:
5737 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5738 return r;
5739}
5740
5741int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5742{
181888fb 5743 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5744 int r = _getattr_for_perm(dir, perms);
5745 if (r < 0)
5746 goto out;
5747
5748 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5749 if (r < 0)
5750 goto out;
5751
f67539c2 5752 /* 'name == NULL' means rmsnap w/o permission checks */
7c673cae
FG
5753 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5754 InodeRef otherin;
5755 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5756 if (r < 0)
5757 goto out;
5758 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
f67539c2 5759 r = -CEPHFS_EPERM;
7c673cae
FG
5760 }
5761out:
5762 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5763 return r;
5764}
5765
f67539c2
TL
5766int Client::may_delete(const char *relpath, const UserPerm& perms) {
5767 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5768
5769 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5770 if (!mref_reader.is_state_satisfied())
5771 return -ENOTCONN;
5772
5773 filepath path(relpath);
5774 string name = path.last_dentry();
5775 path.pop_dentry();
5776 InodeRef dir;
5777
5778 std::scoped_lock lock(client_lock);
5779 int r = path_walk(path, &dir, perms);
5780 if (r < 0)
5781 return r;
5782 if (cct->_conf->client_permissions) {
5783 int r = may_delete(dir.get(), name.c_str(), perms);
5784 if (r < 0)
5785 return r;
5786 }
5787
5788 return 0;
5789}
5790
7c673cae
FG
// Permission check for hardlinking to `in`, modelled on the kernel's
// protected_hardlinks behavior: root and the owner may always link;
// otherwise the target must be a regular, non-setuid file that is not
// group-executable-setgid, and the caller needs read+write access.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // Owner (or root) may always create the link.
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  // Default to EPERM; each test below bails out with it on failure.
  r = -CEPHFS_EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  // Refuse to link setuid binaries owned by someone else.
  if (in->mode & S_ISUID)
    goto out;

  // Refuse setgid+group-exec files (setgid executables).
  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5818
5819int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5820{
5821 int mask = CEPH_STAT_CAP_MODE;
5822 bool force = false;
5823 if (acl_type != NO_ACL) {
5824 mask |= CEPH_STAT_CAP_XATTR;
5825 force = in->xattr_version == 0;
5826 }
5827 return _getattr(in, mask, perms, force);
5828}
5829
5830vinodeno_t Client::_get_vino(Inode *in)
5831{
5832 /* The caller must hold the client lock */
5833 return vinodeno_t(in->ino, in->snapid);
5834}
5835
7c673cae
FG
5836/**
5837 * Resolve an MDS spec to a list of MDS daemon GIDs.
5838 *
5839 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5840 * It may be '*' in which case it matches all GIDs.
5841 *
5842 * If no error is returned, the `targets` vector will be populated with at least
5843 * one MDS.
5844 */
5845int Client::resolve_mds(
5846 const std::string &mds_spec,
5847 std::vector<mds_gid_t> *targets)
5848{
11fdf7f2
TL
5849 ceph_assert(fsmap);
5850 ceph_assert(targets != nullptr);
7c673cae
FG
5851
5852 mds_role_t role;
f67539c2
TL
5853 CachedStackStringStream css;
5854 int role_r = fsmap->parse_role(mds_spec, &role, *css);
7c673cae
FG
5855 if (role_r == 0) {
5856 // We got a role, resolve it to a GID
f67539c2
TL
5857 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5858 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5859 << role << "' aka " << info.human_name() << dendl;
5860 targets->push_back(info.global_id);
7c673cae
FG
5861 return 0;
5862 }
5863
5864 std::string strtol_err;
5865 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5866 if (strtol_err.empty()) {
5867 // It is a possible GID
5868 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5869 if (fsmap->gid_exists(mds_gid)) {
f67539c2
TL
5870 auto& info = fsmap->get_info_gid(mds_gid);
5871 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5872 << info.human_name() << dendl;
7c673cae 5873 targets->push_back(mds_gid);
f67539c2 5874 return 0;
7c673cae 5875 } else {
f67539c2 5876 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
7c673cae 5877 << dendl;
f67539c2
TL
5878 lderr(cct) << "FSMap: " << *fsmap << dendl;
5879 return -CEPHFS_ENOENT;
7c673cae
FG
5880 }
5881 } else if (mds_spec == "*") {
5882 // It is a wildcard: use all MDSs
f67539c2 5883 const auto& mds_info = fsmap->get_mds_info();
7c673cae 5884
f67539c2 5885 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
7c673cae 5886 if (mds_info.empty()) {
f67539c2
TL
5887 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5888 lderr(cct) << "FSMap: " << *fsmap << dendl;
5889 return -CEPHFS_ENOENT;
7c673cae
FG
5890 }
5891
f67539c2
TL
5892 for (const auto& [gid, info] : mds_info) {
5893 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
5894 targets->push_back(gid);
7c673cae 5895 }
f67539c2 5896 return 0;
7c673cae
FG
5897 } else {
5898 // It did not parse as an integer, it is not a wildcard, it must be a name
5899 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5900 if (mds_gid == 0) {
f67539c2 5901 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
7c673cae 5902 lderr(cct) << "FSMap: " << *fsmap << dendl;
f67539c2 5903 return -CEPHFS_ENOENT;
7c673cae 5904 } else {
f67539c2
TL
5905 auto& info = fsmap->get_info_gid(mds_gid);
5906 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
5907 << "' to " << info.human_name() << dendl;
7c673cae
FG
5908 targets->push_back(mds_gid);
5909 }
f67539c2 5910 return 0;
7c673cae 5911 }
7c673cae
FG
5912}
5913
5914
5915/**
5916 * Authenticate with mon and establish global ID
5917 */
5918int Client::authenticate()
5919{
9f95a23c 5920 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
5921
5922 if (monclient->is_authenticated()) {
5923 return 0;
5924 }
5925
9f95a23c 5926 client_lock.unlock();
7c673cae 5927 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
9f95a23c 5928 client_lock.lock();
7c673cae
FG
5929 if (r < 0) {
5930 return r;
5931 }
5932
5933 whoami = monclient->get_global_id();
5934 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5935
5936 return 0;
5937}
5938
// Fetch the latest FSMap (user==true: the trimmed "fsmap.user"
// variant; otherwise the full "fsmap") via a monitor subscription,
// blocking until an epoch at least as new as the monitor's current
// version has arrived.  Caller must hold client_lock, which is
// dropped around the blocking get_version() call.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // get_version() blocks; don't hold client_lock across it.
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again); // transient EAGAIN: retry

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // Subscribe (one-shot) and wait until the incoming map handler
    // stores a sufficiently new fsmap_user and signals the waitlist.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5983
5984/**
5985 *
5986 * @mds_spec one of ID, rank, GID, "*"
5987 *
5988 */
5989int Client::mds_command(
5990 const std::string &mds_spec,
5991 const vector<string>& cmd,
5992 const bufferlist& inbl,
5993 bufferlist *outbl,
5994 string *outs,
5995 Context *onfinish)
5996{
f67539c2
TL
5997 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
5998 if (!iref_reader.is_state_satisfied())
5999 return -CEPHFS_ENOTCONN;
7c673cae 6000
f67539c2 6001 std::unique_lock cl(client_lock);
7c673cae
FG
6002
6003 int r;
6004 r = authenticate();
6005 if (r < 0) {
6006 return r;
6007 }
6008
6009 r = fetch_fsmap(false);
6010 if (r < 0) {
6011 return r;
6012 }
6013
6014 // Look up MDS target(s) of the command
6015 std::vector<mds_gid_t> targets;
6016 r = resolve_mds(mds_spec, &targets);
6017 if (r < 0) {
6018 return r;
6019 }
6020
6021 // If daemons are laggy, we won't send them commands. If all
6022 // are laggy then we fail.
6023 std::vector<mds_gid_t> non_laggy;
f67539c2 6024 for (const auto& gid : targets) {
7c673cae
FG
6025 const auto info = fsmap->get_info_gid(gid);
6026 if (!info.laggy()) {
6027 non_laggy.push_back(gid);
6028 }
6029 }
6030 if (non_laggy.size() == 0) {
6031 *outs = "All targeted MDS daemons are laggy";
f67539c2 6032 return -CEPHFS_ENOENT;
7c673cae
FG
6033 }
6034
6035 if (metadata.empty()) {
6036 // We are called on an unmounted client, so metadata
6037 // won't be initialized yet.
6038 populate_metadata("");
6039 }
6040
6041 // Send commands to targets
6042 C_GatherBuilder gather(cct, onfinish);
f67539c2 6043 for (const auto& target_gid : non_laggy) {
7c673cae
FG
6044 const auto info = fsmap->get_info_gid(target_gid);
6045
6046 // Open a connection to the target MDS
11fdf7f2 6047 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
7c673cae 6048
f67539c2
TL
6049 cl.unlock();
6050 {
6051 std::scoped_lock cmd_lock(command_lock);
6052 // Generate MDSCommandOp state
6053 auto &op = command_table.start_command();
7c673cae 6054
f67539c2
TL
6055 op.on_finish = gather.new_sub();
6056 op.cmd = cmd;
6057 op.outbl = outbl;
6058 op.outs = outs;
6059 op.inbl = inbl;
6060 op.mds_gid = target_gid;
6061 op.con = conn;
7c673cae 6062
f67539c2
TL
6063 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6064 << " tid=" << op.tid << cmd << dendl;
7c673cae 6065
f67539c2
TL
6066 // Construct and send MCommand
6067 MessageRef m = op.get_message(monclient->get_fsid());
6068 conn->send_message2(std::move(m));
6069 }
6070 cl.lock();
7c673cae
FG
6071 }
6072 gather.activate();
6073
6074 return 0;
6075}
6076
11fdf7f2 6077void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
6078{
6079 ceph_tid_t const tid = m->get_tid();
6080
6081 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6082
f67539c2 6083 std::scoped_lock cmd_lock(command_lock);
7c673cae
FG
6084 if (!command_table.exists(tid)) {
6085 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
6086 return;
6087 }
6088
6089 auto &op = command_table.get_command(tid);
6090 if (op.outbl) {
11fdf7f2 6091 *op.outbl = m->get_data();
7c673cae
FG
6092 }
6093 if (op.outs) {
6094 *op.outs = m->rs;
6095 }
6096
6097 if (op.on_finish) {
6098 op.on_finish->complete(m->r);
6099 }
6100
6101 command_table.erase(tid);
7c673cae
FG
6102}
6103
6104// -------------------
6105// MOUNT
6106
11fdf7f2 6107int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 6108{
7c673cae
FG
6109 int r = authenticate();
6110 if (r < 0) {
6111 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6112 return r;
6113 }
6114
11fdf7f2
TL
6115 std::string resolved_fs_name;
6116 if (fs_name.empty()) {
9f95a23c
TL
6117 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6118 if (resolved_fs_name.empty())
6119 // Try the backwards compatibility fs name option
6120 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
6121 } else {
6122 resolved_fs_name = fs_name;
6123 }
6124
7c673cae 6125 std::string want = "mdsmap";
11fdf7f2 6126 if (!resolved_fs_name.empty()) {
7c673cae
FG
6127 r = fetch_fsmap(true);
6128 if (r < 0)
6129 return r;
11fdf7f2
TL
6130 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6131 if (fscid == FS_CLUSTER_ID_NONE) {
f67539c2 6132 return -CEPHFS_ENOENT;
11fdf7f2 6133 }
7c673cae
FG
6134
6135 std::ostringstream oss;
11fdf7f2 6136 oss << want << "." << fscid;
7c673cae
FG
6137 want = oss.str();
6138 }
6139 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6140
6141 monclient->sub_want(want, 0, 0);
6142 monclient->renew_subs();
6143
11fdf7f2
TL
6144 return 0;
6145}
6146
// Mount the filesystem: subscribe to the mdsmap, optionally wait for
// the MDS cluster to become available, then walk from the mount root
// up to "/" issuing GETATTRs so we hold the whole ancestor chain
// (needed for quota enforcement).  Idempotent: a second concurrent or
// repeated call returns 0 immediately.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Block until the MDS cluster can serve us, or fail hard if the
    // map says it never will.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root, so
  // quota roots along the path are known.  EACCES above the mount
  // point is tolerated (quotas may just not work).
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6243
6244// UNMOUNT
6245
// Close every MDS session: drop already-rejected sessions outright,
// request close on the rest, and wait (bounded by
// client_shutdown_timeout) for the close acks; sessions that never
// respond are force-closed with ETIMEDOUT.  Called with client_lock
// held (it is temporarily adopted into a unique_lock for the condvar
// wait and released back un-owned).
void Client::_close_sessions()
{
  // Rejected sessions will never ack a close; drop them first.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
	_close_mds_session(p.second.get());
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Wrap the already-held client_lock so mount_cond can wait on it;
    // l.release() below hands ownership back without unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait forever
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6284
522d829b
TL
6285void Client::flush_mdlog_sync(Inode *in)
6286{
6287 if (in->unsafe_ops.empty()) {
6288 return;
6289 }
6290
6291 std::set<mds_rank_t> anchor;
6292 for (auto &&p : in->unsafe_ops) {
6293 anchor.emplace(p->mds);
6294 }
6295 if (in->auth_cap) {
6296 anchor.emplace(in->auth_cap->session->mds_num);
6297 }
6298
6299 for (auto &rank : anchor) {
6300 auto session = &mds_sessions.at(rank);
20effc67 6301 flush_mdlog(session->get());
522d829b
TL
6302 }
6303}
6304
31f18b77
FG
6305void Client::flush_mdlog_sync()
6306{
522d829b 6307 if (mds_requests.empty())
31f18b77 6308 return;
11fdf7f2 6309 for (auto &p : mds_sessions) {
20effc67 6310 flush_mdlog(p.second.get());
31f18b77
FG
6311 }
6312}
6313
// Send a CEPH_SESSION_REQUEST_FLUSH_MDLOG to one MDS session.
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
6324
6325
11fdf7f2
TL
// Abort all in-flight MDS requests with `err` and force-close every
// MDS session.  Used when the connection to the cluster is being torn
// down (abort/blocklist), not for a clean unmount.
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // Wake the thread blocked in make_request() so it observes the abort.
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto session = mds_sessions.begin()->second;
    _closed_mds_session(session.get(), err);
  }
}
6353
// Tear the client down.  With abort==true (or while blocklisted) all
// pending work is discarded; otherwise requests, dirty caps and
// buffered data are flushed first.  The teardown order matters:
// state transition -> quiesce requests -> close fds/dirs -> flush or
// purge cached data -> drain the inode cache -> stop the tick thread
// -> close sessions.
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for every outstanding MDS request to complete or abort.
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
	// No point flushing: the data can no longer be written back.
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // Drop (rather than flush) dirty caps; their inode refs are
    // released via put_inode().
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
	Inode *in = *p;
	++p;
	if (in->dirty_caps) {
	  ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	  in->mark_caps_clean();
	  put_inode(in);
	}
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait for the cache to fully drain (caps being released by the
  // MDS); dump it every 5s while stuck to aid debugging.
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  mref_writer.update_state(CLIENT_UNMOUNTED);

  ldout(cct, 2) << "unmounted." << dendl;
}
6500
b32b8144
FG
6501void Client::unmount()
6502{
11fdf7f2
TL
6503 _unmount(false);
6504}
6505
// Abortive unmount: discard pending requests and dirty state instead
// of flushing (used when the cluster connection is unusable).
void Client::abort_conn()
{
  _unmount(true);
}
6510
7c673cae
FG
// Send each session's batched cap-release message to its MDS (when
// that MDS is in a state that can process it) and decrement the
// pinned-caps counter by the number of caps released.
void Client::flush_cap_releases()
{
  uint64_t nr_caps = 0;

  // send any cap releases
  for (auto &p : mds_sessions) {
    auto session = p.second;
    if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      nr_caps += session->release->caps.size();
      if (cct->_conf->client_inject_release_failure) {
	// Test hook: pretend the message was lost.
	ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
	session->con->send_message2(std::move(session->release));
      }
      // Either way the batched message is consumed.
      session->release.reset();
    }
  }

  if (nr_caps > 0) {
    dec_pinned_icaps(nr_caps);
  }
}
6534
f67539c2 6535void Client::renew_and_flush_cap_releases()
7c673cae 6536{
f67539c2
TL
6537 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6538
6539 if (!mount_aborted && mdsmap->get_epoch()) {
6540 // renew caps?
6541 utime_t el = ceph_clock_now() - last_cap_renew;
6542 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6543 renew_caps();
6544
6545 flush_cap_releases();
7c673cae 6546 }
f67539c2
TL
6547}
6548
// Periodic housekeeping, driven by the upkeep thread (see
// start_tick_thread): time out mount-phase requests, renew caps and
// flush releases, process delay-queued cap checks, send metrics, trim
// the cache, and auto-reconnect after a blocklist if configured.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  utime_t now = ceph_clock_now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // Requests are tid-ordered, so the first is also the oldest.
    MetaRequest *req = mds_requests.begin()->second;

    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps: inodes whose cap check was postponed; the list is
  // ordered by hold_caps_until, so stop at the first not-yet-due entry.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // Auto-reconnect a blocklisted client at most every 30 minutes,
  // invalidating all open files via the fd generation counter.
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + 30 * 60 < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6604
f67539c2
TL
// Spawn the upkeep thread that calls tick() roughly every
// client_tick_interval seconds (client_debug_inject_tick_delay can
// stretch it for testing) until tick_thread_stopped is set and
// upkeep_cond is signalled (see _unmount).  The thread runs with
// client_lock held except while waiting on the condvar.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Allow a 10% early wakeup so condvar jitter doesn't skip ticks.
      if (likely(since >= interval*.90)) {
	tick();
	last_tick = clock::now();
      } else {
	// Woke early: sleep only the remaining time.
	interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
	upkeep_cond.wait_for(cl, interval);
    }
  });
}
6635
// Gather client perf metrics and forward them to the MDS.
// Precondition: caller holds client_lock (asserted below).
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6645
6646void Client::collect_and_send_global_metrics() {
6647 ldout(cct, 20) << __func__ << dendl;
6648 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6649
6650 if (!have_open_session((mds_rank_t)0)) {
6651 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6652 << dendl;
6653 return;
6654 }
6655 auto session = _get_or_open_mds_session((mds_rank_t)0);
6656 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6657 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6658 return;
6659 }
6660
6661 ClientMetricMessage metric;
6662 std::vector<ClientMetricMessage> message;
6663
6664 // read latency
6665 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6666 message.push_back(metric);
6667
6668 // write latency
6669 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6670 message.push_back(metric);
6671
6672 // metadata latency
6673 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6674 message.push_back(metric);
6675
6676 // cap hit ratio -- nr_caps is unused right now
6677 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6678 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6679 message.push_back(metric);
6680
6681 // dentry lease hit ratio
6682 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6683 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6684 message.push_back(metric);
6685
6686 // opened files
6687 {
6688 auto [opened_files, total_inodes] = get_opened_files_rates();
6689 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6690 }
6691 message.push_back(metric);
6692
6693 // pinned i_caps
6694 {
6695 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6696 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6697 }
6698 message.push_back(metric);
6699
6700 // opened inodes
6701 {
6702 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6703 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6704 }
6705 message.push_back(metric);
6706
a4b75251
TL
6707 // read io sizes
6708 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6709 total_read_size));
6710 message.push_back(metric);
6711
6712 // write io sizes
6713 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6714 total_write_size));
6715 message.push_back(metric);
6716
f67539c2
TL
6717 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6718}
6719
7c673cae
FG
6720void Client::renew_caps()
6721{
6722 ldout(cct, 10) << "renew_caps()" << dendl;
6723 last_cap_renew = ceph_clock_now();
6724
11fdf7f2
TL
6725 for (auto &p : mds_sessions) {
6726 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6727 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
20effc67 6728 renew_caps(p.second.get());
7c673cae
FG
6729 }
6730}
6731
// Send a RENEWCAPS request to one MDS session, bumping the session's cap
// renewal sequence number so the eventual reply can be matched up.
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
6739
6740
6741// ===============================================================
6742// high level (POSIXy) interface
6743
// Issue a synchronous LOOKUP (or LOOKUPSNAP inside a snapshot directory)
// to the MDS for `name` under `dir`, requesting at least `mask` caps on
// the result. On success *target references the found inode; returns 0 or
// a negative CEPHFS error.
int Client::_do_lookup(Inode *dir, const string& name, int mask,
                       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // debugging aid: ask the MDS to verify the caps it grants for lookups
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
6764
f67539c2
TL
6765bool Client::_dentry_valid(const Dentry *dn)
6766{
6767 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6768
6769 // is dn lease valid?
6770 utime_t now = ceph_clock_now();
6771 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6772 mds_sessions.count(dn->lease_mds)) {
20effc67
TL
6773 auto s = mds_sessions.at(dn->lease_mds);
6774 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
f67539c2
TL
6775 dlease_hit();
6776 return true;
6777 }
6778
20effc67 6779 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
f67539c2
TL
6780 << " vs lease_gen " << dn->lease_gen << dendl;
6781 }
6782
6783 dlease_miss();
6784 return false;
6785}
6786
// Resolve a single path component `dname` under `dir`, preferring cached
// dentries (validated by lease or by the directory's shared cap) and
// falling back to one MDS lookup. `mask` restricts which caps must be
// issued on the resulting inode for the cache to be trusted. Handles the
// special names "." and "..", the snapdir, and local ENOENT conclusions
// for I_COMPLETE directories. On success *target (and, when requested,
// *alternate_name) are filled in.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  // set after the single permitted _do_lookup() round trip so the
  // relookup pass cannot loop forever
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no cached parent link; ask any MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
        *target = std::move(tmptarget);
        ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
        // fall back to the directory itself (e.g. ".." at the root)
        *target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // the virtual snapshot directory (e.g. ".snap")
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
                   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
        // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
        // make trim_caps() behave.
        dir->try_touch_cap(dn->lease_mds);
        goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        // a matching shared generation means our cached view of the
        // directory is still current
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        // null dentry + complete directory contents => definitive ENOENT
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -CEPHFS_ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6907
6908int Client::get_or_create(Inode *dir, const char* name,
6909 Dentry **pdn, bool expect_null)
6910{
6911 // lookup
11fdf7f2 6912 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6913 dir->open_dir();
6914 if (dir->dir->dentries.count(name)) {
6915 Dentry *dn = dir->dir->dentries[name];
f67539c2
TL
6916 if (_dentry_valid(dn)) {
6917 if (expect_null)
6918 return -CEPHFS_EEXIST;
7c673cae
FG
6919 }
6920 *pdn = dn;
6921 } else {
6922 // otherwise link up a new one
6923 *pdn = link(dir->dir, name, NULL, NULL);
6924 }
6925
6926 // success
6927 return 0;
6928}
6929
f67539c2
TL
// Public entry point: resolve `path` to a walk_dentry_result (terminal
// inode plus its alternate name). Fails with -CEPHFS_ENOTCONN unless the
// client is at least mounting; holds client_lock for the whole walk.
int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << __func__ << ": " << path << dendl;

  std::scoped_lock lock(client_lock);

  return path_walk(path, wdr, perms, followsym);
}
6942
7c673cae 6943int Client::path_walk(const filepath& origpath, InodeRef *end,
b3b6e05e 6944 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
f67539c2
TL
6945{
6946 walk_dentry_result wdr;
b3b6e05e 6947 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
f67539c2
TL
6948 *end = std::move(wdr.in);
6949 return rc;
6950}
6951
b3b6e05e
TL
// Core path resolution. Walks `origpath` component by component starting
// from the root (absolute paths), the cwd, or `dirinode` when provided.
// Permission checks run per component when client_permissions is enabled;
// `mask` adds extra requested caps on the final component. Symlinks in the
// middle of a path are always followed; a trailing symlink only when
// `followsym`, with MAXSYMLINKS bounding total expansions. On success,
// fills *result with the terminal inode and its alternate name.
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
                      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        // an absolute target restarts resolution from the root
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to. remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7040
7041
7042// namespace ops
7043
// POSIX link(2): create a hard link at relpath pointing at the inode named
// by relexisting. Directories may not be hard-linked. alternate_name, if
// non-empty, is stored on the new dentry.
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  filepath existing(relexisting);

  InodeRef in, dir;

  std::scoped_lock lock(client_lock);
  // resolve the existing file (following a trailing symlink)
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  // the link target path cannot be the root
  if (std::string(relpath) == "/") {
    r = -CEPHFS_EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  // resolve the parent directory of the new link
  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      r = -CEPHFS_EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
  return r;
}
7088
// POSIX unlink(2): remove the non-directory entry at relpath, resolved
// relative to the current working directory.
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
}
7093
// POSIX unlinkat(2): remove the entry named by relpath, resolved relative
// to dirfd (CEPHFS_AT_FDCWD means the cwd). With AT_REMOVEDIR in flags the
// target must be a directory (rmdir semantics); otherwise a non-directory
// (unlink semantics).
int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << flags << std::endl;

  // the root cannot be removed
  if (std::string(relpath) == "/") {
    return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // walk to the parent directory of the victim entry
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0) {
      return r;
    }
  }
  if (flags & AT_REMOVEDIR) {
    r = _rmdir(dir.get(), name.c_str(), perm);
  } else {
    r = _unlink(dir.get(), name.c_str(), perm);
  }
  return r;
}
7140
f67539c2 7141int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7c673cae 7142{
f67539c2
TL
7143 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7144 if (!mref_reader.is_state_satisfied())
7145 return -CEPHFS_ENOTCONN;
7146
11fdf7f2 7147 tout(cct) << __func__ << std::endl;
7c673cae
FG
7148 tout(cct) << relfrom << std::endl;
7149 tout(cct) << relto << std::endl;
7150
7151 if (std::string(relfrom) == "/" || std::string(relto) == "/")
f67539c2 7152 return -CEPHFS_EBUSY;
7c673cae
FG
7153
7154 filepath from(relfrom);
7155 filepath to(relto);
7156 string fromname = from.last_dentry();
7157 from.pop_dentry();
7158 string toname = to.last_dentry();
7159 to.pop_dentry();
7160
7161 InodeRef fromdir, todir;
f67539c2
TL
7162
7163 std::scoped_lock lock(client_lock);
7c673cae
FG
7164 int r = path_walk(from, &fromdir, perm);
7165 if (r < 0)
7166 goto out;
7167 r = path_walk(to, &todir, perm);
7168 if (r < 0)
7169 goto out;
7170
7171 if (cct->_conf->client_permissions) {
7172 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7173 if (r < 0)
7174 return r;
7175 r = may_delete(todir.get(), toname.c_str(), perm);
f67539c2 7176 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
7177 return r;
7178 }
f67539c2 7179 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7180out:
7181 return r;
7182}
7183
7184// dirs
7185
f67539c2 7186int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
b3b6e05e
TL
7187{
7188 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7189}
7190
// POSIX mkdirat(2): create a directory named by relpath, resolved relative
// to dirfd (CEPHFS_AT_FDCWD means the cwd). alternate_name, if non-empty,
// is stored on the new dentry.
int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
                    std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << __func__ << ": " << relpath << dendl;

  // the root always exists
  if (std::string(relpath) == "/") {
    return -CEPHFS_EEXIST;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // walk to the would-be parent directory
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0) {
      return r;
    }
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
}
7233
// `mkdir -p` semantics: create every missing component of relpath. First
// walks the already-existing prefix, then creates the remaining components
// one level at a time. A racing creator is tolerated: -CEPHFS_EEXIST on an
// intermediate component falls back to a lookup.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only a missing component may be created; any other error (including a
  // permission failure above) is returned as-is. r == 0 means the whole
  // path already exists.
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      // someone else created this intermediate dir first; look it up
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7290
// POSIX rmdir(2): remove the (empty) directory at relpath, implemented via
// unlinkat with AT_REMOVEDIR.
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
}
7295
7296int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
f67539c2
TL
7297{
7298 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7299 if (!mref_reader.is_state_satisfied())
7300 return -CEPHFS_ENOTCONN;
7301
11fdf7f2 7302 tout(cct) << __func__ << std::endl;
7c673cae
FG
7303 tout(cct) << relpath << std::endl;
7304 tout(cct) << mode << std::endl;
7305 tout(cct) << rdev << std::endl;
7306
7307 if (std::string(relpath) == "/")
f67539c2 7308 return -CEPHFS_EEXIST;
7c673cae
FG
7309
7310 filepath path(relpath);
7311 string name = path.last_dentry();
7312 path.pop_dentry();
7313 InodeRef dir;
f67539c2
TL
7314
7315 std::scoped_lock lock(client_lock);
7c673cae
FG
7316 int r = path_walk(path, &dir, perms);
7317 if (r < 0)
7318 return r;
7319 if (cct->_conf->client_permissions) {
7320 int r = may_create(dir.get(), perms);
7321 if (r < 0)
7322 return r;
7323 }
7324 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7325}
7326
7327// symlinks
7328
f67539c2 7329int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
b3b6e05e
TL
7330{
7331 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7332}
7333
7334int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7335 std::string alternate_name)
7c673cae 7336{
f67539c2 7337 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7338 if (!mref_reader.is_state_satisfied()) {
f67539c2 7339 return -CEPHFS_ENOTCONN;
b3b6e05e 7340 }
f67539c2 7341
11fdf7f2 7342 tout(cct) << __func__ << std::endl;
7c673cae 7343 tout(cct) << target << std::endl;
b3b6e05e 7344 tout(cct) << dirfd << std::endl;
7c673cae
FG
7345 tout(cct) << relpath << std::endl;
7346
b3b6e05e 7347 if (std::string(relpath) == "/") {
f67539c2 7348 return -CEPHFS_EEXIST;
b3b6e05e 7349 }
7c673cae
FG
7350
7351 filepath path(relpath);
7352 string name = path.last_dentry();
7353 path.pop_dentry();
7354 InodeRef dir;
f67539c2
TL
7355
7356 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7357
7358 InodeRef dirinode;
7359 int r = get_fd_inode(dirfd, &dirinode);
7360 if (r < 0) {
7c673cae 7361 return r;
b3b6e05e
TL
7362 }
7363 r = path_walk(path, &dir, perms, true, 0, dirinode);
7364 if (r < 0) {
7365 return r;
7366 }
7c673cae
FG
7367 if (cct->_conf->client_permissions) {
7368 int r = may_create(dir.get(), perms);
b3b6e05e 7369 if (r < 0) {
7c673cae 7370 return r;
b3b6e05e 7371 }
7c673cae 7372 }
f67539c2 7373 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7c673cae
FG
7374}
7375
// POSIX readlink(2): copy the target of the symlink at relpath into buf
// (at most `size` bytes, no NUL terminator). Returns bytes copied or a
// negative CEPHFS error.
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
}
7380
// POSIX readlinkat(2): resolve relpath relative to dirfd WITHOUT following
// the trailing symlink, then copy its target into buf (at most `size`
// bytes, no NUL terminator). Returns bytes copied or a negative error.
int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  InodeRef dirinode;
  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  InodeRef in;
  filepath path(relpath);
  // followsym=false: we want the link itself, not what it points at
  r = path_walk(path, &in, perms, false, 0, dirinode);
  if (r < 0) {
    return r;
  }

  return _readlink(in.get(), buf, size);
}
7407
7408int Client::_readlink(Inode *in, char *buf, size_t size)
7409{
7410 if (!in->is_symlink())
f67539c2 7411 return -CEPHFS_EINVAL;
7c673cae
FG
7412
7413 // copy into buf (at most size bytes)
7414 int r = in->symlink.length();
7415 if (r > (int)size)
7416 r = size;
7417 memcpy(buf, in->symlink.c_str(), r);
7418 return r;
7419}
7420
7421
7422// inode stuff
7423
7424int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7425{
94b18763 7426 bool yes = in->caps_issued_mask(mask, true);
7c673cae 7427
11fdf7f2 7428 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
7429 if (yes && !force)
7430 return 0;
7431
7432 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7433 filepath path;
7434 in->make_nosnap_relative_path(path);
7435 req->set_filepath(path);
7436 req->set_inode(in);
7437 req->head.args.getattr.mask = mask;
7438
7439 int res = make_request(req, perms);
11fdf7f2 7440 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
7441 return res;
7442}
7443
7444int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7445 const UserPerm& perms, InodeRef *inp)
7446{
7447 int issued = in->caps_issued();
20effc67
TL
7448 union ceph_mds_request_args args;
7449 bool kill_sguid = false;
7450 int inode_drop = 0;
7c673cae 7451
11fdf7f2 7452 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
7453 ccap_string(issued) << dendl;
7454
7455 if (in->snapid != CEPH_NOSNAP) {
f67539c2 7456 return -CEPHFS_EROFS;
7c673cae
FG
7457 }
7458 if ((mask & CEPH_SETATTR_SIZE) &&
f67539c2
TL
7459 (uint64_t)stx->stx_size > in->size &&
7460 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7c673cae 7461 perms)) {
f67539c2 7462 return -CEPHFS_EDQUOT;
7c673cae
FG
7463 }
7464
20effc67
TL
7465 memset(&args, 0, sizeof(args));
7466
7c673cae
FG
7467 // make the change locally?
7468 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7469 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7470 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7471 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7472 << in->cap_dirtier_gid << ", forcing sync setattr"
7473 << dendl;
7474 /*
7475 * This works because we implicitly flush the caps as part of the
7476 * request, so the cap update check will happen with the writeback
7477 * cap context, and then the setattr check will happen with the
7478 * caller's context.
7479 *
7480 * In reality this pattern is likely pretty rare (different users
7481 * setattr'ing the same file). If that turns out not to be the
7482 * case later, we can build a more complex pipelined cap writeback
7483 * infrastructure...
7484 */
20effc67 7485 mask |= CEPH_SETATTR_CTIME;
7c673cae
FG
7486 }
7487
7488 if (!mask) {
7489 // caller just needs us to bump the ctime
7490 in->ctime = ceph_clock_now();
7491 in->cap_dirtier_uid = perms.uid();
7492 in->cap_dirtier_gid = perms.gid();
7493 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 7494 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 7495 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 7496 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 7497 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 7498 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
7499 else
7500 mask |= CEPH_SETATTR_CTIME;
7501 }
7502
7503 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
20effc67 7504 kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7c673cae
FG
7505
7506 mask &= ~CEPH_SETATTR_KILL_SGUID;
20effc67
TL
7507 } else if (mask & CEPH_SETATTR_SIZE) {
7508 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7509 mask |= CEPH_SETATTR_KILL_SGUID;
7510 inode_drop |= CEPH_CAP_AUTH_SHARED;
7511 }
7512
7513 if (mask & CEPH_SETATTR_UID) {
7514 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7c673cae 7515
20effc67 7516 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
7517 in->ctime = ceph_clock_now();
7518 in->cap_dirtier_uid = perms.uid();
7519 in->cap_dirtier_gid = perms.gid();
7520 in->uid = stx->stx_uid;
28e407b8 7521 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7522 mask &= ~CEPH_SETATTR_UID;
7523 kill_sguid = true;
20effc67
TL
7524 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7525 in->uid != stx->stx_uid) {
7526 args.setattr.uid = stx->stx_uid;
7527 inode_drop |= CEPH_CAP_AUTH_SHARED;
7528 } else {
7529 mask &= ~CEPH_SETATTR_UID;
7c673cae 7530 }
20effc67
TL
7531 }
7532
7533 if (mask & CEPH_SETATTR_GID) {
7534 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7535
7536 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
7537 in->ctime = ceph_clock_now();
7538 in->cap_dirtier_uid = perms.uid();
7539 in->cap_dirtier_gid = perms.gid();
7540 in->gid = stx->stx_gid;
28e407b8 7541 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7542 mask &= ~CEPH_SETATTR_GID;
7543 kill_sguid = true;
20effc67
TL
7544 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7545 in->gid != stx->stx_gid) {
7546 args.setattr.gid = stx->stx_gid;
7547 inode_drop |= CEPH_CAP_AUTH_SHARED;
7548 } else {
7549 mask &= ~CEPH_SETATTR_GID;
7c673cae 7550 }
20effc67 7551 }
7c673cae 7552
20effc67
TL
7553 if (mask & CEPH_SETATTR_MODE) {
7554 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7555
7556 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
7557 in->ctime = ceph_clock_now();
7558 in->cap_dirtier_uid = perms.uid();
7559 in->cap_dirtier_gid = perms.gid();
7560 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 7561 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 7562 mask &= ~CEPH_SETATTR_MODE;
20effc67
TL
7563 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7564 in->mode != stx->stx_mode) {
7565 args.setattr.mode = stx->stx_mode;
7566 inode_drop |= CEPH_CAP_AUTH_SHARED;
7567 } else {
7568 mask &= ~CEPH_SETATTR_MODE;
7c673cae 7569 }
20effc67
TL
7570 } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
7571 kill_sguid && S_ISREG(in->mode) &&
7572 (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7573 /* Must squash the any setuid/setgid bits with an ownership change */
7574 in->mode &= ~(S_ISUID|S_ISGID);
7575 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7576 }
7577
7578 if (mask & CEPH_SETATTR_BTIME) {
7579 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7c673cae 7580
20effc67 7581 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
7582 in->ctime = ceph_clock_now();
7583 in->cap_dirtier_uid = perms.uid();
7584 in->cap_dirtier_gid = perms.gid();
7585 in->btime = utime_t(stx->stx_btime);
28e407b8 7586 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 7587 mask &= ~CEPH_SETATTR_BTIME;
20effc67
TL
7588 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7589 in->btime != utime_t(stx->stx_btime)) {
7590 args.setattr.btime = utime_t(stx->stx_btime);
7591 inode_drop |= CEPH_CAP_AUTH_SHARED;
7592 } else {
7593 mask &= ~CEPH_SETATTR_BTIME;
7594 }
7595 }
7596
7597 if (mask & CEPH_SETATTR_SIZE) {
7598 if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
7599 //too big!
7600 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7601 return -CEPHFS_EFBIG;
7602 }
7603
7604 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7605 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
7606 !(mask & CEPH_SETATTR_KILL_SGUID) &&
7607 stx->stx_size >= in->size) {
7608 if (stx->stx_size > in->size) {
7609 in->size = in->reported_size = stx->stx_size;
7610 in->cap_dirtier_uid = perms.uid();
7611 in->cap_dirtier_gid = perms.gid();
7612 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7613 mask &= ~(CEPH_SETATTR_SIZE);
7614 mask |= CEPH_SETATTR_MTIME;
7615 } else {
7616 // ignore it when size doesn't change
7617 mask &= ~(CEPH_SETATTR_SIZE);
7618 }
7619 } else {
7620 args.setattr.size = stx->stx_size;
7621 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7622 CEPH_CAP_FILE_WR;
7623 }
7624 }
7625
7626 if (mask & CEPH_SETATTR_MTIME) {
7627 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7628 in->mtime = utime_t(stx->stx_mtime);
7629 in->ctime = ceph_clock_now();
7630 in->cap_dirtier_uid = perms.uid();
7631 in->cap_dirtier_gid = perms.gid();
7632 in->time_warp_seq++;
7633 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7634 mask &= ~CEPH_SETATTR_MTIME;
7635 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7636 utime_t(stx->stx_mtime) > in->mtime) {
7637 in->mtime = utime_t(stx->stx_mtime);
7638 in->ctime = ceph_clock_now();
7639 in->cap_dirtier_uid = perms.uid();
7640 in->cap_dirtier_gid = perms.gid();
7641 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7642 mask &= ~CEPH_SETATTR_MTIME;
7643 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7644 in->mtime != utime_t(stx->stx_mtime)) {
7645 args.setattr.mtime = utime_t(stx->stx_mtime);
7646 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7647 CEPH_CAP_FILE_WR;
7648 } else {
7649 mask &= ~CEPH_SETATTR_MTIME;
7c673cae 7650 }
7c673cae
FG
7651 }
7652
20effc67
TL
7653 if (mask & CEPH_SETATTR_ATIME) {
7654 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7655 in->atime = utime_t(stx->stx_atime);
7c673cae
FG
7656 in->ctime = ceph_clock_now();
7657 in->cap_dirtier_uid = perms.uid();
7658 in->cap_dirtier_gid = perms.gid();
7659 in->time_warp_seq++;
28e407b8 7660 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
20effc67
TL
7661 mask &= ~CEPH_SETATTR_ATIME;
7662 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7663 utime_t(stx->stx_atime) > in->atime) {
7664 in->atime = utime_t(stx->stx_atime);
7665 in->ctime = ceph_clock_now();
7666 in->cap_dirtier_uid = perms.uid();
7667 in->cap_dirtier_gid = perms.gid();
7668 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7669 mask &= ~CEPH_SETATTR_ATIME;
7670 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7671 in->atime != utime_t(stx->stx_atime)) {
7672 args.setattr.atime = utime_t(stx->stx_atime);
7673 inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7674 CEPH_CAP_FILE_WR;
7675 } else {
7676 mask &= ~CEPH_SETATTR_ATIME;
7c673cae
FG
7677 }
7678 }
20effc67 7679
7c673cae
FG
7680 if (!mask) {
7681 in->change_attr++;
7682 return 0;
7683 }
7684
7c673cae
FG
7685 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7686
7687 filepath path;
7688
7689 in->make_nosnap_relative_path(path);
7690 req->set_filepath(path);
7691 req->set_inode(in);
7692
20effc67
TL
7693 req->head.args = args;
7694 req->inode_drop = inode_drop;
7c673cae 7695 req->head.args.setattr.mask = mask;
7c673cae
FG
7696 req->regetattr_mask = mask;
7697
7698 int res = make_request(req, perms, inp);
7699 ldout(cct, 10) << "_setattr result=" << res << dendl;
7700 return res;
7701}
7702
7703/* Note that we only care about attrs that setattr cares about */
7704void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7705{
7706 stx->stx_size = st->st_size;
7707 stx->stx_mode = st->st_mode;
7708 stx->stx_uid = st->st_uid;
7709 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7710#ifdef __APPLE__
7711 stx->stx_mtime = st->st_mtimespec;
7712 stx->stx_atime = st->st_atimespec;
f67539c2
TL
7713#elif __WIN32
7714 stx->stx_mtime.tv_sec = st->st_mtime;
7715 stx->stx_atime.tv_sec = st->st_atime;
11fdf7f2 7716#else
7c673cae
FG
7717 stx->stx_mtime = st->st_mtim;
7718 stx->stx_atime = st->st_atim;
11fdf7f2 7719#endif
7c673cae
FG
7720}
7721
/**
 * Apply a setattr to @in, then — if the mode was changed — rewrite any
 * POSIX ACLs so their mode bits stay consistent with the new inode mode.
 * Returns 0 or a negative CEPHFS error; @inp (optional) receives the
 * resulting inode from the underlying setattr.
 */
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
		       const UserPerm& perms, InodeRef *inp)
{
  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  // keep ACL mode bits in sync with the chmod we just performed
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
7732
/**
 * Permission-checked setattr entry point for statx-style callers.
 * Restricts @mask to the attribute bits settable through this path,
 * optionally enforces client-side permission checks, then delegates
 * to __setattrx().
 */
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  // only these attribute bits may be set via this interface
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
7747
7748int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7749 const UserPerm& perms)
7750{
7751 struct ceph_statx stx;
7752
7753 stat_to_statx(attr, &stx);
7754 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7755
7756 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7757 mask &= ~CEPH_SETATTR_UID;
7758 }
7759 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7760 mask &= ~CEPH_SETATTR_GID;
7761 }
7762
7c673cae
FG
7763 return _setattrx(in, &stx, mask, perms);
7764}
7765
7766int Client::setattr(const char *relpath, struct stat *attr, int mask,
7767 const UserPerm& perms)
7768{
f67539c2
TL
7769 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7770 if (!mref_reader.is_state_satisfied())
7771 return -CEPHFS_ENOTCONN;
7772
11fdf7f2 7773 tout(cct) << __func__ << std::endl;
7c673cae
FG
7774 tout(cct) << relpath << std::endl;
7775 tout(cct) << mask << std::endl;
7776
7777 filepath path(relpath);
7778 InodeRef in;
f67539c2
TL
7779
7780 std::scoped_lock lock(client_lock);
7c673cae
FG
7781 int r = path_walk(path, &in, perms);
7782 if (r < 0)
7783 return r;
7784 return _setattr(in, attr, mask, perms);
7785}
7786
7787int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7788 const UserPerm& perms, int flags)
7789{
f67539c2
TL
7790 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7791 if (!mref_reader.is_state_satisfied())
7792 return -CEPHFS_ENOTCONN;
7793
11fdf7f2 7794 tout(cct) << __func__ << std::endl;
7c673cae
FG
7795 tout(cct) << relpath << std::endl;
7796 tout(cct) << mask << std::endl;
7797
7798 filepath path(relpath);
7799 InodeRef in;
f67539c2
TL
7800
7801 std::scoped_lock lock(client_lock);
7c673cae
FG
7802 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7803 if (r < 0)
7804 return r;
7805 return _setattrx(in, stx, mask, perms);
7806}
7807
7808int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7809{
f67539c2
TL
7810 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7811 if (!mref_reader.is_state_satisfied())
7812 return -CEPHFS_ENOTCONN;
7813
11fdf7f2 7814 tout(cct) << __func__ << std::endl;
7c673cae
FG
7815 tout(cct) << fd << std::endl;
7816 tout(cct) << mask << std::endl;
7817
f67539c2 7818 std::scoped_lock lock(client_lock);
7c673cae
FG
7819 Fh *f = get_filehandle(fd);
7820 if (!f)
f67539c2 7821 return -CEPHFS_EBADF;
7c673cae
FG
7822#if defined(__linux__) && defined(O_PATH)
7823 if (f->flags & O_PATH)
f67539c2 7824 return -CEPHFS_EBADF;
7c673cae
FG
7825#endif
7826 return _setattr(f->inode, attr, mask, perms);
7827}
7828
7829int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7830{
f67539c2
TL
7831 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7832 if (!mref_reader.is_state_satisfied())
7833 return -CEPHFS_ENOTCONN;
7834
11fdf7f2 7835 tout(cct) << __func__ << std::endl;
7c673cae
FG
7836 tout(cct) << fd << std::endl;
7837 tout(cct) << mask << std::endl;
7838
f67539c2 7839 std::scoped_lock lock(client_lock);
7c673cae
FG
7840 Fh *f = get_filehandle(fd);
7841 if (!f)
f67539c2 7842 return -CEPHFS_EBADF;
7c673cae
FG
7843#if defined(__linux__) && defined(O_PATH)
7844 if (f->flags & O_PATH)
f67539c2 7845 return -CEPHFS_EBADF;
7c673cae
FG
7846#endif
7847 return _setattrx(f->inode, stx, mask, perms);
7848}
7849
/**
 * POSIX stat(2) equivalent: resolve @relpath (following symlinks),
 * refresh the needed caps via _getattr, and fill @stbuf from the
 * cached inode.  @dirstat (optional) receives directory fragstat;
 * @mask selects which caps must be fresh.
 */
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7877
7878unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7879{
7880 unsigned mask = 0;
7881
7882 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7883 if (flags & AT_NO_ATTR_SYNC)
7884 goto out;
7885
7886 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7887 mask |= CEPH_CAP_PIN;
7888 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7889 mask |= CEPH_CAP_AUTH_SHARED;
7890 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7891 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 7892 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
7893 mask |= CEPH_CAP_FILE_SHARED;
7894 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7895 mask |= CEPH_CAP_XATTR_SHARED;
7896out:
7897 return mask;
7898}
7899
/**
 * statx on a path relative to the current working directory;
 * thin wrapper over statxat() with CEPHFS_AT_FDCWD.
 */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
7906
/**
 * POSIX lstat(2) equivalent: like stat() but does not follow a final
 * symlink component, so symlinks are stat'ed themselves.
 */
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7935
/**
 * Fill a POSIX struct stat from the cached state of @in.  Optionally
 * copies out the directory fragstat (@dirstat) and recursive stats
 * (@rstat).  Returns the caps currently issued on the inode, which
 * callers can use to judge the freshness of the returned fields.
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;  // snapid doubles as the "device" id
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // directory link count is synthesized from the subdir count
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      st->st_nlink = 1 /* parent dentry */
	+ 1 /* <dir>/. */
	+ in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report ctime as max(ctime, mtime): a newer mtime implies the inode
  // changed at least that recently
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8005
/**
 * Fill a ceph_statx from the cached state of @in.  @mask is the cap
 * mask that is known fresh; only fields covered by fresh caps are
 * populated, and stx_mask is set accordingly so callers can tell
 * which fields are valid.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    // directory link count is synthesized from the subdir count
    if (in->is_dir()) {
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
	  + 1 /* <dir>/. */
	  + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // ctime reported as max(ctime, mtime), as in fill_stat()
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8088
// Mark dentry @dn as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8093
/**
 * POSIX chmod(2) equivalent: thin wrapper over chmodat() relative to
 * the current working directory, following symlinks.
 */
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8098
8099int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8100{
f67539c2
TL
8101 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8102 if (!mref_reader.is_state_satisfied())
8103 return -CEPHFS_ENOTCONN;
8104
11fdf7f2 8105 tout(cct) << __func__ << std::endl;
7c673cae
FG
8106 tout(cct) << fd << std::endl;
8107 tout(cct) << mode << std::endl;
181888fb 8108
f67539c2 8109 std::scoped_lock lock(client_lock);
7c673cae
FG
8110 Fh *f = get_filehandle(fd);
8111 if (!f)
f67539c2 8112 return -CEPHFS_EBADF;
7c673cae
FG
8113#if defined(__linux__) && defined(O_PATH)
8114 if (f->flags & O_PATH)
f67539c2 8115 return -CEPHFS_EBADF;
7c673cae
FG
8116#endif
8117 struct stat attr;
8118 attr.st_mode = mode;
8119 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8120}
8121
b3b6e05e
TL
/**
 * Change the mode of the file at @relpath, resolved relative to the
 * open directory @dirfd (CEPHFS_AT_FDCWD for the cwd).  With
 * AT_SYMLINK_NOFOLLOW in @flags the symlink itself is operated on
 * rather than its target.
 */
int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
                    const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
8153
b3b6e05e
TL
// chmod that does not follow a final symlink component (operates on the
// link itself); wrapper over chmodat() with AT_SYMLINK_NOFOLLOW.
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8158
7c673cae
FG
/**
 * POSIX chown(2) equivalent: wrapper over chownat() relative to the
 * current working directory, following symlinks.
 */
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8164
8165int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8166{
f67539c2
TL
8167 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8168 if (!mref_reader.is_state_satisfied())
8169 return -CEPHFS_ENOTCONN;
8170
11fdf7f2 8171 tout(cct) << __func__ << std::endl;
7c673cae
FG
8172 tout(cct) << fd << std::endl;
8173 tout(cct) << new_uid << std::endl;
8174 tout(cct) << new_gid << std::endl;
181888fb 8175
f67539c2 8176 std::scoped_lock lock(client_lock);
7c673cae
FG
8177 Fh *f = get_filehandle(fd);
8178 if (!f)
f67539c2 8179 return -CEPHFS_EBADF;
7c673cae
FG
8180#if defined(__linux__) && defined(O_PATH)
8181 if (f->flags & O_PATH)
f67539c2 8182 return -CEPHFS_EBADF;
7c673cae
FG
8183#endif
8184 struct stat attr;
8185 attr.st_uid = new_uid;
8186 attr.st_gid = new_gid;
8187 int mask = 0;
8188 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8189 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8190 return _setattr(f->inode, &attr, mask, perms);
8191}
8192
// chown that does not follow a final symlink component; wrapper over
// chownat() with AT_SYMLINK_NOFOLLOW.
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8198
/**
 * Change ownership of the file at @relpath, resolved relative to the
 * open directory @dirfd (CEPHFS_AT_FDCWD for the cwd).  With
 * AT_SYMLINK_NOFOLLOW in @flags the symlink itself is operated on.
 * Both UID and GID bits are passed down unconditionally; _setattr()
 * strips the bits whose value is the -1 "unchanged" sentinel.
 */
int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
                    int flags, const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
8232
11fdf7f2
TL
// Helper: store @atime/@mtime into @attr using the platform-neutral
// stat_set_* accessors (the nsec fields differ across platforms).
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
8242
8243// for [l]utime() invoke the timeval variant as the timespec
8244// variant are not yet implemented. for futime[s](), invoke
8245// the timespec variant.
7c673cae
FG
8246int Client::utime(const char *relpath, struct utimbuf *buf,
8247 const UserPerm& perms)
8248{
11fdf7f2
TL
8249 struct timeval tv[2];
8250 tv[0].tv_sec = buf->actime;
8251 tv[0].tv_usec = 0;
8252 tv[1].tv_sec = buf->modtime;
8253 tv[1].tv_usec = 0;
8254
8255 return utimes(relpath, tv, perms);
8256}
8257
8258int Client::lutime(const char *relpath, struct utimbuf *buf,
8259 const UserPerm& perms)
8260{
8261 struct timeval tv[2];
8262 tv[0].tv_sec = buf->actime;
8263 tv[0].tv_usec = 0;
8264 tv[1].tv_sec = buf->modtime;
8265 tv[1].tv_usec = 0;
8266
8267 return lutimes(relpath, tv, perms);
8268}
8269
8270int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8271{
8272 struct timespec ts[2];
8273 ts[0].tv_sec = buf->actime;
8274 ts[0].tv_nsec = 0;
8275 ts[1].tv_sec = buf->modtime;
8276 ts[1].tv_nsec = 0;
8277
8278 return futimens(fd, ts, perms);
8279}
8280
/**
 * POSIX utimes(2) equivalent: set atime/mtime on the file at @relpath,
 * following symlinks.  times[0] is atime, times[1] is mtime.
 */
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8309
11fdf7f2
TL
/**
 * utimes that does not follow a final symlink component (sets the
 * times on the link itself).  times[0] is atime, times[1] is mtime.
 */
int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, false);  // false: don't follow symlinks
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8338
11fdf7f2
TL
8339int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8340{
8341 struct timespec ts[2];
8342 ts[0].tv_sec = times[0].tv_sec;
8343 ts[0].tv_nsec = times[0].tv_usec * 1000;
8344 ts[1].tv_sec = times[1].tv_sec;
8345 ts[1].tv_nsec = times[1].tv_usec * 1000;
8346
8347 return futimens(fd, ts, perms);
8348}
8349
/**
 * POSIX futimens(2) equivalent: set atime/mtime on an open file
 * descriptor with nanosecond precision.  O_PATH handles (Linux) are
 * rejected with -CEPHFS_EBADF.
 */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8378
b3b6e05e
TL
/**
 * POSIX utimensat(2) equivalent: set atime/mtime on @relpath resolved
 * relative to @dirfd (CEPHFS_AT_FDCWD for the cwd), with
 * AT_SYMLINK_NOFOLLOW honored.
 */
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
                      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  // NOTE(review): @flags here carries AT_* flags, yet this tests the
  // open(2) O_PATH bit; the sibling f*attr functions test O_PATH on the
  // file handle's open flags instead.  Looks like the wrong constant —
  // confirm intent before changing.
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8422
7c673cae
FG
8423int Client::flock(int fd, int operation, uint64_t owner)
8424{
f67539c2
TL
8425 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8426 if (!mref_reader.is_state_satisfied())
8427 return -CEPHFS_ENOTCONN;
8428
11fdf7f2 8429 tout(cct) << __func__ << std::endl;
7c673cae
FG
8430 tout(cct) << fd << std::endl;
8431 tout(cct) << operation << std::endl;
8432 tout(cct) << owner << std::endl;
181888fb 8433
f67539c2 8434 std::scoped_lock lock(client_lock);
7c673cae
FG
8435 Fh *f = get_filehandle(fd);
8436 if (!f)
f67539c2 8437 return -CEPHFS_EBADF;
7c673cae
FG
8438
8439 return _flock(f, operation, owner);
8440}
8441
/**
 * Open the directory at @relpath for reading.  On success *@dirpp
 * points at a new dir_result_t; on -CEPHFS_ENOTDIR *@dirpp is left
 * uninitialized and must not be read.
 */
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR)
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  return r;
}
8469
b3b6e05e
TL
/**
 * Open a directory stream from an already-open directory fd (the
 * fdopendir(3) analogue).  On -CEPHFS_ENOTDIR *@dirpp is left
 * uninitialized and must not be read.
 */
int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;

  InodeRef dirinode;
  std::scoped_lock locker(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  if (cct->_conf->client_permissions) {
    r = may_open(dirinode.get(), O_RDONLY, perms);
    if (r < 0) {
      return r;
    }
  }
  r = _opendir(dirinode.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR) {
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  }
  return r;
}
8499
7c673cae
FG
/**
 * Internal opendir: allocate a dir_result_t for @in and track it in
 * opened_dirs.  Caller must hold client_lock.  Returns -CEPHFS_ENOTDIR
 * for non-directories, in which case *@dirpp is not written.
 */
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -CEPHFS_ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
8509
8510
/**
 * Close a directory stream previously returned by opendir()/fdopendir().
 * Always returns 0.
 */
int Client::closedir(dir_result_t *dir)
{
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
8521
// Tear down a directory iterator: drop its inode reference, free any cached
// readdir buffer, deregister it, and delete it. Caller holds client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    // Releases the InodeRef so the inode can be trimmed.
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
8534
// Reset a directory stream to the beginning (rewinddir(3) analogue).
// Silently a no-op if the client is not mounted/mounting.
void Client::rewinddir(dir_result_t *dirp)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  // Throw away buffered entries and reset offset/state to the start.
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
8548
// Return the current position of a directory stream (telldir(3) analogue).
// NOTE(review): reads d->offset without taking client_lock — presumably a
// torn read is acceptable for this advisory value; confirm against callers.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
8555
// Reposition a directory stream (seekdir(3) analogue). The offset encodes a
// fragment in its high bits and an intra-fragment position in its low bits
// (see dir_result_t::make_fpos); seeking may invalidate the buffered frag.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek forces a buffer reload.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: reload if rewinding to 0, jumping to a different
    // fragment, or moving backward within the buffered fragment.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
8590
8591
8592//struct dirent {
8593// ino_t d_ino; /* inode number */
8594// off_t d_off; /* offset to the next dirent */
8595// unsigned short d_reclen; /* length of this record */
8596// unsigned char d_type; /* type of file */
8597// char d_name[256]; /* filename */
8598//};
// Populate a struct dirent from CephFS metadata. `type` is an S_IF* mode
// value (converted to DT_* via IFTODT); `next_off` is the offset of the
// entry that follows, used by readdir-style iteration.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // d_name is a fixed 256-byte field; truncate and always NUL-terminate.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  // d_off is absent from dirent on macOS/FreeBSD.
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
8614
// Advance the iterator past the currently buffered fragment. If that frag was
// the rightmost one the stream is marked at-end; otherwise the offset is moved
// to the start (position 2, after "."/"..") of the next fragment.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // The computed frag may have been split/merged; re-map it.
    _readdir_rechoose_frag(dirp);
  }
}
8640
// Re-map the fragment encoded in dirp->offset through the directory's current
// dirfragtree (it may have been split or merged since the offset was formed).
// No-op for hash-ordered iteration, where offsets are frag-independent.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    // Restart at the beginning of the mapped fragment.
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8657
// Discard the buffered directory entries for this iterator; the next read
// will refetch them from the MDS (or the local cache).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8663
// Fetch one directory fragment from the MDS into dirp->buffer. Issues a
// READDIR (or LSSNAP for the snapshot pseudo-directory) MetaRequest; the
// reply handler fills the buffer. On -CEPHFS_EAGAIN (frag moved) the frag is
// re-chosen and the fetch retried recursively. Any other error marks the
// stream at-end. Returns 0 on success or a negative CEPHFS_* error.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Continue the listing after the last entry we already returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    // Frag was split/merged under us; re-map and retry.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8718
// Comparator for std::lower_bound over the readdir cache: orders Dentry*
// against a raw fpos offset using the frag-aware comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8724
// Serve a readdir from the locally cached, complete-and-ordered dentry set,
// invoking `cb` once per entry. Returns 0 at end-of-directory, a positive
// value if the callback asked to stop, a negative error, or -CEPHFS_EAGAIN
// if the cache became unusable mid-walk (caller falls back to MDS readdir).
// Must be entered with client_lock held; the lock is dropped around each
// callback invocation, so cache state is re-validated after every call.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Resume at the first cached dentry at or after the current offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // The cache is only authoritative while the dir stays complete+ordered.
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // Remember the index: _getattr may reshuffle readdir_cache.
    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // Hand the callback a pinned inode reference.
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // Callback runs unlocked; it may re-enter the client.
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8818
// Core readdir iteration engine. Invokes `cb` for each entry starting at the
// stream's current offset: synthesizes "." (offset 0) and ".." (offset 1),
// then serves remaining entries from the local dentry cache when the
// directory is complete+ordered, otherwise by fetching fragments from the
// MDS. Returns 0 at end-of-directory, a positive value if the callback asked
// to stop, or a negative CEPHFS_* error. The client lock is dropped around
// each callback invocation.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Synthetic "." entry at offset 0.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Synthetic ".." entry at offset 1; for the root (no parent dentry) ".."
  // points back at the directory itself.
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -CEPHFS_EAGAIN)
      return err;
    // EAGAIN: cache became unusable; fall through to fragment fetch.
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // Walk the buffered entries starting at the current offset.
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      cl.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Completed a full listing: if nothing changed under us, mark the
    // directory complete (and ordered) so future readdirs hit the cache.
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9016
9017
9018int Client::readdir_r(dir_result_t *d, struct dirent *de)
9019{
9020 return readdirplus_r(d, de, 0, 0, 0, NULL);
9021}
9022
9023/*
9024 * readdirplus_r
9025 *
9026 * returns
9027 * 1 if we got a dirent
9028 * 0 for end of directory
9029 * <0 on error
9030 */
9031
// Callback context used to pull exactly one entry out of readdir_r_cb:
// the first invocation fills the slots and sets `full`; subsequent
// invocations are rejected by the callback.
struct single_readdir {
  struct dirent *de;       // destination dirent (caller-owned)
  struct ceph_statx *stx;  // optional destination statx (may be NULL)
  Inode *inode;            // referenced inode, when requested via getref
  bool full;               // set once the single slot has been consumed
};
9038
9039static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9040 struct ceph_statx *stx, off_t off,
9041 Inode *in)
9042{
9043 single_readdir *c = static_cast<single_readdir *>(p);
9044
9045 if (c->full)
9046 return -1; // already filled this dirent
9047
9048 *c->de = *de;
9049 if (c->stx)
9050 *c->stx = *stx;
9051 c->inode = in;
9052 c->full = true;
9053 return 1;
9054}
9055
9056struct dirent *Client::readdir(dir_result_t *d)
9057{
9058 int ret;
f91f0fd5 9059 auto& de = d->de;
7c673cae
FG
9060 single_readdir sr;
9061 sr.de = &de;
9062 sr.stx = NULL;
9063 sr.inode = NULL;
9064 sr.full = false;
9065
9066 // our callback fills the dirent and sets sr.full=true on first
9067 // call, and returns -1 the second time around.
9068 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9069 if (ret < -1) {
9070 errno = -ret; // this sucks.
9071 return (dirent *) NULL;
9072 }
9073 if (sr.full) {
9074 return &de;
9075 }
9076 return (dirent *) NULL;
9077}
9078
9079int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9080 struct ceph_statx *stx, unsigned want,
9081 unsigned flags, Inode **out)
9082{
9083 single_readdir sr;
9084 sr.de = de;
9085 sr.stx = stx;
9086 sr.inode = NULL;
9087 sr.full = false;
9088
9089 // our callback fills the dirent and sets sr.full=true on first
9090 // call, and returns -1 the second time around.
9091 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9092 if (r < -1)
9093 return r;
9094 if (out)
9095 *out = sr.inode;
9096 if (sr.full)
9097 return 1;
9098 return 0;
9099}
9100
9101
9102/* getdents */
// Callback context for _getdents: packs entries into a caller-supplied
// byte buffer, either as whole struct dirent records or as bare names.
struct getdents_result {
  char *buf;     // destination buffer (caller-owned)
  int buflen;    // total capacity of buf in bytes
  int pos;       // bytes written so far
  bool fullent;  // true: copy struct dirent; false: copy NUL-terminated names
};
9109
9110static int _readdir_getdent_cb(void *p, struct dirent *de,
9111 struct ceph_statx *stx, off_t off, Inode *in)
9112{
9113 struct getdents_result *c = static_cast<getdents_result *>(p);
9114
9115 int dlen;
9116 if (c->fullent)
9117 dlen = sizeof(*de);
9118 else
9119 dlen = strlen(de->d_name) + 1;
9120
9121 if (c->pos + dlen > c->buflen)
9122 return -1; // doesn't fit
9123
9124 if (c->fullent) {
9125 memcpy(c->buf + c->pos, de, sizeof(*de));
9126 } else {
9127 memcpy(c->buf + c->pos, de->d_name, dlen);
9128 }
9129 c->pos += dlen;
9130 return 0;
9131}
9132
9133int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9134{
9135 getdents_result gr;
9136 gr.buf = buf;
9137 gr.buflen = buflen;
9138 gr.fullent = fullent;
9139 gr.pos = 0;
9140
9141 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9142
9143 if (r < 0) { // some error
9144 if (r == -1) { // buffer ran out of space
9145 if (gr.pos) { // but we got some entries already!
9146 return gr.pos;
9147 } // or we need a larger buffer
f67539c2 9148 return -CEPHFS_ERANGE;
7c673cae
FG
9149 } else { // actual error, return it
9150 return r;
9151 }
9152 }
9153 return gr.pos;
9154}
9155
9156
9157/* getdir */
// Callback context for getdir(): collects entry names into a list and
// counts them.
struct getdir_result {
  list<string> *contents;  // destination list (caller-owned)
  int num;                 // number of entries appended
};
9162
9163static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9164{
9165 getdir_result *r = static_cast<getdir_result *>(p);
9166
9167 r->contents->push_back(de->d_name);
9168 r->num++;
9169 return 0;
9170}
9171
// Convenience: list every entry name of `relpath` into `contents`.
// Returns the number of entries on success, or a negative CEPHFS_* error.
int Client::getdir(const char *relpath, list<string>& contents,
		   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  tout(cct) << "getdir" << std::endl;
  tout(cct) << relpath << std::endl;

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  // Always release the handle, even when the walk failed.
  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
9195
9196
9197/****** file i/o **********/
f67539c2 9198
b3b6e05e 9199// common parts for open and openat. call with client_lock locked.
// Common parts for open and openat: resolve `relpath` relative to `dirfd`,
// optionally create the file (O_CREAT), apply permission checks, open it,
// and allocate an integer file descriptor. Call with client_lock locked.
// Returns the new fd (>= 0) or a negative CEPHFS_* error.
int Client::create_and_open(int dirfd, const char *relpath, int flags,
			    const UserPerm& perms, mode_t mode, int stripe_unit,
			    int stripe_count, int object_size, const char *data_pool,
			    std::string alternate_name) {
  ceph_assert(ceph_mutex_is_locked(client_lock));
  int cflags = ceph_flags_sys2wire(flags);
  tout(cct) << cflags << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  InodeRef dirinode = nullptr;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, followsym, mask, dirinode);
  // Exclusive create of an existing file fails outright.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -CEPHFS_ELOOP;

  // Target missing and O_CREAT requested: create it in the parent dir.
  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
    if (r < 0) {
      goto out;
    }
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms,
		std::move(alternate_name));
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  return r;
}
9286
// open(2) analogue: openat relative to the current working directory.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool, std::string alternate_name)
{
  return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
                stripe_count, object_size, data_pool, alternate_name);
}
9294
b3b6e05e
TL
// openat(2) analogue: open (and possibly create) `relpath` relative to
// `dirfd`. Returns a new fd (>= 0) or a negative CEPHFS_* error.
int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
                   mode_t mode, int stripe_unit, int stripe_count, int object_size,
                   const char *data_pool, std::string alternate_name) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << flags << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock locker(client_lock);
  // All the real work happens under the lock in the shared helper.
  int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
                          object_size, data_pool, alternate_name);

  tout(cct) << r << std::endl;
  ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
  return r;
}
9317
7c673cae
FG
// Ask the MDS to look up inode `ino` by the hash of `name` within directory
// `dirino` (LOOKUPHASH). Used for NFS-handle style resolution. Returns 0 or
// a negative CEPHFS_* error.
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
			const UserPerm& perms)
{
  ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  // filepath2 carries the directory plus the dentry hash as a decimal string.
  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  // Any in MDS can service this; pick one at random.
  int r = make_request(req, perms, NULL, NULL,
		       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
9344
9345
9346/**
9347 * Load inode into local cache.
9348 *
9349 * If inode pointer is non-NULL, and take a reference on
9350 * the resulting Inode object in one operation, so that caller
9351 * can safely assume inode will still be there after return.
9352 */
// Load the inode identified by `vino` into the local cache via an MDS
// LOOKUPINO request. If `inode` is non-NULL, also return a referenced
// Inode* (caller must _ll_put it). Returns 0 or a negative CEPHFS_* error.
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Reserved/special vinos can never resolve to a real inode.
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // The reply handler inserted the inode; it must be present now.
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
9386
1adf2230
AA
9387int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9388{
f67539c2
TL
9389 vinodeno_t vino(ino, CEPH_NOSNAP);
9390 std::scoped_lock lock(client_lock);
9391 return _lookup_vino(vino, perms, inode);
1adf2230 9392}
7c673cae
FG
9393
9394/**
9395 * Find the parent inode of `ino` and insert it into
9396 * our cache. Conditionally also set `parent` to a referenced
9397 * Inode* if caller provides non-NULL value.
9398 */
// Find the parent inode of `ino` via an MDS LOOKUPPARENT request and insert
// it into our cache. If `parent` is non-NULL, set it to a referenced Inode*
// on success (caller must _ll_put it) or NULL on failure.
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9422
7c673cae
FG
9423/**
9424 * Populate the parent dentry for `ino`, provided it is
9425 * a child of `parent`.
9426 */
// Populate the parent dentry (name) for `ino`, provided it is a child of
// `parent`, via an MDS LOOKUPNAME request. Returns 0 or a negative error.
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9445
1adf2230
AA
9446int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9447{
f67539c2 9448 std::scoped_lock lock(client_lock);
1adf2230
AA
9449 return _lookup_name(ino, parent, perms);
9450}
7c673cae 9451
// Allocate and initialize a file handle (Fh) for an opened inode: pins a
// snap cap ref for snapshot inodes and configures readahead limits and
// alignments from the client configuration and file layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // Snapshot inodes are immutable; track the open with a snap cap ref.
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // Max readahead is the tighter of the byte limit and the period limit.
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // Align readahead to the layout's period and stripe unit.
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
9483
// Release a file handle: drop delegations, flush dirty data and re-check
// caps when the last open ref on a head inode goes away, release file locks,
// and surface any asynchronous write-back error to the caller. Returns 0 or
// the deferred async error.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // Last opener in this mode: flush dirty data and update wanted caps.
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot inode: just drop the snap cap ref taken in _create_fh.
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
9518
9519void Client::_put_fh(Fh *f)
9520{
9521 int left = f->put();
9522 if (!left) {
9523 delete f;
9524 }
9525}
9526
9527int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9528 const UserPerm& perms)
9529{
9530 if (in->snapid != CEPH_NOSNAP &&
9531 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
f67539c2 9532 return -CEPHFS_EROFS;
7c673cae
FG
9533 }
9534
9535 // use normalized flags to generate cmode
11fdf7f2
TL
9536 int cflags = ceph_flags_sys2wire(flags);
9537 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9538 cflags |= CEPH_O_LAZY;
9539
9540 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
9541 int want = ceph_caps_for_mode(cmode);
9542 int result = 0;
9543
9544 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
9545
b32b8144 9546 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
9547 // update wanted?
9548 check_caps(in, CHECK_CAPS_NODELAY);
9549 } else {
b32b8144 9550
7c673cae
FG
9551 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9552 filepath path;
9553 in->make_nosnap_relative_path(path);
9554 req->set_filepath(path);
11fdf7f2 9555 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
9556 req->head.args.open.mode = mode;
9557 req->head.args.open.pool = -1;
9558 if (cct->_conf->client_debug_getattr_caps)
9559 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9560 else
9561 req->head.args.open.mask = 0;
9562 req->head.args.open.old_size = in->size; // for O_TRUNC
9563 req->set_inode(in);
9564 result = make_request(req, perms);
b32b8144
FG
9565
9566 /*
9567 * NFS expects that delegations will be broken on a conflicting open,
9568 * not just when there is actual conflicting access to the file. SMB leases
9569 * and oplocks also have similar semantics.
9570 *
9571 * Ensure that clients that have delegations enabled will wait on minimal
9572 * caps during open, just to ensure that other clients holding delegations
9573 * return theirs first.
9574 */
9575 if (deleg_timeout && result == 0) {
9576 int need = 0, have;
9577
9578 if (cmode & CEPH_FILE_MODE_WR)
9579 need |= CEPH_CAP_FILE_WR;
9580 if (cmode & CEPH_FILE_MODE_RD)
9581 need |= CEPH_CAP_FILE_RD;
9582
f6b5b4d7
TL
9583 Fh fh(in, flags, cmode, fd_gen, perms);
9584 result = get_caps(&fh, need, want, &have, -1);
b32b8144 9585 if (result < 0) {
1adf2230 9586 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
9587 " . Denying open: " <<
9588 cpp_strerror(result) << dendl;
b32b8144
FG
9589 } else {
9590 put_cap_ref(in, need);
9591 }
9592 }
7c673cae
FG
9593 }
9594
9595 // success?
9596 if (result >= 0) {
9597 if (fhp)
9598 *fhp = _create_fh(in, flags, cmode, perms);
9599 } else {
9600 in->put_open_ref(cmode);
9601 }
9602
9603 trim_cache();
9604
9605 return result;
9606}
9607
9608int Client::_renew_caps(Inode *in)
9609{
9610 int wanted = in->caps_file_wanted();
9611 if (in->is_any_caps() &&
9612 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9613 check_caps(in, CHECK_CAPS_NODELAY);
9614 return 0;
9615 }
9616
9617 int flags = 0;
9618 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9619 flags = O_RDWR;
9620 else if (wanted & CEPH_CAP_FILE_RD)
9621 flags = O_RDONLY;
9622 else if (wanted & CEPH_CAP_FILE_WR)
9623 flags = O_WRONLY;
9624
9625 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9626 filepath path;
9627 in->make_nosnap_relative_path(path);
9628 req->set_filepath(path);
9629 req->head.args.open.flags = flags;
9630 req->head.args.open.pool = -1;
9631 if (cct->_conf->client_debug_getattr_caps)
9632 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9633 else
9634 req->head.args.open.mask = 0;
9635 req->set_inode(in);
9636
9637 // duplicate in case Cap goes away; not sure if that race is a concern?
9638 const UserPerm *pperm = in->get_best_perms();
9639 UserPerm perms;
9640 if (pperm != NULL)
9641 perms = *pperm;
9642 int ret = make_request(req, perms);
9643 return ret;
9644}
9645
b3b6e05e 9646int Client::_close(int fd)
7c673cae
FG
9647{
9648 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
7c673cae
FG
9649 tout(cct) << "close" << std::endl;
9650 tout(cct) << fd << std::endl;
9651
9652 Fh *fh = get_filehandle(fd);
9653 if (!fh)
f67539c2 9654 return -CEPHFS_EBADF;
7c673cae
FG
9655 int err = _release_fh(fh);
9656 fd_map.erase(fd);
9657 put_fd(fd);
9658 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9659 return err;
9660}
9661
b3b6e05e
TL
9662int Client::close(int fd) {
9663 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9664 if (!mref_reader.is_state_satisfied())
9665 return -CEPHFS_ENOTCONN;
9666
9667 std::scoped_lock lock(client_lock);
9668 return _close(fd);
9669}
7c673cae
FG
9670
9671// ------------
9672// read, write
9673
9674loff_t Client::lseek(int fd, loff_t offset, int whence)
9675{
f67539c2
TL
9676 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9677 if (!mref_reader.is_state_satisfied())
9678 return -CEPHFS_ENOTCONN;
9679
7c673cae
FG
9680 tout(cct) << "lseek" << std::endl;
9681 tout(cct) << fd << std::endl;
9682 tout(cct) << offset << std::endl;
9683 tout(cct) << whence << std::endl;
9684
f67539c2 9685 std::scoped_lock lock(client_lock);
7c673cae
FG
9686 Fh *f = get_filehandle(fd);
9687 if (!f)
f67539c2 9688 return -CEPHFS_EBADF;
7c673cae
FG
9689#if defined(__linux__) && defined(O_PATH)
9690 if (f->flags & O_PATH)
f67539c2 9691 return -CEPHFS_EBADF;
7c673cae
FG
9692#endif
9693 return _lseek(f, offset, whence);
9694}
9695
9696loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9697{
9698 Inode *in = f->inode.get();
9f95a23c 9699 bool whence_check = false;
11fdf7f2 9700 loff_t pos = -1;
7c673cae 9701
9f95a23c
TL
9702 switch (whence) {
9703 case SEEK_END:
9704 whence_check = true;
9705 break;
9706
9707#ifdef SEEK_DATA
9708 case SEEK_DATA:
9709 whence_check = true;
9710 break;
9711#endif
9712
9713#ifdef SEEK_HOLE
9714 case SEEK_HOLE:
9715 whence_check = true;
9716 break;
9717#endif
9718 }
9719
9720 if (whence_check) {
9721 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9722 if (r < 0)
92f5a8d4 9723 return r;
92f5a8d4
TL
9724 }
9725
7c673cae
FG
9726 switch (whence) {
9727 case SEEK_SET:
11fdf7f2 9728 pos = offset;
7c673cae
FG
9729 break;
9730
9731 case SEEK_CUR:
92f5a8d4 9732 pos = f->pos + offset;
7c673cae
FG
9733 break;
9734
9735 case SEEK_END:
11fdf7f2 9736 pos = in->size + offset;
7c673cae
FG
9737 break;
9738
9f95a23c 9739#ifdef SEEK_DATA
92f5a8d4 9740 case SEEK_DATA:
9f95a23c 9741 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
f67539c2 9742 return -CEPHFS_ENXIO;
92f5a8d4
TL
9743 pos = offset;
9744 break;
9f95a23c 9745#endif
92f5a8d4 9746
9f95a23c 9747#ifdef SEEK_HOLE
92f5a8d4 9748 case SEEK_HOLE:
9f95a23c 9749 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
f67539c2 9750 return -CEPHFS_ENXIO;
9f95a23c 9751 pos = in->size;
92f5a8d4 9752 break;
9f95a23c 9753#endif
92f5a8d4 9754
7c673cae 9755 default:
92f5a8d4 9756 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
f67539c2 9757 return -CEPHFS_EINVAL;
7c673cae
FG
9758 }
9759
11fdf7f2 9760 if (pos < 0) {
f67539c2 9761 return -CEPHFS_EINVAL;
11fdf7f2
TL
9762 } else {
9763 f->pos = pos;
9764 }
9765
1adf2230 9766 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
9767 return f->pos;
9768}
9769
9770
9771void Client::lock_fh_pos(Fh *f)
9772{
11fdf7f2 9773 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
9774
9775 if (f->pos_locked || !f->pos_waiters.empty()) {
9f95a23c 9776 ceph::condition_variable cond;
7c673cae 9777 f->pos_waiters.push_back(&cond);
11fdf7f2 9778 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9f95a23c
TL
9779 std::unique_lock l{client_lock, std::adopt_lock};
9780 cond.wait(l, [f, me=&cond] {
9781 return !f->pos_locked && f->pos_waiters.front() == me;
9782 });
9783 l.release();
11fdf7f2
TL
9784 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9785 ceph_assert(f->pos_waiters.front() == &cond);
7c673cae
FG
9786 f->pos_waiters.pop_front();
9787 }
9788
9789 f->pos_locked = true;
9790}
9791
9792void Client::unlock_fh_pos(Fh *f)
9793{
f67539c2
TL
9794 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9795
11fdf7f2 9796 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae 9797 f->pos_locked = false;
f67539c2
TL
9798 if (!f->pos_waiters.empty()) {
9799 // only wake up the oldest waiter
9800 auto cond = f->pos_waiters.front();
9801 cond->notify_one();
9802 }
7c673cae
FG
9803}
9804
9805int Client::uninline_data(Inode *in, Context *onfinish)
9806{
9807 if (!in->inline_data.length()) {
9808 onfinish->complete(0);
9809 return 0;
9810 }
9811
9812 char oid_buf[32];
9813 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9814 object_t oid = oid_buf;
9815
9816 ObjectOperation create_ops;
9817 create_ops.create(false);
9818
9819 objecter->mutate(oid,
9820 OSDMap::file_to_object_locator(in->layout),
9821 create_ops,
9822 in->snaprealm->get_snap_context(),
9823 ceph::real_clock::now(),
9824 0,
9825 NULL);
9826
9827 bufferlist inline_version_bl;
11fdf7f2 9828 encode(in->inline_version, inline_version_bl);
7c673cae
FG
9829
9830 ObjectOperation uninline_ops;
9831 uninline_ops.cmpxattr("inline_version",
9832 CEPH_OSD_CMPXATTR_OP_GT,
9833 CEPH_OSD_CMPXATTR_MODE_U64,
9834 inline_version_bl);
9835 bufferlist inline_data = in->inline_data;
9836 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9837 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9838
9839 objecter->mutate(oid,
9840 OSDMap::file_to_object_locator(in->layout),
9841 uninline_ops,
9842 in->snaprealm->get_snap_context(),
9843 ceph::real_clock::now(),
9844 0,
9845 onfinish);
9846
9847 return 0;
9848}
9849
9850//
9851
9852// blocking osd interface
9853
9854int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9855{
f67539c2
TL
9856 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9857 if (!mref_reader.is_state_satisfied())
9858 return -CEPHFS_ENOTCONN;
9859
7c673cae
FG
9860 tout(cct) << "read" << std::endl;
9861 tout(cct) << fd << std::endl;
9862 tout(cct) << size << std::endl;
9863 tout(cct) << offset << std::endl;
9864
f67539c2 9865 std::unique_lock lock(client_lock);
7c673cae
FG
9866 Fh *f = get_filehandle(fd);
9867 if (!f)
f67539c2 9868 return -CEPHFS_EBADF;
7c673cae
FG
9869#if defined(__linux__) && defined(O_PATH)
9870 if (f->flags & O_PATH)
f67539c2 9871 return -CEPHFS_EBADF;
7c673cae
FG
9872#endif
9873 bufferlist bl;
11fdf7f2
TL
9874 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9875 size = std::min(size, (loff_t)INT_MAX);
7c673cae
FG
9876 int r = _read(f, offset, size, &bl);
9877 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9878 if (r >= 0) {
f6b5b4d7 9879 lock.unlock();
9f95a23c 9880 bl.begin().copy(bl.length(), buf);
7c673cae
FG
9881 r = bl.length();
9882 }
9883 return r;
9884}
9885
9886int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9887{
9888 if (iovcnt < 0)
f67539c2 9889 return -CEPHFS_EINVAL;
7c673cae
FG
9890 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9891}
9892
11fdf7f2 9893int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
7c673cae 9894{
f67539c2
TL
9895 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9896
11fdf7f2
TL
9897 int want, have = 0;
9898 bool movepos = false;
9899 std::unique_ptr<C_SaferCond> onuninline;
adb31ebb 9900 int64_t rc = 0;
11fdf7f2 9901 const auto& conf = cct->_conf;
7c673cae 9902 Inode *in = f->inode.get();
11fdf7f2
TL
9903 utime_t lat;
9904 utime_t start = ceph_clock_now();
7c673cae
FG
9905
9906 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
f67539c2 9907 return -CEPHFS_EBADF;
7c673cae
FG
9908 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9909
7c673cae
FG
9910 if (offset < 0) {
9911 lock_fh_pos(f);
9912 offset = f->pos;
9913 movepos = true;
9914 }
9915 loff_t start_pos = offset;
9916
9917 if (in->inline_version == 0) {
adb31ebb 9918 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
c07f9fc5 9919 if (r < 0) {
adb31ebb 9920 rc = r;
11fdf7f2 9921 goto done;
c07f9fc5 9922 }
11fdf7f2 9923 ceph_assert(in->inline_version > 0);
7c673cae
FG
9924 }
9925
9926retry:
11fdf7f2
TL
9927 if (f->mode & CEPH_FILE_MODE_LAZY)
9928 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9929 else
9930 want = CEPH_CAP_FILE_CACHE;
adb31ebb
TL
9931 {
9932 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
9933 if (r < 0) {
9934 rc = r;
9935 goto done;
9936 }
c07f9fc5 9937 }
7c673cae 9938 if (f->flags & O_DIRECT)
11fdf7f2 9939 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
7c673cae
FG
9940
9941 if (in->inline_version < CEPH_INLINE_NONE) {
9942 if (!(have & CEPH_CAP_FILE_CACHE)) {
11fdf7f2
TL
9943 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9944 uninline_data(in, onuninline.get());
7c673cae
FG
9945 } else {
9946 uint32_t len = in->inline_data.length();
7c673cae
FG
9947 uint64_t endoff = offset + size;
9948 if (endoff > in->size)
9949 endoff = in->size;
9950
9951 if (offset < len) {
9952 if (endoff <= len) {
9953 bl->substr_of(in->inline_data, offset, endoff - offset);
9954 } else {
9955 bl->substr_of(in->inline_data, offset, len - offset);
9956 bl->append_zero(endoff - len);
9957 }
adb31ebb 9958 rc = endoff - offset;
7c673cae
FG
9959 } else if ((uint64_t)offset < endoff) {
9960 bl->append_zero(endoff - offset);
adb31ebb 9961 rc = endoff - offset;
11fdf7f2 9962 } else {
adb31ebb 9963 rc = 0;
7c673cae 9964 }
7c673cae
FG
9965 goto success;
9966 }
9967 }
9968
9969 if (!conf->client_debug_force_sync_read &&
11fdf7f2
TL
9970 conf->client_oc &&
9971 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
7c673cae
FG
9972
9973 if (f->flags & O_RSYNC) {
9974 _flush_range(in, offset, size);
9975 }
adb31ebb
TL
9976 rc = _read_async(f, offset, size, bl);
9977 if (rc < 0)
7c673cae
FG
9978 goto done;
9979 } else {
9980 if (f->flags & O_DIRECT)
9981 _flush_range(in, offset, size);
9982
9983 bool checkeof = false;
adb31ebb
TL
9984 rc = _read_sync(f, offset, size, bl, &checkeof);
9985 if (rc < 0)
7c673cae
FG
9986 goto done;
9987 if (checkeof) {
adb31ebb
TL
9988 offset += rc;
9989 size -= rc;
7c673cae
FG
9990
9991 put_cap_ref(in, CEPH_CAP_FILE_RD);
9992 have = 0;
9993 // reverify size
adb31ebb
TL
9994 {
9995 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9996 if (r < 0) {
9997 rc = r;
9998 goto done;
9999 }
10000 }
7c673cae
FG
10001
10002 // eof? short read.
10003 if ((uint64_t)offset < in->size)
10004 goto retry;
10005 }
10006 }
10007
10008success:
adb31ebb 10009 ceph_assert(rc >= 0);
a4b75251 10010 update_read_io_size(bl->length());
7c673cae
FG
10011 if (movepos) {
10012 // adjust fd pos
adb31ebb 10013 f->pos = start_pos + rc;
7c673cae 10014 }
11fdf7f2
TL
10015
10016 lat = ceph_clock_now();
10017 lat -= start;
10018 logger->tinc(l_c_read, lat);
7c673cae
FG
10019
10020done:
10021 // done!
11fdf7f2 10022
7c673cae 10023 if (onuninline) {
9f95a23c 10024 client_lock.unlock();
11fdf7f2 10025 int ret = onuninline->wait();
9f95a23c 10026 client_lock.lock();
f67539c2 10027 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
7c673cae
FG
10028 in->inline_data.clear();
10029 in->inline_version = CEPH_INLINE_NONE;
28e407b8 10030 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
10031 check_caps(in, 0);
10032 } else
adb31ebb 10033 rc = ret;
7c673cae 10034 }
11fdf7f2 10035 if (have) {
7c673cae 10036 put_cap_ref(in, CEPH_CAP_FILE_RD);
11fdf7f2
TL
10037 }
10038 if (movepos) {
10039 unlock_fh_pos(f);
10040 }
adb31ebb 10041 return rc;
7c673cae
FG
10042}
10043
10044Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
10045 client(c), f(f) {
10046 f->get();
10047 f->readahead.inc_pending();
10048}
10049
10050Client::C_Readahead::~C_Readahead() {
10051 f->readahead.dec_pending();
10052 client->_put_fh(f);
10053}
10054
10055void Client::C_Readahead::finish(int r) {
10056 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
10057 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
a4b75251
TL
10058 if (r > 0) {
10059 client->update_read_io_size(r);
10060 }
7c673cae
FG
10061}
10062
10063int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
10064{
f67539c2
TL
10065 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10066
11fdf7f2 10067 const auto& conf = cct->_conf;
7c673cae
FG
10068 Inode *in = f->inode.get();
10069
11fdf7f2 10070 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
10071
10072 // trim read based on file size?
10073 if (off >= in->size)
10074 return 0;
10075 if (len == 0)
10076 return 0;
10077 if (off + len > in->size) {
10078 len = in->size - off;
10079 }
10080
10081 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
10082 << " max_bytes=" << f->readahead.get_max_readahead_size()
10083 << " max_periods=" << conf->client_readahead_max_periods << dendl;
10084
10085 // read (and possibly block)
11fdf7f2
TL
10086 int r = 0;
10087 C_SaferCond onfinish("Client::_read_async flock");
7c673cae 10088 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
11fdf7f2 10089 off, len, bl, 0, &onfinish);
7c673cae
FG
10090 if (r == 0) {
10091 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9f95a23c 10092 client_lock.unlock();
11fdf7f2 10093 r = onfinish.wait();
9f95a23c 10094 client_lock.lock();
7c673cae 10095 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
a4b75251 10096 update_read_io_size(bl->length());
7c673cae
FG
10097 }
10098
10099 if(f->readahead.get_min_readahead_size() > 0) {
10100 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
10101 if (readahead_extent.second > 0) {
10102 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
10103 << " (caller wants " << off << "~" << len << ")" << dendl;
10104 Context *onfinish2 = new C_Readahead(this, f);
10105 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10106 readahead_extent.first, readahead_extent.second,
10107 NULL, 0, onfinish2);
10108 if (r2 == 0) {
10109 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
10110 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10111 } else {
10112 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
10113 delete onfinish2;
10114 }
10115 }
10116 }
10117
10118 return r;
10119}
10120
10121int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
10122 bool *checkeof)
10123{
f67539c2
TL
10124 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10125
7c673cae
FG
10126 Inode *in = f->inode.get();
10127 uint64_t pos = off;
10128 int left = len;
10129 int read = 0;
10130
11fdf7f2 10131 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae 10132
f67539c2
TL
10133 // 0 success, 1 continue and < 0 error happen.
10134 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
11fdf7f2 10135 int r = onfinish.wait();
7c673cae
FG
10136
10137 // if we get ENOENT from OSD, assume 0 bytes returned
f67539c2 10138 if (r == -CEPHFS_ENOENT)
7c673cae
FG
10139 r = 0;
10140 if (r < 0)
10141 return r;
f67539c2 10142
7c673cae
FG
10143 if (tbl.length()) {
10144 r = tbl.length();
10145
10146 read += r;
10147 pos += r;
10148 left -= r;
10149 bl->claim_append(tbl);
10150 }
10151 // short read?
10152 if (r >= 0 && r < wanted) {
10153 if (pos < in->size) {
10154 // zero up to known EOF
10155 int64_t some = in->size - pos;
10156 if (some > left)
10157 some = left;
11fdf7f2
TL
10158 auto z = buffer::ptr_node::create(some);
10159 z->zero();
10160 bl->push_back(std::move(z));
7c673cae
FG
10161 read += some;
10162 pos += some;
10163 left -= some;
10164 if (left == 0)
f67539c2 10165 return 0;
7c673cae
FG
10166 }
10167
10168 *checkeof = true;
f67539c2 10169 return 0;
7c673cae 10170 }
f67539c2
TL
10171 return 1;
10172 };
7c673cae 10173
f67539c2
TL
10174 while (left > 0) {
10175 C_SaferCond onfinish("Client::_read_sync flock");
10176 bufferlist tbl;
7c673cae 10177
f67539c2
TL
10178 int wanted = left;
10179 filer->read_trunc(in->ino, &in->layout, in->snapid,
10180 pos, left, &tbl, 0,
10181 in->truncate_size, in->truncate_seq,
10182 &onfinish);
10183 client_lock.unlock();
10184 int r = wait_and_copy(onfinish, tbl, wanted);
10185 client_lock.lock();
10186 if (!r)
10187 return read;
10188 if (r < 0)
10189 return r;
7c673cae 10190 }
f67539c2 10191 return read;
7c673cae
FG
10192}
10193
10194int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10195{
f67539c2
TL
10196 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10197 if (!mref_reader.is_state_satisfied())
10198 return -CEPHFS_ENOTCONN;
10199
7c673cae
FG
10200 tout(cct) << "write" << std::endl;
10201 tout(cct) << fd << std::endl;
10202 tout(cct) << size << std::endl;
10203 tout(cct) << offset << std::endl;
10204
f67539c2 10205 std::scoped_lock lock(client_lock);
7c673cae
FG
10206 Fh *fh = get_filehandle(fd);
10207 if (!fh)
f67539c2 10208 return -CEPHFS_EBADF;
7c673cae
FG
10209#if defined(__linux__) && defined(O_PATH)
10210 if (fh->flags & O_PATH)
f67539c2 10211 return -CEPHFS_EBADF;
7c673cae 10212#endif
11fdf7f2
TL
10213 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10214 size = std::min(size, (loff_t)INT_MAX);
10215 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
10216 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10217 return r;
10218}
10219
10220int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10221{
10222 if (iovcnt < 0)
f67539c2 10223 return -CEPHFS_EINVAL;
7c673cae
FG
10224 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10225}
10226
11fdf7f2 10227int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
20effc67
TL
10228 unsigned iovcnt, int64_t offset,
10229 bool write, bool clamp_to_int)
7c673cae 10230{
20effc67
TL
10231 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10232
7c673cae
FG
10233#if defined(__linux__) && defined(O_PATH)
10234 if (fh->flags & O_PATH)
f67539c2 10235 return -CEPHFS_EBADF;
7c673cae
FG
10236#endif
10237 loff_t totallen = 0;
10238 for (unsigned i = 0; i < iovcnt; i++) {
10239 totallen += iov[i].iov_len;
10240 }
11fdf7f2
TL
10241
10242 /*
10243 * Some of the API functions take 64-bit size values, but only return
10244 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10245 * we don't do I/Os larger than the values we can return.
10246 */
10247 if (clamp_to_int) {
10248 totallen = std::min(totallen, (loff_t)INT_MAX);
10249 }
7c673cae 10250 if (write) {
11fdf7f2
TL
10251 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10252 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
7c673cae
FG
10253 return w;
10254 } else {
10255 bufferlist bl;
11fdf7f2
TL
10256 int64_t r = _read(fh, offset, totallen, &bl);
10257 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
7c673cae
FG
10258 if (r <= 0)
10259 return r;
10260
20effc67 10261 client_lock.unlock();
9f95a23c 10262 auto iter = bl.cbegin();
7c673cae
FG
10263 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10264 /*
f67539c2
TL
10265 * This piece of code aims to handle the case that bufferlist
10266 * does not have enough data to fill in the iov
7c673cae 10267 */
9f95a23c
TL
10268 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10269 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10270 resid -= round_size;
10271 /* iter is self-updating */
7c673cae 10272 }
20effc67 10273 client_lock.lock();
f67539c2 10274 return r;
7c673cae
FG
10275 }
10276}
10277
11fdf7f2
TL
10278int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10279{
f67539c2
TL
10280 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10281 if (!mref_reader.is_state_satisfied())
10282 return -CEPHFS_ENOTCONN;
10283
11fdf7f2
TL
10284 tout(cct) << fd << std::endl;
10285 tout(cct) << offset << std::endl;
10286
20effc67 10287 std::scoped_lock cl(client_lock);
11fdf7f2
TL
10288 Fh *fh = get_filehandle(fd);
10289 if (!fh)
f67539c2 10290 return -CEPHFS_EBADF;
20effc67 10291 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
11fdf7f2
TL
10292}
10293
10294int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10295 const struct iovec *iov, int iovcnt)
7c673cae 10296{
f67539c2
TL
10297 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10298
f64942e4
AA
10299 uint64_t fpos = 0;
10300
7c673cae 10301 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
f67539c2 10302 return -CEPHFS_EFBIG;
7c673cae
FG
10303
10304 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10305 Inode *in = f->inode.get();
10306
10307 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
f67539c2 10308 return -CEPHFS_ENOSPC;
7c673cae
FG
10309 }
10310
11fdf7f2 10311 ceph_assert(in->snapid == CEPH_NOSNAP);
7c673cae
FG
10312
10313 // was Fh opened as writeable?
10314 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 10315 return -CEPHFS_EBADF;
7c673cae 10316
7c673cae
FG
10317 // use/adjust fd pos?
10318 if (offset < 0) {
10319 lock_fh_pos(f);
10320 /*
10321 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10322 * change out from under us.
10323 */
10324 if (f->flags & O_APPEND) {
9f95a23c 10325 auto r = _lseek(f, 0, SEEK_END);
7c673cae
FG
10326 if (r < 0) {
10327 unlock_fh_pos(f);
10328 return r;
10329 }
10330 }
10331 offset = f->pos;
f64942e4 10332 fpos = offset+size;
7c673cae
FG
10333 unlock_fh_pos(f);
10334 }
10335
11fdf7f2
TL
10336 // check quota
10337 uint64_t endoff = offset + size;
10338 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10339 f->actor_perms)) {
f67539c2 10340 return -CEPHFS_EDQUOT;
11fdf7f2
TL
10341 }
10342
7c673cae
FG
10343 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10344
10345 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10346
10347 // time it.
10348 utime_t start = ceph_clock_now();
10349
10350 if (in->inline_version == 0) {
10351 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10352 if (r < 0)
10353 return r;
11fdf7f2 10354 ceph_assert(in->inline_version > 0);
7c673cae
FG
10355 }
10356
10357 // copy into fresh buffer (since our write may be resub, async)
10358 bufferlist bl;
10359 if (buf) {
10360 if (size > 0)
10361 bl.append(buf, size);
10362 } else if (iov){
10363 for (int i = 0; i < iovcnt; i++) {
10364 if (iov[i].iov_len > 0) {
10365 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10366 }
10367 }
10368 }
10369
10370 utime_t lat;
10371 uint64_t totalwritten;
11fdf7f2
TL
10372 int want, have;
10373 if (f->mode & CEPH_FILE_MODE_LAZY)
10374 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10375 else
10376 want = CEPH_CAP_FILE_BUFFER;
f6b5b4d7 10377 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
7c673cae
FG
10378 if (r < 0)
10379 return r;
10380
10381 /* clear the setuid/setgid bits, if any */
181888fb 10382 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
7c673cae
FG
10383 struct ceph_statx stx = { 0 };
10384
10385 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10386 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10387 if (r < 0)
10388 return r;
10389 } else {
10390 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10391 }
10392
10393 if (f->flags & O_DIRECT)
11fdf7f2 10394 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
7c673cae
FG
10395
10396 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10397
11fdf7f2
TL
10398 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10399
7c673cae
FG
10400 if (in->inline_version < CEPH_INLINE_NONE) {
10401 if (endoff > cct->_conf->client_max_inline_size ||
10402 endoff > CEPH_INLINE_MAX_SIZE ||
10403 !(have & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2
TL
10404 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10405 uninline_data(in, onuninline.get());
7c673cae
FG
10406 } else {
10407 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10408
10409 uint32_t len = in->inline_data.length();
10410
10411 if (endoff < len)
9f95a23c 10412 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
7c673cae
FG
10413
10414 if (offset < len)
10415 in->inline_data.splice(offset, len - offset);
10416 else if (offset > len)
10417 in->inline_data.append_zero(offset - len);
10418
10419 in->inline_data.append(bl);
10420 in->inline_version++;
10421
10422 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10423
10424 goto success;
10425 }
10426 }
10427
11fdf7f2
TL
10428 if (cct->_conf->client_oc &&
10429 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
7c673cae
FG
10430 // do buffered write
10431 if (!in->oset.dirty_or_tx)
10432 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10433
10434 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10435
10436 // async, caching, non-blocking.
10437 r = objectcacher->file_write(&in->oset, &in->layout,
10438 in->snaprealm->get_snap_context(),
10439 offset, size, bl, ceph::real_clock::now(),
10440 0);
10441 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10442
10443 if (r < 0)
10444 goto done;
10445
10446 // flush cached write if O_SYNC is set on file fh
10447 // O_DSYNC == O_SYNC on linux < 2.6.33
10448 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10449 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10450 _flush_range(in, offset, size);
10451 }
10452 } else {
10453 if (f->flags & O_DIRECT)
10454 _flush_range(in, offset, size);
10455
10456 // simple, non-atomic sync write
11fdf7f2 10457 C_SaferCond onfinish("Client::_write flock");
f67539c2 10458 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
7c673cae
FG
10459
10460 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10461 offset, size, bl, ceph::real_clock::now(), 0,
10462 in->truncate_size, in->truncate_seq,
11fdf7f2 10463 &onfinish);
9f95a23c 10464 client_lock.unlock();
f6b5b4d7 10465 r = onfinish.wait();
9f95a23c 10466 client_lock.lock();
f67539c2 10467 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
f6b5b4d7
TL
10468 if (r < 0)
10469 goto done;
7c673cae
FG
10470 }
10471
10472 // if we get here, write was successful, update client metadata
10473success:
a4b75251 10474 update_write_io_size(size);
7c673cae
FG
10475 // time
10476 lat = ceph_clock_now();
10477 lat -= start;
10478 logger->tinc(l_c_wrlat, lat);
10479
f64942e4
AA
10480 if (fpos) {
10481 lock_fh_pos(f);
10482 f->pos = fpos;
10483 unlock_fh_pos(f);
10484 }
7c673cae 10485 totalwritten = size;
11fdf7f2 10486 r = (int64_t)totalwritten;
7c673cae
FG
10487
10488 // extend file?
10489 if (totalwritten + offset > in->size) {
10490 in->size = totalwritten + offset;
28e407b8 10491 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 10492
11fdf7f2 10493 if (is_quota_bytes_approaching(in, f->actor_perms)) {
7c673cae 10494 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
10495 } else if (is_max_size_approaching(in)) {
10496 check_caps(in, 0);
7c673cae
FG
10497 }
10498
10499 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10500 } else {
10501 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10502 }
10503
10504 // mtime
91327a77 10505 in->mtime = in->ctime = ceph_clock_now();
7c673cae 10506 in->change_attr++;
28e407b8 10507 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
10508
10509done:
10510
11fdf7f2 10511 if (nullptr != onuninline) {
9f95a23c 10512 client_lock.unlock();
11fdf7f2 10513 int uninline_ret = onuninline->wait();
9f95a23c 10514 client_lock.lock();
7c673cae 10515
f67539c2 10516 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
7c673cae
FG
10517 in->inline_data.clear();
10518 in->inline_version = CEPH_INLINE_NONE;
28e407b8 10519 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
10520 check_caps(in, 0);
10521 } else
10522 r = uninline_ret;
10523 }
10524
10525 put_cap_ref(in, CEPH_CAP_FILE_WR);
10526 return r;
10527}
10528
10529int Client::_flush(Fh *f)
10530{
10531 Inode *in = f->inode.get();
10532 int err = f->take_async_err();
10533 if (err != 0) {
10534 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10535 << cpp_strerror(err) << dendl;
10536 } else {
10537 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10538 }
10539
10540 return err;
10541}
10542
10543int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10544{
10545 struct ceph_statx stx;
10546 stx.stx_size = length;
10547 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10548}
10549
10550int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10551{
f67539c2
TL
10552 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10553 if (!mref_reader.is_state_satisfied())
10554 return -CEPHFS_ENOTCONN;
10555
11fdf7f2 10556 tout(cct) << __func__ << std::endl;
7c673cae
FG
10557 tout(cct) << fd << std::endl;
10558 tout(cct) << length << std::endl;
10559
f67539c2 10560 std::scoped_lock lock(client_lock);
7c673cae
FG
10561 Fh *f = get_filehandle(fd);
10562 if (!f)
f67539c2 10563 return -CEPHFS_EBADF;
7c673cae
FG
10564#if defined(__linux__) && defined(O_PATH)
10565 if (f->flags & O_PATH)
f67539c2 10566 return -CEPHFS_EBADF;
7c673cae 10567#endif
adb31ebb 10568 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 10569 return -CEPHFS_EBADF;
7c673cae
FG
10570 struct stat attr;
10571 attr.st_size = length;
10572 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10573}
10574
// fsync(2): flush dirty data (and, unless syncdataonly, metadata) for an
// open fd to the cluster. Also surfaces any asynchronous write error that
// was recorded on the file handle.
int Client::fsync(int fd, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors cannot be used for I/O.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
10611
// Core fsync implementation for an inode. Flushes buffered file data
// (via the object cacher when client_oc is enabled), optionally flushes
// dirty caps and waits for unsafe MDS requests, then waits for everything
// to commit. Must be entered with client_lock held; the lock is dropped
// while waiting on the object cacher completion.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off a writeback of all buffered data for this inode; we wait
    // on the completion further down, after the cap/unsafe-op handling.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty metadata (caps) to the MDS and remember the flush tid
    // so we can wait for it below.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Force the MDS to commit its log, then wait until the most recent
    // unsafe request on this inode becomes safe (which implies all
    // earlier ones are safe too).
    flush_mdlog_sync(in);

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    // Drop the client lock while blocking on writeback.
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // Data is safe; now wait for the cap flush (if any) to be acked.
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  // Record fsync latency.
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10680
10681int Client::_fsync(Fh *f, bool syncdataonly)
10682{
1adf2230 10683 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
10684 return _fsync(f->inode.get(), syncdataonly);
10685}
10686
10687int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10688{
f67539c2
TL
10689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10690 if (!mref_reader.is_state_satisfied())
10691 return -CEPHFS_ENOTCONN;
10692
7c673cae
FG
10693 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10694 tout(cct) << fd << std::endl;
10695
f67539c2 10696 std::scoped_lock lock(client_lock);
7c673cae
FG
10697 Fh *f = get_filehandle(fd);
10698 if (!f)
f67539c2 10699 return -CEPHFS_EBADF;
7c673cae
FG
10700 int r = _getattr(f->inode, mask, perms);
10701 if (r < 0)
10702 return r;
10703 fill_stat(f->inode, stbuf, NULL);
1adf2230 10704 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
10705 return r;
10706}
10707
10708int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10709 unsigned int want, unsigned int flags)
10710{
f67539c2
TL
10711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10712 if (!mref_reader.is_state_satisfied())
10713 return -CEPHFS_ENOTCONN;
10714
7c673cae
FG
10715 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10716 tout(cct) << fd << std::endl;
10717
f67539c2 10718 std::scoped_lock lock(client_lock);
7c673cae
FG
10719 Fh *f = get_filehandle(fd);
10720 if (!f)
f67539c2 10721 return -CEPHFS_EBADF;
7c673cae
FG
10722
10723 unsigned mask = statx_to_mask(flags, want);
10724
10725 int r = 0;
b3b6e05e 10726 if (mask) {
7c673cae
FG
10727 r = _getattr(f->inode, mask, perms);
10728 if (r < 0) {
10729 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10730 return r;
10731 }
10732 }
10733
10734 fill_statx(f->inode, mask, stx);
10735 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10736 return r;
10737}
10738
b3b6e05e
TL
// statx relative to a directory fd (fstatat-style). `relpath` is resolved
// against `dirfd`; AT_SYMLINK_NOFOLLOW in `flags` controls whether a final
// symlink is followed.
int Client::statxat(int dirfd, const char *relpath,
                    struct ceph_statx *stx, const UserPerm& perms,
                    unsigned int want, unsigned int flags) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  unsigned mask = statx_to_mask(flags, want);

  // Resolve the directory fd to its inode; used as the walk root below.
  InodeRef dirinode;
  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  InodeRef in;
  filepath path(relpath);
  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
  if (r < 0) {
    return r;
  }
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
  return r;
}
10776
7c673cae
FG
10777// not written yet, but i want to link!
10778
// Change the client's current working directory to `relpath` and return
// the new absolute cwd through `new_cwd`. Fails with -CEPHFS_ENOTDIR if
// the target is not a directory.
int Client::chdir(const char *relpath, std::string &new_cwd,
                  const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  // Only directories may become the cwd.
  if (!(in.get()->is_dir()))
    return -CEPHFS_ENOTDIR;

  // Swap rather than assign to avoid an extra ref-count churn.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
10807
// Build the absolute path of the cwd by walking dentries up to the mount
// root. If a parent link is missing, issue a LOOKUPNAME to the MDS and
// restart the walk from scratch. Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked: no path can be built, leave `dir`
    // untouched.
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
        break;

      // start over: the lookup may have populated dentries for several
      // ancestors, so rebuild the path from the cwd again.
      path = filepath();
      in = cwd.get();
      continue;
    }
    // Prepend this component and step up to the parent directory.
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10847
b5b8bbf5
FG
10848void Client::getcwd(string& dir, const UserPerm& perms)
10849{
f67539c2
TL
10850 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10851 if (!mref_reader.is_state_satisfied())
10852 return;
10853
10854 std::scoped_lock l(client_lock);
10855
10856 _getcwd(dir, perms);
b5b8bbf5
FG
10857}
10858
7c673cae
FG
// statvfs-style filesystem statistics. Space figures come either from the
// RADOS cluster stats or, when a byte quota is set on the mount's quota
// root and client_quota_df is enabled, from that quota. The client lock
// is dropped while waiting for the objecter reply.
int Client::statfs(const char *path, struct statvfs *stbuf,
                   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool we can ask for that pool's stats; otherwise
  // fall back to whole-cluster statistics.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  // Drop the client lock while blocking on the objecter reply.
  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  ceph_assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
10962
// Send a file-lock operation (get/set, fcntl or flock flavor) to the MDS
// and, on success, mirror the result into the client-side lock state.
// `sleep` requests a blocking lock; such requests are made interruptible
// via switch_interrupt_cb. `removing` is set when called from
// _release_filelocks, in which case the per-Fh state is not updated.
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Lock state on this inode is already known-bad (e.g. after session loss).
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  // Translate POSIX lock types into CEPH_LOCK_* commands.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only blocking SETFILELOCK acquisitions may wait on the MDS.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // GETFILELOCK: decode the conflicting lock (if any) from the reply
      // and report it back through *fl, POSIX F_GETLK style.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // SETFILELOCK succeeded: mirror the change into the inode-level
      // lock state (lazily allocated per flavor)...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
        lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
        lock_state = in->flock_locks.get();
      } else {
        ceph_abort();
        return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ...and, unless we are tearing down the handle, into the per-Fh
      // lock state as well.
      if (!removing) {
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
          lock_state = fh->fcntl_locks.get();
        } else {
          if (!fh->flock_locks)
            fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
          lock_state = fh->flock_locks.get();
        }
        _update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11076
// Interrupt a pending (blocking) file-lock request: mark it aborted and,
// if it has already been sent to an MDS, issue a matching *_INTR unlock
// so the MDS drops the waiter.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Map the original lock flavor to its interrupt counterpart.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // Build an unlock request that mirrors the original's arguments but
  // carries the interrupt rule.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Act with the credentials of the original requester.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11109
11110void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11111{
11112 if (!in->fcntl_locks && !in->flock_locks)
11113 return;
11114
11115 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 11116 encode(nr_fcntl_locks, bl);
7c673cae 11117 if (nr_fcntl_locks) {
11fdf7f2 11118 auto &lock_state = in->fcntl_locks;
20effc67 11119 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11120 p != lock_state->held_locks.end();
11121 ++p)
11fdf7f2 11122 encode(p->second, bl);
7c673cae
FG
11123 }
11124
11125 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 11126 encode(nr_flock_locks, bl);
7c673cae 11127 if (nr_flock_locks) {
11fdf7f2 11128 auto &lock_state = in->flock_locks;
20effc67 11129 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11130 p != lock_state->held_locks.end();
11131 ++p)
11fdf7f2 11132 encode(p->second, bl);
7c673cae
FG
11133 }
11134
11fdf7f2 11135 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
11136 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11137}
11138
// Release every lock this file handle holds. If the inode is flagged
// I_ERROR_FILELOCK the locks are dropped locally only; otherwise explicit
// unlock requests are sent to the MDS after the local state is collected.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  // (flavor, lock) pairs to unlock at the MDS once local state is gone.
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // Advance the iterator before possibly erasing the current entry.
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
        lock_state->remove_lock(q->second, activated_locks);
      } else {
        to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
        lock_state->remove_lock(q->second, activated_locks);
      } else {
        to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Once the last erroneous lock is gone, the inode's error flag clears.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  // Issue one unlock per recorded lock; `removing=true` tells
  // _do_filelock not to touch the (already reset) per-Fh state.
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
                 p->second.owner, true);
  }
}
11197
11198void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11199 ceph_lock_state_t *lock_state)
11200{
11201 int lock_cmd;
11202 if (F_RDLCK == fl->l_type)
11203 lock_cmd = CEPH_LOCK_SHARED;
11204 else if (F_WRLCK == fl->l_type)
11205 lock_cmd = CEPH_LOCK_EXCL;
11206 else
11207 lock_cmd = CEPH_LOCK_UNLOCK;;
11208
11209 ceph_filelock filelock;
11210 filelock.start = fl->l_start;
11211 filelock.length = fl->l_len;
11212 filelock.client = 0;
11213 // see comment in _do_filelock()
11214 filelock.owner = owner | (1ULL << 63);
11215 filelock.pid = fl->l_pid;
11216 filelock.type = lock_cmd;
11217
11218 if (filelock.type == CEPH_LOCK_UNLOCK) {
11219 list<ceph_filelock> activated_locks;
11220 lock_state->remove_lock(filelock, activated_locks);
11221 } else {
11222 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 11223 ceph_assert(r);
7c673cae
FG
11224 }
11225}
11226
11227int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11228{
11229 Inode *in = fh->inode.get();
11230 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11231 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11232 return ret;
11233}
11234
11235int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11236{
11237 Inode *in = fh->inode.get();
11238 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11239 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11240 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11241 return ret;
11242}
11243
11244int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11245{
11246 Inode *in = fh->inode.get();
11247 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11248
11249 int sleep = !(cmd & LOCK_NB);
11250 cmd &= ~LOCK_NB;
11251
11252 int type;
11253 switch (cmd) {
11254 case LOCK_SH:
11255 type = F_RDLCK;
11256 break;
11257 case LOCK_EX:
11258 type = F_WRLCK;
11259 break;
11260 case LOCK_UN:
11261 type = F_UNLCK;
11262 break;
11263 default:
f67539c2 11264 return -CEPHFS_EINVAL;
7c673cae
FG
11265 }
11266
11267 struct flock fl;
11268 memset(&fl, 0, sizeof(fl));
11269 fl.l_type = type;
11270 fl.l_whence = SEEK_SET;
11271
11272 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11273 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11274 return ret;
11275}
11276
f67539c2
TL
11277int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11278 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11279 if (!mref_reader.is_state_satisfied()) {
11280 return -CEPHFS_ENOTCONN;
11281 }
11282
20effc67 11283 std::scoped_lock lock(client_lock);
f67539c2
TL
11284 InodeRef in;
11285 int r = Client::path_walk(path, &in, perms, true);
11286 if (r < 0) {
11287 return r;
11288 }
11289
11290 if (in->snapid == CEPH_NOSNAP) {
11291 return -CEPHFS_EINVAL;
11292 }
11293
11294 snap_info->id = in->snapid;
11295 snap_info->metadata = in->snap_metadata;
11296 return 0;
11297}
11298
11299int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11300{
11301 /* Since the only thing this does is wrap a call to statfs, and
11302 statfs takes a lock, it doesn't seem we have a need to split it
11303 out. */
7c673cae
FG
11304 return statfs(0, stbuf, perms);
11305}
11306
// Install the caller-supplied callback set and start the finisher thread
// backing each one. Each callback is optional; a null entry leaves the
// corresponding hook (and its finisher) untouched.
void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;

  ldout(cct, 10) << __func__ << " cb " << args->handle
                 << " invalidate_ino_cb " << args->ino_cb
                 << " invalidate_dentry_cb " << args->dentry_cb
                 << " switch_interrupt_cb " << args->switch_intr_cb
                 << " remount_cb " << args->remount_cb
                 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  // umask_cb is invoked inline, so no finisher thread is needed for it.
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
11342
20effc67
TL
11343// This is deprecated, use ll_register_callbacks2() instead.
11344void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11345{
11346 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11347
11348 _ll_register_callbacks(args);
11349}
11350
11351int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11352{
11353 if (is_mounting() || is_mounted() || is_unmounting())
11354 return -CEPHFS_EBUSY;
11355
11356 _ll_register_callbacks(args);
11357 return 0;
11358}
11359
7c673cae
FG
// Verify that a dentry-invalidation mechanism is available: either the
// registered dentry_invalidate_cb (when can_invalidate) or a test remount
// via remount_cb. Asserts that the chosen callback was registered.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Remember which strategy this client will use from now on.
  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else {
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    // Exercise the remount path once to prove it works.
    r = _do_remount(false);
  }

  return r;
}
11382
// Flush all dirty data and caps for the whole mount and wait for them to
// commit. Must be entered with client_lock held; the lock is dropped while
// waiting on the object cacher flush.
int Client::_sync_fs()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  // Wait for the object cacher flush last, with the client lock dropped.
  if (nullptr != cond) {
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
11415
11416int Client::sync_fs()
11417{
f67539c2
TL
11418 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11419 if (!mref_reader.is_state_satisfied())
11420 return -CEPHFS_ENOTCONN;
181888fb 11421
f67539c2 11422 std::scoped_lock l(client_lock);
181888fb 11423
7c673cae
FG
11424 return _sync_fs();
11425}
11426
11427int64_t Client::drop_caches()
11428{
f67539c2 11429 std::scoped_lock l(client_lock);
7c673cae
FG
11430 return objectcacher->release_all();
11431}
11432
11fdf7f2
TL
11433int Client::_lazyio(Fh *fh, int enable)
11434{
11435 Inode *in = fh->inode.get();
11436 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11437
11438 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11439 return 0;
11440
11441 int orig_mode = fh->mode;
11442 if (enable) {
11443 fh->mode |= CEPH_FILE_MODE_LAZY;
11444 in->get_open_ref(fh->mode);
11445 in->put_open_ref(orig_mode);
11446 check_caps(in, CHECK_CAPS_NODELAY);
11447 } else {
11448 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11449 in->get_open_ref(fh->mode);
11450 in->put_open_ref(orig_mode);
11451 check_caps(in, 0);
11452 }
11453
11454 return 0;
11455}
11456
11457int Client::lazyio(int fd, int enable)
11458{
f67539c2 11459 std::scoped_lock l(client_lock);
11fdf7f2
TL
11460 Fh *f = get_filehandle(fd);
11461 if (!f)
f67539c2 11462 return -CEPHFS_EBADF;
11fdf7f2
TL
11463
11464 return _lazyio(f, enable);
11465}
11466
11467int Client::ll_lazyio(Fh *fh, int enable)
11468{
11fdf7f2
TL
11469 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11470 tout(cct) << __func__ << std::endl;
11471
f67539c2 11472 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11473 return _lazyio(fh, enable);
11474}
7c673cae 11475
/**
 * Propagate locally buffered writes of a lazy-io fd to the OSDs.
 *
 * The (offset, count) range is currently ignored: the whole file is fsynced
 * instead (see the "for now" note below).
 *
 * @return 0, or -CEPHFS_EBADF for an unknown fd.
 */
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // for now
  // NOTE(review): _fsync's return value is intentionally dropped here and 0
  // is returned regardless — presumably best-effort semantics; confirm
  // before propagating the error to callers.
  _fsync(f, true);

  return 0;
}
11491
/**
 * Synchronize a lazy-io fd with the OSDs: flush local changes and drop
 * cached data so subsequent reads see other clients' writes.
 *
 * The (offset, count) range is currently ignored — the whole file is
 * flushed and released.
 *
 * @return 0 on success, -CEPHFS_EBADF for an unknown fd, or a negative
 *         error from the size refresh.
 */
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // NOTE(review): _fsync's return value is ignored (matches
  // lazyio_propagate's best-effort style).
  _fsync(f, true);
  // If cached data was actually dropped, refresh the file size from the MDS
  // so the next read uses up-to-date metadata.
  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }
  return 0;
}
11511
11512
11513// =============================
11514// snaps
11515
f67539c2
TL
11516int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11517 mode_t mode, const std::map<std::string, std::string> &metadata)
7c673cae 11518{
f67539c2
TL
11519 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11520 if (!mref_reader.is_state_satisfied())
11521 return -CEPHFS_ENOTCONN;
181888fb 11522
f67539c2 11523 std::scoped_lock l(client_lock);
181888fb 11524
7c673cae
FG
11525 filepath path(relpath);
11526 InodeRef in;
11527 int r = path_walk(path, &in, perm);
11528 if (r < 0)
11529 return r;
11530 if (cct->_conf->client_permissions) {
11531 r = may_create(in.get(), perm);
11532 if (r < 0)
11533 return r;
11534 }
11535 Inode *snapdir = open_snapdir(in.get());
f67539c2 11536 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
7c673cae 11537}
181888fb 11538
f67539c2 11539int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
7c673cae 11540{
f67539c2
TL
11541 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11542 if (!mref_reader.is_state_satisfied())
11543 return -CEPHFS_ENOTCONN;
181888fb 11544
f67539c2 11545 std::scoped_lock l(client_lock);
181888fb 11546
7c673cae
FG
11547 filepath path(relpath);
11548 InodeRef in;
11549 int r = path_walk(path, &in, perms);
11550 if (r < 0)
11551 return r;
f67539c2 11552 Inode *snapdir = open_snapdir(in.get());
7c673cae 11553 if (cct->_conf->client_permissions) {
f67539c2 11554 r = may_delete(snapdir, check_perms ? name : NULL, perms);
7c673cae
FG
11555 if (r < 0)
11556 return r;
11557 }
7c673cae
FG
11558 return _rmdir(snapdir, name, perms);
11559}
11560
11561// =============================
11562// expose caps
11563
f67539c2
TL
11564int Client::get_caps_issued(int fd)
11565{
11566 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11567 if (!mref_reader.is_state_satisfied())
11568 return -CEPHFS_ENOTCONN;
7c673cae 11569
f67539c2 11570 std::scoped_lock lock(client_lock);
181888fb 11571
7c673cae
FG
11572 Fh *f = get_filehandle(fd);
11573 if (!f)
f67539c2 11574 return -CEPHFS_EBADF;
7c673cae
FG
11575
11576 return f->inode->caps_issued();
11577}
11578
11579int Client::get_caps_issued(const char *path, const UserPerm& perms)
11580{
f67539c2
TL
11581 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11582 if (!mref_reader.is_state_satisfied())
11583 return -CEPHFS_ENOTCONN;
181888fb 11584
f67539c2 11585 std::scoped_lock lock(client_lock);
181888fb 11586
7c673cae
FG
11587 filepath p(path);
11588 InodeRef in;
11589 int r = path_walk(p, &in, perms, true);
11590 if (r < 0)
11591 return r;
11592 return in->caps_issued();
11593}
11594
11595// =========================================
11596// low level
11597
/**
 * Return the synthetic ".snap" directory inode for @a diri, creating and
 * caching it on first use.
 *
 * The snapdir shares the parent's ino but uses snapid CEPH_SNAPDIR, and
 * mirrors a subset of the parent's attributes at creation time.
 *
 * @param diri the real (non-snap) directory inode
 * @return the snapdir inode (owned by inode_map; no extra ref taken here)
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Clone the visible attributes from the parent directory.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->atime = diri->atime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    // Link snapdir <-> parent and flag the parent so teardown knows a
    // snapdir exists for it.
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
11631
/**
 * Low-level lookup of @a name under @a parent.
 *
 * On success fills @a attr, takes an ll reference on the resulting inode
 * (released later via ll_forget/ll_put) and stores it in @a *out.  On
 * failure attr->st_ino is zeroed and *out is set to NULL (in is empty).
 *
 * @return 0 on success or a negative error code.
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  // If the upper layer (fuse) is not enforcing permissions, do it here.
  // "." and ".." are exempt from the lookup permission check.
  if (!fuse_default_permissions) {
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  // Pin the inode for the caller; balanced by ll_forget()/ll_put().
  _ll_get(in.get());

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	    << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
11675
f67539c2
TL
/**
 * Look up an inode by vinodeno (ino + snapid), consulting the local cache
 * first and falling back to an MDS lookup.
 *
 * Snapdir vinos are resolved by looking up the head (non-snap) inode and
 * then opening its snapdir.  On success an ll reference is taken on the
 * returned inode.
 *
 * @return 0 on success, -CEPHFS_ENOTCONN, -CEPHFS_ESTALE for reserved
 *         vinos, or a lookup error.
 */
int Client::ll_lookup_vino(
    vinodeno_t vino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  std::scoped_lock lock(client_lock);
  ldout(cct, 3) << __func__ << " " << vino << dendl;

  // Check the cache first
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p != inode_map.end()) {
    *inode = p->second;
    _ll_get(*inode);
    return 0;
  }

  // Remember whether the caller asked for a snapdir before we rewrite the
  // snapid for the MDS lookup.
  uint64_t snapid = vino.snapid;

  // for snapdir, find the non-snapped dir inode
  if (snapid == CEPH_SNAPDIR)
    vino.snapid = CEPH_NOSNAP;

  int r = _lookup_vino(vino, perms, inode);
  if (r)
    return r;
  ceph_assert(*inode != NULL);

  if (snapid == CEPH_SNAPDIR) {
    Inode *tmp = *inode;

    // open the snapdir and put the inode ref: the ref taken on the head
    // inode by the lookup is exchanged for one on the snapdir.
    *inode = open_snapdir(tmp);
    _ll_forget(tmp, 1);
    _ll_get(*inode);
  }
  return 0;
}
11721
f67539c2
TL
11722int Client::ll_lookup_inode(
11723 struct inodeno_t ino,
11724 const UserPerm& perms,
11725 Inode **inode)
11726{
11727 vinodeno_t vino(ino, CEPH_NOSNAP);
11728 return ll_lookup_vino(vino, perms, inode);
11729}
11730
7c673cae
FG
/**
 * statx-flavoured low-level lookup of @a name under @a parent.
 *
 * Like ll_lookup() but fills a ceph_statx limited to the mask derived from
 * (want, flags).  On success an ll reference is taken on the result; on
 * failure stx ino/mask are zeroed and *out is NULL.
 *
 * @return 0 on success or a negative error code.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  // Enforce lookup permission here when fuse isn't doing it for us.
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    // Pin the inode for the caller; balanced by ll_forget()/ll_put().
    _ll_get(in.get());
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	    << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
11773
11774int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11775 unsigned int want, unsigned int flags, const UserPerm& perms)
11776{
f67539c2
TL
11777 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11778 if (!mref_reader.is_state_satisfied())
11779 return -CEPHFS_ENOTCONN;
181888fb 11780
7c673cae
FG
11781 filepath fp(name, 0);
11782 InodeRef in;
11783 int rc;
11784 unsigned mask = statx_to_mask(flags, want);
11785
11fdf7f2
TL
11786 ldout(cct, 3) << __func__ << " " << name << dendl;
11787 tout(cct) << __func__ << std::endl;
7c673cae
FG
11788 tout(cct) << name << std::endl;
11789
f67539c2 11790 std::scoped_lock lock(client_lock);
7c673cae
FG
11791 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11792 if (rc < 0) {
11793 /* zero out mask, just in case... */
11794 stx->stx_mask = 0;
11795 stx->stx_ino = 0;
11796 *out = NULL;
11797 return rc;
11798 } else {
11fdf7f2 11799 ceph_assert(in);
7c673cae
FG
11800 fill_statx(in, mask, stx);
11801 _ll_get(in.get());
11802 *out = in.get();
11803 return 0;
11804 }
11805}
11806
/**
 * Take an ll (low-level API) reference on @a in.
 *
 * On the 0 -> 1 transition this also pins the inode itself, pins the single
 * parent dentry of a directory, and bumps the per-snapshot refcount.
 * Caller holds client_lock.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    // first ll ref: pin the inode for the lifetime of the ll refs
    in->iget();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    // track how many ll-pinned inodes belong to each snapshot
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11821
/**
 * Drop @a num ll references from @a in.
 *
 * Mirrors _ll_get(): on the final drop the parent dentry of a directory is
 * unpinned, the per-snapshot refcount is decremented, and the inode ref is
 * released.  Caller holds client_lock.
 *
 * @return 0 if this was the last ll ref, otherwise the remaining count.
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    // drop this inode's contribution to its snapshot's ll refcount
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11844
/**
 * Drop every outstanding ll reference in the inode map (used during
 * unmount/teardown).
 *
 * Iterating with a saved 'next' iterator because _ll_put() can erase the
 * current inode from inode_map.  to_be_put keeps each inode alive until the
 * loop is done; its destructor releases them one by one on exit.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // advance before _ll_put() can invalidate 'it'
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11862
/**
 * Drop @a count ll references from @a in (low-level "forget").
 *
 * Forgets on an unmounted client and on the root inode are ignored (treated
 * as "last ref dropped").  A count exceeding the live refcount is clamped
 * with a warning.  Caller holds client_lock.
 *
 * @return true if no ll references remain afterwards.
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // over-forget: clamp to the live refcount rather than underflowing
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11892
494da23a 11893bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 11894{
f67539c2 11895 std::scoped_lock lock(client_lock);
1adf2230
AA
11896 return _ll_forget(in, count);
11897}
11898
7c673cae
FG
/**
 * Drop a single ll reference on @a in.
 *
 * @return true if no ll references remain afterwards.
 */
bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
11904
11fdf7f2
TL
11905int Client::ll_get_snap_ref(snapid_t snap)
11906{
f67539c2 11907 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11908 auto p = ll_snap_ref.find(snap);
11909 if (p != ll_snap_ref.end())
11910 return p->second;
11911 return 0;
11912}
11913
7c673cae
FG
11914snapid_t Client::ll_get_snapid(Inode *in)
11915{
f67539c2 11916 std::scoped_lock lock(client_lock);
7c673cae
FG
11917 return in->snapid;
11918}
11919
11920Inode *Client::ll_get_inode(ino_t ino)
11921{
f67539c2
TL
11922 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11923 if (!mref_reader.is_state_satisfied())
181888fb
FG
11924 return NULL;
11925
f67539c2
TL
11926 std::scoped_lock lock(client_lock);
11927
7c673cae
FG
11928 vinodeno_t vino = _map_faked_ino(ino);
11929 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11930 if (p == inode_map.end())
11931 return NULL;
11932 Inode *in = p->second;
11933 _ll_get(in);
11934 return in;
11935}
11936
11937Inode *Client::ll_get_inode(vinodeno_t vino)
11938{
f67539c2
TL
11939 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11940 if (!mref_reader.is_state_satisfied())
181888fb
FG
11941 return NULL;
11942
b3b6e05e
TL
11943 if (is_reserved_vino(vino))
11944 return NULL;
11945
f67539c2
TL
11946 std::scoped_lock lock(client_lock);
11947
7c673cae
FG
11948 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11949 if (p == inode_map.end())
11950 return NULL;
11951 Inode *in = p->second;
11952 _ll_get(in);
11953 return in;
11954}
11955
11956int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11957{
11958 vinodeno_t vino = _get_vino(in);
11959
11fdf7f2
TL
11960 ldout(cct, 8) << __func__ << " " << vino << dendl;
11961 tout(cct) << __func__ << std::endl;
7c673cae
FG
11962 tout(cct) << vino.ino.val << std::endl;
11963
11964 if (vino.snapid < CEPH_NOSNAP)
11965 return 0;
11966 else
11967 return _getattr(in, caps, perms);
11968}
11969
11970int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11971{
f67539c2
TL
11972 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11973 if (!mref_reader.is_state_satisfied())
11974 return -CEPHFS_ENOTCONN;
7c673cae 11975
f67539c2 11976 std::scoped_lock lock(client_lock);
181888fb 11977
7c673cae
FG
11978 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11979
11980 if (res == 0)
11981 fill_stat(in, attr);
11fdf7f2 11982 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11983 return res;
11984}
11985
11986int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11987 unsigned int flags, const UserPerm& perms)
11988{
f67539c2
TL
11989 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11990 if (!mref_reader.is_state_satisfied())
11991 return -CEPHFS_ENOTCONN;
7c673cae 11992
f67539c2 11993 std::scoped_lock lock(client_lock);
181888fb 11994
7c673cae
FG
11995 int res = 0;
11996 unsigned mask = statx_to_mask(flags, want);
11997
94b18763 11998 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
11999 res = _ll_getattr(in, mask, perms);
12000
12001 if (res == 0)
12002 fill_statx(in, mask, stx);
11fdf7f2 12003 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12004 return res;
12005}
12006
/**
 * Internal setattr helper for the ll_* entry points.
 *
 * Performs the permission check (when fuse isn't doing it), strips the
 * *_NOW convenience bits (the corresponding timestamps were already filled
 * in by the caller), and delegates to __setattrx().  Caller holds
 * client_lock.
 *
 * @param inp receives a ref to the inode actually modified
 * @return 0 on success or a negative error code.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // the *_NOW bits are a caller convenience; drop them before the real call
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
12035
12036int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12037 const UserPerm& perms)
12038{
f67539c2
TL
12039 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12040 if (!mref_reader.is_state_satisfied())
12041 return -CEPHFS_ENOTCONN;
181888fb 12042
f67539c2 12043 std::scoped_lock lock(client_lock);
181888fb 12044
7c673cae
FG
12045 InodeRef target(in);
12046 int res = _ll_setattrx(in, stx, mask, perms, &target);
12047 if (res == 0) {
11fdf7f2 12048 ceph_assert(in == target.get());
7c673cae
FG
12049 fill_statx(in, in->caps_issued(), stx);
12050 }
12051
11fdf7f2 12052 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12053 return res;
12054}
12055
12056int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12057 const UserPerm& perms)
12058{
12059 struct ceph_statx stx;
12060 stat_to_statx(attr, &stx);
12061
f67539c2
TL
12062 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12063 if (!mref_reader.is_state_satisfied())
12064 return -CEPHFS_ENOTCONN;
181888fb 12065
f67539c2 12066 std::scoped_lock lock(client_lock);
181888fb 12067
7c673cae
FG
12068 InodeRef target(in);
12069 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12070 if (res == 0) {
11fdf7f2 12071 ceph_assert(in == target.get());
7c673cae
FG
12072 fill_stat(in, attr);
12073 }
12074
11fdf7f2 12075 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12076 return res;
12077}
12078
12079
12080// ----------
12081// xattrs
12082
12083int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12084 const UserPerm& perms)
12085{
f67539c2
TL
12086 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12087 if (!mref_reader.is_state_satisfied())
12088 return -CEPHFS_ENOTCONN;
181888fb 12089
f67539c2 12090 std::scoped_lock lock(client_lock);
181888fb 12091
7c673cae
FG
12092 InodeRef in;
12093 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12094 if (r < 0)
12095 return r;
12096 return _getxattr(in, name, value, size, perms);
12097}
12098
12099int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12100 const UserPerm& perms)
12101{
f67539c2
TL
12102 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12103 if (!mref_reader.is_state_satisfied())
12104 return -CEPHFS_ENOTCONN;
181888fb 12105
f67539c2 12106 std::scoped_lock lock(client_lock);
181888fb 12107
7c673cae
FG
12108 InodeRef in;
12109 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12110 if (r < 0)
12111 return r;
12112 return _getxattr(in, name, value, size, perms);
12113}
12114
12115int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12116 const UserPerm& perms)
12117{
f67539c2
TL
12118 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12119 if (!mref_reader.is_state_satisfied())
12120 return -CEPHFS_ENOTCONN;
181888fb 12121
f67539c2 12122 std::scoped_lock lock(client_lock);
181888fb 12123
7c673cae
FG
12124 Fh *f = get_filehandle(fd);
12125 if (!f)
f67539c2 12126 return -CEPHFS_EBADF;
7c673cae
FG
12127 return _getxattr(f->inode, name, value, size, perms);
12128}
12129
12130int Client::listxattr(const char *path, char *list, size_t size,
12131 const UserPerm& perms)
12132{
f67539c2
TL
12133 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12134 if (!mref_reader.is_state_satisfied())
12135 return -CEPHFS_ENOTCONN;
181888fb 12136
f67539c2 12137 std::scoped_lock lock(client_lock);
181888fb 12138
7c673cae
FG
12139 InodeRef in;
12140 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12141 if (r < 0)
12142 return r;
12143 return Client::_listxattr(in.get(), list, size, perms);
12144}
12145
12146int Client::llistxattr(const char *path, char *list, size_t size,
12147 const UserPerm& perms)
12148{
f67539c2
TL
12149 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12150 if (!mref_reader.is_state_satisfied())
12151 return -CEPHFS_ENOTCONN;
181888fb 12152
f67539c2 12153 std::scoped_lock lock(client_lock);
181888fb 12154
7c673cae
FG
12155 InodeRef in;
12156 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12157 if (r < 0)
12158 return r;
12159 return Client::_listxattr(in.get(), list, size, perms);
12160}
12161
12162int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12163{
f67539c2
TL
12164 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12165 if (!mref_reader.is_state_satisfied())
12166 return -CEPHFS_ENOTCONN;
181888fb 12167
f67539c2 12168 std::scoped_lock lock(client_lock);
181888fb 12169
7c673cae
FG
12170 Fh *f = get_filehandle(fd);
12171 if (!f)
f67539c2 12172 return -CEPHFS_EBADF;
7c673cae
FG
12173 return Client::_listxattr(f->inode.get(), list, size, perms);
12174}
12175
12176int Client::removexattr(const char *path, const char *name,
12177 const UserPerm& perms)
12178{
f67539c2
TL
12179 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12180 if (!mref_reader.is_state_satisfied())
12181 return -CEPHFS_ENOTCONN;
181888fb 12182
f67539c2 12183 std::scoped_lock lock(client_lock);
181888fb 12184
7c673cae
FG
12185 InodeRef in;
12186 int r = Client::path_walk(path, &in, perms, true);
12187 if (r < 0)
12188 return r;
12189 return _removexattr(in, name, perms);
12190}
12191
12192int Client::lremovexattr(const char *path, const char *name,
12193 const UserPerm& perms)
12194{
f67539c2
TL
12195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12196 if (!mref_reader.is_state_satisfied())
12197 return -CEPHFS_ENOTCONN;
181888fb 12198
f67539c2 12199 std::scoped_lock lock(client_lock);
181888fb 12200
7c673cae
FG
12201 InodeRef in;
12202 int r = Client::path_walk(path, &in, perms, false);
12203 if (r < 0)
12204 return r;
12205 return _removexattr(in, name, perms);
12206}
12207
12208int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12209{
f67539c2
TL
12210 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12211 if (!mref_reader.is_state_satisfied())
12212 return -CEPHFS_ENOTCONN;
181888fb 12213
f67539c2 12214 std::scoped_lock lock(client_lock);
181888fb 12215
7c673cae
FG
12216 Fh *f = get_filehandle(fd);
12217 if (!f)
f67539c2 12218 return -CEPHFS_EBADF;
7c673cae
FG
12219 return _removexattr(f->inode, name, perms);
12220}
12221
12222int Client::setxattr(const char *path, const char *name, const void *value,
12223 size_t size, int flags, const UserPerm& perms)
12224{
f67539c2
TL
12225 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12226 if (!mref_reader.is_state_satisfied())
12227 return -CEPHFS_ENOTCONN;
12228
7c673cae
FG
12229 _setxattr_maybe_wait_for_osdmap(name, value, size);
12230
f67539c2 12231 std::scoped_lock lock(client_lock);
181888fb 12232
7c673cae
FG
12233 InodeRef in;
12234 int r = Client::path_walk(path, &in, perms, true);
12235 if (r < 0)
12236 return r;
12237 return _setxattr(in, name, value, size, flags, perms);
12238}
12239
12240int Client::lsetxattr(const char *path, const char *name, const void *value,
12241 size_t size, int flags, const UserPerm& perms)
12242{
f67539c2
TL
12243 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12244 if (!mref_reader.is_state_satisfied())
12245 return -CEPHFS_ENOTCONN;
7c673cae 12246
f67539c2 12247 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12248
f67539c2 12249 std::scoped_lock lock(client_lock);
181888fb 12250
7c673cae
FG
12251 InodeRef in;
12252 int r = Client::path_walk(path, &in, perms, false);
12253 if (r < 0)
12254 return r;
12255 return _setxattr(in, name, value, size, flags, perms);
12256}
12257
12258int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12259 int flags, const UserPerm& perms)
12260{
f67539c2
TL
12261 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12262 if (!mref_reader.is_state_satisfied())
12263 return -CEPHFS_ENOTCONN;
7c673cae 12264
f67539c2 12265 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12266
f67539c2 12267 std::scoped_lock lock(client_lock);
181888fb 12268
7c673cae
FG
12269 Fh *f = get_filehandle(fd);
12270 if (!f)
f67539c2 12271 return -CEPHFS_EBADF;
7c673cae
FG
12272 return _setxattr(f->inode, name, value, size, flags, perms);
12273}
12274
/**
 * Core getxattr implementation.
 *
 * Virtual "ceph.*" attributes are dispatched through the matching VXattr
 * callback (after forcing a fresh getattr so the reported values are
 * current); everything else is served from the inode's cached xattr map,
 * refreshed from the MDS when necessary.
 *
 * @param value output buffer; with size == 0 only the length is returned
 * @return attribute length / bytes copied, or a negative error
 *         (-CEPHFS_ENODATA, -CEPHFS_ERANGE, -CEPHFS_EOPNOTSUPP, ...).
 */
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -CEPHFS_ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    // (skip when an exists_cb is present and reports the vxattr absent)
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -CEPHFS_ENODATA;
    }

    // size == 0 means the caller only wants the length
    if (size != 0) {
      if (r > (int)size) {
	r = -CEPHFS_ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // without ACL support, "system.*" attributes are not available
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -CEPHFS_EOPNOTSUPP;
    goto out;
  }

  // refresh cached xattrs from the MDS only if we never fetched them
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -CEPHFS_ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -CEPHFS_ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
12340
12341int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12342 const UserPerm& perms)
12343{
12344 if (cct->_conf->client_permissions) {
12345 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12346 if (r < 0)
12347 return r;
12348 }
12349 return _getxattr(in.get(), name, value, size, perms);
12350}
12351
12352int Client::ll_getxattr(Inode *in, const char *name, void *value,
12353 size_t size, const UserPerm& perms)
12354{
f67539c2
TL
12355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12356 if (!mref_reader.is_state_satisfied())
12357 return -CEPHFS_ENOTCONN;
181888fb 12358
7c673cae
FG
12359 vinodeno_t vino = _get_vino(in);
12360
11fdf7f2
TL
12361 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12362 tout(cct) << __func__ << std::endl;
7c673cae
FG
12363 tout(cct) << vino.ino.val << std::endl;
12364 tout(cct) << name << std::endl;
12365
f67539c2 12366 std::scoped_lock lock(client_lock);
11fdf7f2 12367 if (!fuse_default_permissions) {
7c673cae
FG
12368 int r = xattr_permission(in, name, MAY_READ, perms);
12369 if (r < 0)
12370 return r;
12371 }
12372
12373 return _getxattr(in, name, value, size, perms);
12374}
12375
/**
 * Core listxattr implementation: write the NUL-separated list of xattr
 * names (excluding the virtual "ceph.*" namespace) into @a name.
 *
 * With size == 0 only the total length is computed and returned.
 *
 * @return total length of the name list (or bytes written), or a negative
 *         error (-CEPHFS_ERANGE when the buffer is too small).
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // refresh cached xattrs from the MDS only if we never fetched them
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    // virtual ceph.* attributes are not listed
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue;
    }

    size_t this_len = xattr_name.length() + 1; // +1 for the NUL separator
    r += this_len;
    if (len_only)
      continue;

    // 'size' tracks the remaining space in the caller's buffer
    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
12409
12410int Client::ll_listxattr(Inode *in, char *names, size_t size,
12411 const UserPerm& perms)
12412{
f67539c2
TL
12413 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12414 if (!mref_reader.is_state_satisfied())
12415 return -CEPHFS_ENOTCONN;
181888fb 12416
7c673cae
FG
12417 vinodeno_t vino = _get_vino(in);
12418
11fdf7f2
TL
12419 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12420 tout(cct) << __func__ << std::endl;
7c673cae
FG
12421 tout(cct) << vino.ino.val << std::endl;
12422 tout(cct) << size << std::endl;
12423
f67539c2 12424 std::scoped_lock lock(client_lock);
7c673cae
FG
12425 return _listxattr(in, names, size, perms);
12426}
12427
/**
 * Issue the CEPH_MDS_OP_SETXATTR request to the MDS.
 *
 * A NULL @a value turns the operation into a removal; XATTR_CREATE /
 * XATTR_REPLACE are translated to their CEPH_XATTR_* equivalents.
 * Caller holds client_lock and has already done permission checks.
 *
 * @return the make_request() result.
 */
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // attribute value travels as the request data payload
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
12460
/*
 * Client-side front end for setxattr on an inode.
 *
 * Handles, in order:
 *  - rejecting writes on snapshots (-CEPHFS_EROFS),
 *  - normalizing a zero-length value,
 *  - restricting accepted namespaces (user./security./trusted./ceph.,
 *    plus system.* when POSIX ACLs are enabled),
 *  - converting POSIX ACL xattrs into mode updates / validated payloads,
 *  - rejecting writes to read-only virtual (ceph.*) xattrs,
 *  - after enabling quota, verifying the MDS created a snaprealm.
 *
 * Returns 0 on success or a negative CEPHFS_* error.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // an empty value must still be a valid pointer for the bufferlist
  // append in _do_setxattr()
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // same xattr namespaces the kernel client accepts
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is fully representable by the mode bits: drop the xattr
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// default ACLs only apply to directories
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      // quota writes need the snaprealm sanity check below
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
12541
12542int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12543 size_t size, int flags, const UserPerm& perms)
12544{
12545 if (cct->_conf->client_permissions) {
12546 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12547 if (r < 0)
12548 return r;
12549 }
12550 return _setxattr(in.get(), name, value, size, flags, perms);
12551}
12552
/*
 * Validate the data pool referenced by a "layout" or "layout.pool"
 * xattr value against `osdmap`.
 *
 * For "layout" the value is parsed as key=value pairs and the "pool"
 * key (if any) is extracted; for "layout.pool" the value itself is the
 * pool.  The pool may be given numerically or by name.  Returns 0 if
 * no pool was specified or the pool exists, -CEPHFS_EINVAL on a parse
 * failure, -CEPHFS_ENOENT if the pool is unknown in this osdmap.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    // trailing unparsed input means the value was malformed
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // numeric pool id first; fall back to lookup by name
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
12592
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix; the check helper only
    // cares about "layout" vs "layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // the pool may exist in a newer epoch than ours: block until we
    // have the latest osdmap before giving up
    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
12615
/*
 * Low-level entry point for setxattr.  May block waiting for the
 * latest osdmap when a layout pool is being set (done before taking
 * client_lock).  Performs client-side xattr permission checks unless
 * FUSE default_permissions handles them.
 */
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  _setxattr_maybe_wait_for_osdmap(name, value, size);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
12640
/*
 * Send a CEPH_MDS_OP_RMXATTR request for `in`.  Snapshots are
 * read-only; the accepted namespaces mirror the kernel client, and
 * read-only virtual xattrs cannot be removed.
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);   // rmxattr carries the name in filepath2
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
12672
12673int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12674{
12675 if (cct->_conf->client_permissions) {
12676 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12677 if (r < 0)
12678 return r;
12679 }
12680 return _removexattr(in.get(), name, perms);
12681}
12682
/*
 * Low-level entry point for removexattr; takes client_lock and
 * performs client-side permission checks unless FUSE
 * default_permissions handles them.
 */
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
12705
12706bool Client::_vxattrcb_quota_exists(Inode *in)
12707{
11fdf7f2 12708 return in->quota.is_enable() &&
f6b5b4d7
TL
12709 (in->snapid != CEPH_NOSNAP ||
12710 (in->snaprealm && in->snaprealm->ino == in->ino));
7c673cae
FG
12711}
// Getter callbacks for the "ceph.quota" vxattrs.  Each formats into
// `val` and returns the snprintf length (bytes needed, excluding NUL).
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
12727
// The "ceph.*.layout" vxattrs exist only if the layout differs from a
// default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Format the full layout (stripe_unit/stripe_count/object_size/pool
// and optional pool_namespace) into `val`.  The pool is printed by
// name when it exists in the current osdmap, otherwise numerically.
// NOTE(review): `r` is int and `size - r` is unsigned arithmetic —
// appears to assume callers supply a buffer large enough for the
// whole string; verify against the _getxattr vxattr path.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
// Scalar layout-field getters; each returns the snprintf length.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
12764size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12765{
12766 size_t r;
12767 objecter->with_osdmap([&](const OSDMap& o) {
12768 if (o.have_pg_pool(in->layout.pool_id))
12769 r = snprintf(val, size, "%s", o.get_pool_name(
12770 in->layout.pool_id).c_str());
12771 else
12772 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12773 });
12774 return r;
12775}
// Format the layout's pool namespace (may be an empty string).
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics vxattr getters.  dirstat.* are the direct
// (non-recursive) counts; rstat.* are the recursive subtree totals.
// Each formats into `val` and returns the snprintf length.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// recursive ctime, printed as seconds.nanoseconds
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
      (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
// "ceph.dir.pin" exists unless dir_pin holds the -CEPHFS_ENODATA
// sentinel (meaning no export pin is set).
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -CEPHFS_ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
7c673cae 12825
81eedcae
TL
// "ceph.snap.btime": snapshot creation time, present only when the
// timestamp is non-zero; printed as seconds.nanoseconds.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
      (long long unsigned)in->snap_btime.sec(),
      (long unsigned)in->snap_btime.nsec());
}
12837
20effc67
TL
// "ceph.caps": the caps currently issued for this inode, formatted as
// a human-readable cap string plus the raw hex bitmask.
size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
{
  int issued;

  in->caps_issued(&issued);
  return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
}
12845
f67539c2
TL
// "ceph.mirror.info": mirroring peer identity, stored as a pair of
// real xattrs set by the mirroring daemon.
bool Client::_vxattrcb_mirror_info_exists(Inode *in)
{
  // checking one of the xattrs would suffice
  return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
}

// NOTE(review): operator[] default-inserts an empty bufferlist if a
// key is absent (e.g. fs_id missing while cluster_id exists), mutating
// in->xattrs as a side effect — confirm this is intended.
size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
                  in->xattrs["ceph.mirror.info.cluster_id"].length(),
                  in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
                  in->xattrs["ceph.mirror.info.fs_id"].length(),
                  in->xattrs["ceph.mirror.info.fs_id"].c_str());
}
12860
// "ceph.cluster_fsid": the fsid of the cluster we are connected to.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// "ceph.client_id": this client's entity name, e.g. "client.4123".
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
}
12871
7c673cae
FG
// Helpers to build vxattr names: "ceph.<type>.<name>[.<name2>]".
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Table-entry helpers for the VXattr arrays below.  These use the GCC
// designated-initializer extension ("field: value").
// Read-only stat-style vxattr whose getter is _vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name, _flags)                \
{                                                            \
  name: CEPH_XATTR_NAME(_type, _name),                       \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,   \
  readonly: true,                                            \
  exists_cb: NULL,                                           \
  flags: _flags,                                             \
}
// Writable layout sub-field; exists only when a non-default layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)             \
{                                                            \
  name: CEPH_XATTR_NAME2(_type, _name, _field),              \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,  \
  readonly: false,                                           \
  exists_cb: &Client::_vxattrcb_layout_exists,               \
  flags: 0,                                                  \
}
// Writable quota sub-field; exists only when quota is enabled.
#define XATTR_QUOTA_FIELD(_type, _name)                      \
{                                                            \
  name: CEPH_XATTR_NAME(_type, _name),                       \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,   \
  readonly: false,                                           \
  exists_cb: &Client::_vxattrcb_quota_exists,                \
  flags: 0,                                                  \
}
12899
// Virtual xattrs exposed on directories.  Scanned linearly by
// _match_vxattr(); the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12961
// Virtual xattrs exposed on regular files (layout, snapshot birth
// time, and issued caps).  Terminated by the empty-name entry.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12991
adb31ebb
TL
// Virtual xattrs available on every inode type; consulted after the
// type-specific table in _match_vxattr().
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13009
7c673cae
FG
13010const Client::VXattr *Client::_get_vxattrs(Inode *in)
13011{
13012 if (in->is_dir())
13013 return _dir_vxattrs;
13014 else if (in->is_file())
13015 return _file_vxattrs;
13016 return NULL;
13017}
13018
13019const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13020{
13021 if (strncmp(name, "ceph.", 5) == 0) {
13022 const VXattr *vxattr = _get_vxattrs(in);
13023 if (vxattr) {
13024 while (!vxattr->name.empty()) {
13025 if (vxattr->name == name)
13026 return vxattr;
13027 vxattr++;
13028 }
13029 }
adb31ebb
TL
13030
13031 // for common vxattrs
13032 vxattr = _common_vxattrs;
13033 while (!vxattr->name.empty()) {
13034 if (vxattr->name == name)
13035 return vxattr;
13036 vxattr++;
13037 }
7c673cae 13038 }
adb31ebb 13039
7c673cae
FG
13040 return NULL;
13041}
13042
7c673cae
FG
/*
 * Low-level readlink: copy the symlink target into `buf` (up to
 * `buflen` bytes).  Touches the inode's dentries to keep them warm in
 * the LRU before reading.
 */
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
13064
13065int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
13066 const UserPerm& perms, InodeRef *inp)
13067{
1adf2230 13068 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13069 << mode << dec << ", " << rdev << ", uid " << perms.uid()
13070 << ", gid " << perms.gid() << ")" << dendl;
13071
13072 if (strlen(name) > NAME_MAX)
f67539c2 13073 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13074
13075 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13076 return -CEPHFS_EROFS;
7c673cae
FG
13077 }
13078 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13079 return -CEPHFS_EDQUOT;
7c673cae
FG
13080 }
13081
13082 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
13083
13084 filepath path;
13085 dir->make_nosnap_relative_path(path);
13086 path.push_dentry(name);
13087 req->set_filepath(path);
13088 req->set_inode(dir);
13089 req->head.args.mknod.rdev = rdev;
13090 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13091 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13092
13093 bufferlist xattrs_bl;
13094 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13095 if (res < 0)
13096 goto fail;
13097 req->head.args.mknod.mode = mode;
13098 if (xattrs_bl.length() > 0)
13099 req->set_data(xattrs_bl);
13100
13101 Dentry *de;
13102 res = get_or_create(dir, name, &de);
13103 if (res < 0)
13104 goto fail;
13105 req->set_dentry(de);
13106
13107 res = make_request(req, perms, inp);
13108
13109 trim_cache();
13110
1adf2230 13111 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
13112 return res;
13113
13114 fail:
13115 put_request(req);
13116 return res;
13117}
13118
/*
 * Low-level mknod: create the node, fill `attr` from the new inode,
 * take an ll reference, and hand the inode back through `out`.
 */
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13155
/*
 * statx-returning variant of ll_mknod: fills `stx` with the caps
 * implied by `want`/`flags` instead of a struct stat.
 */
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	  << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13196
/*
 * Create and (optionally) open a regular file `name` in `dir` via
 * CEPH_MDS_OP_CREATE.
 *
 * The layout parameters (stripe_unit/stripe_count/object_size) and an
 * optional data pool are passed through to the MDS.  On success the
 * new inode is returned via `inp`, `*created` reports whether the MDS
 * actually created it, and when `fhp` is non-null an open file handle
 * is produced as well.  Returns 0 or a negative CEPHFS_* error.
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve an explicitly requested data pool (by name) to its id
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE; // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // apply any inherited default ACLs; may also adjust `mode`
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // error before the request was sent: drop our ref on it
  put_request(req);
  return res;
}
13292
7c673cae 13293int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
f67539c2
TL
13294 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13295 std::string alternate_name)
7c673cae 13296{
1adf2230 13297 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13298 << mode << dec << ", uid " << perm.uid()
13299 << ", gid " << perm.gid() << ")" << dendl;
13300
13301 if (strlen(name) > NAME_MAX)
f67539c2 13302 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13303
13304 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13305 return -CEPHFS_EROFS;
7c673cae
FG
13306 }
13307 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13308 return -CEPHFS_EDQUOT;
7c673cae 13309 }
f67539c2
TL
13310
13311 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13312 MetaRequest *req = new MetaRequest(is_snap_op ?
7c673cae
FG
13313 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13314
13315 filepath path;
13316 dir->make_nosnap_relative_path(path);
13317 path.push_dentry(name);
13318 req->set_filepath(path);
13319 req->set_inode(dir);
13320 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13321 req->dentry_unless = CEPH_CAP_FILE_EXCL;
f67539c2 13322 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13323
13324 mode |= S_IFDIR;
f67539c2
TL
13325 bufferlist bl;
13326 int res = _posix_acl_create(dir, &mode, bl, perm);
7c673cae
FG
13327 if (res < 0)
13328 goto fail;
13329 req->head.args.mkdir.mode = mode;
f67539c2
TL
13330 if (is_snap_op) {
13331 SnapPayload payload;
13332 // clear the bufferlist that may have been populated by the call
13333 // to _posix_acl_create(). MDS mksnap does not make use of it.
13334 // So, reuse it to pass metadata payload.
13335 bl.clear();
13336 payload.metadata = metadata;
13337 encode(payload, bl);
13338 }
13339 if (bl.length() > 0) {
13340 req->set_data(bl);
13341 }
7c673cae
FG
13342
13343 Dentry *de;
13344 res = get_or_create(dir, name, &de);
13345 if (res < 0)
13346 goto fail;
13347 req->set_dentry(de);
13348
13349 ldout(cct, 10) << "_mkdir: making request" << dendl;
13350 res = make_request(req, perm, inp);
13351 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13352
13353 trim_cache();
13354
1adf2230 13355 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
13356 return res;
13357
13358 fail:
13359 put_request(req);
13360 return res;
13361}
13362
13363int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13364 struct stat *attr, Inode **out, const UserPerm& perm)
13365{
f67539c2
TL
13366 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13367 if (!mref_reader.is_state_satisfied())
13368 return -CEPHFS_ENOTCONN;
181888fb 13369
7c673cae
FG
13370 vinodeno_t vparent = _get_vino(parent);
13371
13372 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13373 tout(cct) << "ll_mkdir" << std::endl;
13374 tout(cct) << vparent.ino.val << std::endl;
13375 tout(cct) << name << std::endl;
13376 tout(cct) << mode << std::endl;
13377
f67539c2
TL
13378 std::scoped_lock lock(client_lock);
13379
11fdf7f2 13380 if (!fuse_default_permissions) {
7c673cae
FG
13381 int r = may_create(parent, perm);
13382 if (r < 0)
13383 return r;
13384 }
13385
13386 InodeRef in;
13387 int r = _mkdir(parent, name, mode, perm, &in);
13388 if (r == 0) {
13389 fill_stat(in, attr);
13390 _ll_get(in.get());
13391 }
13392 tout(cct) << attr->st_ino << std::endl;
13393 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13394 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13395 *out = in.get();
13396 return r;
13397}
13398
13399int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13400 struct ceph_statx *stx, unsigned want, unsigned flags,
13401 const UserPerm& perms)
13402{
f67539c2
TL
13403 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13404 if (!mref_reader.is_state_satisfied())
13405 return -CEPHFS_ENOTCONN;
181888fb 13406
7c673cae
FG
13407 vinodeno_t vparent = _get_vino(parent);
13408
13409 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13410 tout(cct) << "ll_mkdirx" << std::endl;
13411 tout(cct) << vparent.ino.val << std::endl;
13412 tout(cct) << name << std::endl;
13413 tout(cct) << mode << std::endl;
13414
f67539c2
TL
13415 std::scoped_lock lock(client_lock);
13416
11fdf7f2 13417 if (!fuse_default_permissions) {
7c673cae
FG
13418 int r = may_create(parent, perms);
13419 if (r < 0)
13420 return r;
13421 }
13422
13423 InodeRef in;
13424 int r = _mkdir(parent, name, mode, perms, &in);
13425 if (r == 0) {
13426 fill_statx(in, statx_to_mask(flags, want), stx);
13427 _ll_get(in.get());
13428 } else {
13429 stx->stx_ino = 0;
13430 stx->stx_mask = 0;
13431 }
13432 tout(cct) << stx->stx_ino << std::endl;
13433 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13434 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13435 *out = in.get();
13436 return r;
13437}
13438
13439int Client::_symlink(Inode *dir, const char *name, const char *target,
f67539c2 13440 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
7c673cae 13441{
1adf2230 13442 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
13443 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13444 << dendl;
13445
13446 if (strlen(name) > NAME_MAX)
f67539c2 13447 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13448
13449 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13450 return -CEPHFS_EROFS;
7c673cae
FG
13451 }
13452 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13453 return -CEPHFS_EDQUOT;
7c673cae
FG
13454 }
13455
13456 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13457
13458 filepath path;
13459 dir->make_nosnap_relative_path(path);
13460 path.push_dentry(name);
13461 req->set_filepath(path);
f67539c2 13462 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13463 req->set_inode(dir);
13464 req->set_string2(target);
13465 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13466 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13467
13468 Dentry *de;
13469 int res = get_or_create(dir, name, &de);
13470 if (res < 0)
13471 goto fail;
13472 req->set_dentry(de);
13473
13474 res = make_request(req, perms, inp);
13475
13476 trim_cache();
1adf2230 13477 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
13478 res << dendl;
13479 return res;
13480
13481 fail:
13482 put_request(req);
13483 return res;
13484}
13485
13486int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13487 struct stat *attr, Inode **out, const UserPerm& perms)
13488{
f67539c2
TL
13489 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13490 if (!mref_reader.is_state_satisfied())
13491 return -CEPHFS_ENOTCONN;
181888fb 13492
7c673cae
FG
13493 vinodeno_t vparent = _get_vino(parent);
13494
13495 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13496 << dendl;
13497 tout(cct) << "ll_symlink" << std::endl;
13498 tout(cct) << vparent.ino.val << std::endl;
13499 tout(cct) << name << std::endl;
13500 tout(cct) << value << std::endl;
13501
f67539c2
TL
13502 std::scoped_lock lock(client_lock);
13503
11fdf7f2 13504 if (!fuse_default_permissions) {
7c673cae
FG
13505 int r = may_create(parent, perms);
13506 if (r < 0)
13507 return r;
13508 }
13509
13510 InodeRef in;
f67539c2 13511 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13512 if (r == 0) {
13513 fill_stat(in, attr);
13514 _ll_get(in.get());
13515 }
13516 tout(cct) << attr->st_ino << std::endl;
13517 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13518 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13519 *out = in.get();
13520 return r;
13521}
13522
13523int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13524 Inode **out, struct ceph_statx *stx, unsigned want,
13525 unsigned flags, const UserPerm& perms)
13526{
f67539c2
TL
13527 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13528 if (!mref_reader.is_state_satisfied())
13529 return -CEPHFS_ENOTCONN;
181888fb 13530
7c673cae
FG
13531 vinodeno_t vparent = _get_vino(parent);
13532
13533 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13534 << dendl;
13535 tout(cct) << "ll_symlinkx" << std::endl;
13536 tout(cct) << vparent.ino.val << std::endl;
13537 tout(cct) << name << std::endl;
13538 tout(cct) << value << std::endl;
13539
f67539c2
TL
13540 std::scoped_lock lock(client_lock);
13541
11fdf7f2 13542 if (!fuse_default_permissions) {
7c673cae
FG
13543 int r = may_create(parent, perms);
13544 if (r < 0)
13545 return r;
13546 }
13547
13548 InodeRef in;
f67539c2 13549 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13550 if (r == 0) {
13551 fill_statx(in, statx_to_mask(flags, want), stx);
13552 _ll_get(in.get());
13553 }
13554 tout(cct) << stx->stx_ino << std::endl;
13555 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13556 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13557 *out = in.get();
13558 return r;
13559}
13560
13561int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13562{
1adf2230 13563 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
13564 << " uid " << perm.uid() << " gid " << perm.gid()
13565 << ")" << dendl;
13566
13567 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13568 return -CEPHFS_EROFS;
7c673cae
FG
13569 }
13570
13571 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13572
13573 filepath path;
13574 dir->make_nosnap_relative_path(path);
13575 path.push_dentry(name);
13576 req->set_filepath(path);
13577
13578 InodeRef otherin;
b32b8144 13579 Inode *in;
7c673cae 13580 Dentry *de;
b32b8144 13581
7c673cae
FG
13582 int res = get_or_create(dir, name, &de);
13583 if (res < 0)
13584 goto fail;
13585 req->set_dentry(de);
13586 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13587 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13588
13589 res = _lookup(dir, name, 0, &otherin, perm);
13590 if (res < 0)
13591 goto fail;
b32b8144
FG
13592
13593 in = otherin.get();
13594 req->set_other_inode(in);
13595 in->break_all_delegs();
7c673cae
FG
13596 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13597
13598 req->set_inode(dir);
13599
13600 res = make_request(req, perm);
13601
13602 trim_cache();
1adf2230 13603 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
13604 return res;
13605
13606 fail:
13607 put_request(req);
13608 return res;
13609}
13610
13611int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13612{
f67539c2
TL
13613 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13614 if (!mref_reader.is_state_satisfied())
13615 return -CEPHFS_ENOTCONN;
181888fb 13616
7c673cae
FG
13617 vinodeno_t vino = _get_vino(in);
13618
13619 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13620 tout(cct) << "ll_unlink" << std::endl;
13621 tout(cct) << vino.ino.val << std::endl;
13622 tout(cct) << name << std::endl;
13623
f67539c2
TL
13624 std::scoped_lock lock(client_lock);
13625
11fdf7f2 13626 if (!fuse_default_permissions) {
7c673cae
FG
13627 int r = may_delete(in, name, perm);
13628 if (r < 0)
13629 return r;
13630 }
13631 return _unlink(in, name, perm);
13632}
13633
13634int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13635{
1adf2230 13636 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
13637 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13638
13639 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13640 return -CEPHFS_EROFS;
7c673cae 13641 }
b32b8144
FG
13642
13643 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13644 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
13645 filepath path;
13646 dir->make_nosnap_relative_path(path);
13647 path.push_dentry(name);
13648 req->set_filepath(path);
11fdf7f2 13649 req->set_inode(dir);
7c673cae
FG
13650
13651 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13652 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13653 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13654
13655 InodeRef in;
13656
13657 Dentry *de;
13658 int res = get_or_create(dir, name, &de);
13659 if (res < 0)
13660 goto fail;
b32b8144
FG
13661 if (op == CEPH_MDS_OP_RMDIR)
13662 req->set_dentry(de);
13663 else
13664 de->get();
13665
7c673cae
FG
13666 res = _lookup(dir, name, 0, &in, perms);
13667 if (res < 0)
13668 goto fail;
11fdf7f2
TL
13669
13670 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 13671 unlink(de, true, true);
b32b8144 13672 de->put();
7c673cae 13673 }
11fdf7f2 13674 req->set_other_inode(in.get());
7c673cae
FG
13675
13676 res = make_request(req, perms);
13677
13678 trim_cache();
1adf2230 13679 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
13680 return res;
13681
13682 fail:
13683 put_request(req);
13684 return res;
13685}
13686
13687int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13688{
f67539c2
TL
13689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13690 if (!mref_reader.is_state_satisfied())
13691 return -CEPHFS_ENOTCONN;
181888fb 13692
7c673cae
FG
13693 vinodeno_t vino = _get_vino(in);
13694
13695 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13696 tout(cct) << "ll_rmdir" << std::endl;
13697 tout(cct) << vino.ino.val << std::endl;
13698 tout(cct) << name << std::endl;
13699
f67539c2
TL
13700 std::scoped_lock lock(client_lock);
13701
11fdf7f2 13702 if (!fuse_default_permissions) {
7c673cae
FG
13703 int r = may_delete(in, name, perms);
13704 if (r < 0)
13705 return r;
13706 }
13707
13708 return _rmdir(in, name, perms);
13709}
13710
f67539c2 13711int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
7c673cae 13712{
1adf2230 13713 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
13714 << todir->ino << " " << toname
13715 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13716 << dendl;
13717
13718 if (fromdir->snapid != todir->snapid)
f67539c2 13719 return -CEPHFS_EXDEV;
7c673cae
FG
13720
13721 int op = CEPH_MDS_OP_RENAME;
13722 if (fromdir->snapid != CEPH_NOSNAP) {
13723 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13724 op = CEPH_MDS_OP_RENAMESNAP;
13725 else
f67539c2
TL
13726 return -CEPHFS_EROFS;
13727 }
13728 if (fromdir != todir) {
13729 Inode *fromdir_root =
13730 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13731 Inode *todir_root =
13732 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13733 if (fromdir_root != todir_root) {
13734 return -CEPHFS_EXDEV;
13735 }
7c673cae 13736 }
7c673cae
FG
13737
13738 InodeRef target;
13739 MetaRequest *req = new MetaRequest(op);
13740
13741 filepath from;
13742 fromdir->make_nosnap_relative_path(from);
13743 from.push_dentry(fromname);
13744 filepath to;
13745 todir->make_nosnap_relative_path(to);
13746 to.push_dentry(toname);
13747 req->set_filepath(to);
13748 req->set_filepath2(from);
f67539c2 13749 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13750
13751 Dentry *oldde;
13752 int res = get_or_create(fromdir, fromname, &oldde);
13753 if (res < 0)
13754 goto fail;
13755 Dentry *de;
13756 res = get_or_create(todir, toname, &de);
13757 if (res < 0)
13758 goto fail;
13759
13760 if (op == CEPH_MDS_OP_RENAME) {
13761 req->set_old_dentry(oldde);
13762 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13763 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13764
13765 req->set_dentry(de);
13766 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13767 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13768
13769 InodeRef oldin, otherin;
f67539c2 13770 res = _lookup(fromdir, fromname, 0, &oldin, perm);
7c673cae
FG
13771 if (res < 0)
13772 goto fail;
b32b8144
FG
13773
13774 Inode *oldinode = oldin.get();
13775 oldinode->break_all_delegs();
13776 req->set_old_inode(oldinode);
7c673cae
FG
13777 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13778
13779 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
13780 switch (res) {
13781 case 0:
13782 {
13783 Inode *in = otherin.get();
13784 req->set_other_inode(in);
13785 in->break_all_delegs();
13786 }
7c673cae 13787 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144 13788 break;
f67539c2 13789 case -CEPHFS_ENOENT:
b32b8144
FG
13790 break;
13791 default:
13792 goto fail;
7c673cae
FG
13793 }
13794
13795 req->set_inode(todir);
13796 } else {
13797 // renamesnap reply contains no tracedn, so we need to invalidate
13798 // dentry manually
13799 unlink(oldde, true, true);
13800 unlink(de, true, true);
11fdf7f2
TL
13801
13802 req->set_inode(todir);
7c673cae
FG
13803 }
13804
13805 res = make_request(req, perm, &target);
13806 ldout(cct, 10) << "rename result is " << res << dendl;
13807
13808 // renamed item from our cache
13809
13810 trim_cache();
1adf2230 13811 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
13812 return res;
13813
13814 fail:
13815 put_request(req);
13816 return res;
13817}
13818
13819int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13820 const char *newname, const UserPerm& perm)
13821{
f67539c2
TL
13822 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13823 if (!mref_reader.is_state_satisfied())
13824 return -CEPHFS_ENOTCONN;
181888fb 13825
7c673cae
FG
13826 vinodeno_t vparent = _get_vino(parent);
13827 vinodeno_t vnewparent = _get_vino(newparent);
13828
13829 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13830 << vnewparent << " " << newname << dendl;
13831 tout(cct) << "ll_rename" << std::endl;
13832 tout(cct) << vparent.ino.val << std::endl;
13833 tout(cct) << name << std::endl;
13834 tout(cct) << vnewparent.ino.val << std::endl;
13835 tout(cct) << newname << std::endl;
13836
f67539c2
TL
13837 std::scoped_lock lock(client_lock);
13838
11fdf7f2 13839 if (!fuse_default_permissions) {
7c673cae
FG
13840 int r = may_delete(parent, name, perm);
13841 if (r < 0)
13842 return r;
13843 r = may_delete(newparent, newname, perm);
f67539c2 13844 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
13845 return r;
13846 }
13847
f67539c2 13848 return _rename(parent, name, newparent, newname, perm, "");
7c673cae
FG
13849}
13850
f67539c2 13851int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
7c673cae 13852{
1adf2230 13853 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
13854 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13855
13856 if (strlen(newname) > NAME_MAX)
f67539c2 13857 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13858
13859 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
f67539c2 13860 return -CEPHFS_EROFS;
7c673cae
FG
13861 }
13862 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13863 return -CEPHFS_EDQUOT;
7c673cae
FG
13864 }
13865
b32b8144 13866 in->break_all_delegs();
7c673cae
FG
13867 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13868
13869 filepath path(newname, dir->ino);
13870 req->set_filepath(path);
f67539c2 13871 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13872 filepath existing(in->ino);
13873 req->set_filepath2(existing);
13874
13875 req->set_inode(dir);
13876 req->inode_drop = CEPH_CAP_FILE_SHARED;
13877 req->inode_unless = CEPH_CAP_FILE_EXCL;
13878
13879 Dentry *de;
13880 int res = get_or_create(dir, newname, &de);
13881 if (res < 0)
13882 goto fail;
13883 req->set_dentry(de);
13884
13885 res = make_request(req, perm, inp);
13886 ldout(cct, 10) << "link result is " << res << dendl;
13887
13888 trim_cache();
1adf2230 13889 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
13890 return res;
13891
13892 fail:
13893 put_request(req);
13894 return res;
13895}
13896
13897int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13898 const UserPerm& perm)
13899{
f67539c2
TL
13900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13901 if (!mref_reader.is_state_satisfied())
13902 return -CEPHFS_ENOTCONN;
181888fb 13903
7c673cae
FG
13904 vinodeno_t vino = _get_vino(in);
13905 vinodeno_t vnewparent = _get_vino(newparent);
13906
31f18b77 13907 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
13908 newname << dendl;
13909 tout(cct) << "ll_link" << std::endl;
13910 tout(cct) << vino.ino.val << std::endl;
13911 tout(cct) << vnewparent << std::endl;
13912 tout(cct) << newname << std::endl;
13913
7c673cae
FG
13914 InodeRef target;
13915
f67539c2
TL
13916 std::scoped_lock lock(client_lock);
13917
11fdf7f2 13918 if (!fuse_default_permissions) {
7c673cae 13919 if (S_ISDIR(in->mode))
f67539c2 13920 return -CEPHFS_EPERM;
7c673cae 13921
11fdf7f2 13922 int r = may_hardlink(in, perm);
7c673cae
FG
13923 if (r < 0)
13924 return r;
13925
13926 r = may_create(newparent, perm);
13927 if (r < 0)
13928 return r;
13929 }
13930
f67539c2 13931 return _link(in, newparent, newname, perm, "", &target);
7c673cae
FG
13932}
13933
13934int Client::ll_num_osds(void)
13935{
f67539c2 13936 std::scoped_lock lock(client_lock);
7c673cae
FG
13937 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13938}
13939
13940int Client::ll_osdaddr(int osd, uint32_t *addr)
13941{
f67539c2 13942 std::scoped_lock lock(client_lock);
181888fb 13943
7c673cae
FG
13944 entity_addr_t g;
13945 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13946 if (!o.exists(osd))
13947 return false;
11fdf7f2 13948 g = o.get_addrs(osd).front();
7c673cae
FG
13949 return true;
13950 });
13951 if (!exists)
13952 return -1;
13953 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13954 *addr = ntohl(nb_addr);
13955 return 0;
13956}
181888fb 13957
7c673cae
FG
13958uint32_t Client::ll_stripe_unit(Inode *in)
13959{
f67539c2 13960 std::scoped_lock lock(client_lock);
7c673cae
FG
13961 return in->layout.stripe_unit;
13962}
13963
13964uint64_t Client::ll_snap_seq(Inode *in)
13965{
f67539c2 13966 std::scoped_lock lock(client_lock);
7c673cae
FG
13967 return in->snaprealm->seq;
13968}
13969
13970int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13971{
f67539c2 13972 std::scoped_lock lock(client_lock);
7c673cae
FG
13973 *layout = in->layout;
13974 return 0;
13975}
13976
13977int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13978{
13979 return ll_file_layout(fh->inode.get(), layout);
13980}
13981
13982/* Currently we cannot take advantage of redundancy in reads, since we
13983 would have to go through all possible placement groups (a
13984 potentially quite large number determined by a hash), and use CRUSH
13985 to calculate the appropriate set of OSDs for each placement group,
13986 then index into that. An array with one entry per OSD is much more
13987 tractable and works for demonstration purposes. */
13988
13989int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13990 file_layout_t* layout)
13991{
f67539c2 13992 std::scoped_lock lock(client_lock);
181888fb 13993
28e407b8 13994 inodeno_t ino = in->ino;
7c673cae
FG
13995 uint32_t object_size = layout->object_size;
13996 uint32_t su = layout->stripe_unit;
13997 uint32_t stripe_count = layout->stripe_count;
13998 uint64_t stripes_per_object = object_size / su;
11fdf7f2 13999 uint64_t stripeno = 0, stripepos = 0;
7c673cae 14000
11fdf7f2
TL
14001 if(stripe_count) {
14002 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
14003 stripepos = blockno % stripe_count; // which object in the object set (X)
14004 }
7c673cae
FG
14005 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
14006 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
14007
14008 object_t oid = file_object_t(ino, objectno);
14009 return objecter->with_osdmap([&](const OSDMap& o) {
14010 ceph_object_layout olayout =
14011 o.file_to_object_layout(oid, *layout);
14012 pg_t pg = (pg_t)olayout.ol_pgid;
14013 vector<int> osds;
14014 int primary;
14015 o.pg_to_acting_osds(pg, &osds, &primary);
14016 return primary;
14017 });
14018}
14019
14020/* Return the offset of the block, internal to the object */
14021
14022uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14023{
f67539c2 14024 std::scoped_lock lock(client_lock);
7c673cae
FG
14025 file_layout_t *layout=&(in->layout);
14026 uint32_t object_size = layout->object_size;
14027 uint32_t su = layout->stripe_unit;
14028 uint64_t stripes_per_object = object_size / su;
14029
14030 return (blockno % stripes_per_object) * su;
14031}
14032
14033int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14034 const UserPerm& perms)
14035{
f67539c2
TL
14036 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14037 if (!mref_reader.is_state_satisfied())
14038 return -CEPHFS_ENOTCONN;
181888fb 14039
7c673cae
FG
14040 vinodeno_t vino = _get_vino(in);
14041
14042 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14043 tout(cct) << "ll_opendir" << std::endl;
14044 tout(cct) << vino.ino.val << std::endl;
14045
f67539c2
TL
14046 std::scoped_lock lock(client_lock);
14047
11fdf7f2 14048 if (!fuse_default_permissions) {
7c673cae
FG
14049 int r = may_open(in, flags, perms);
14050 if (r < 0)
14051 return r;
14052 }
14053
14054 int r = _opendir(in, dirpp, perms);
f67539c2 14055 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
14056
14057 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14058 << dendl;
14059 return r;
14060}
14061
14062int Client::ll_releasedir(dir_result_t *dirp)
14063{
f67539c2
TL
14064 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14065 if (!mref_reader.is_state_satisfied())
14066 return -CEPHFS_ENOTCONN;
14067
7c673cae
FG
14068 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14069 tout(cct) << "ll_releasedir" << std::endl;
f67539c2 14070 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14071
f67539c2 14072 std::scoped_lock lock(client_lock);
181888fb 14073
7c673cae
FG
14074 _closedir(dirp);
14075 return 0;
14076}
14077
14078int Client::ll_fsyncdir(dir_result_t *dirp)
14079{
f67539c2
TL
14080 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14081 if (!mref_reader.is_state_satisfied())
14082 return -CEPHFS_ENOTCONN;
14083
7c673cae
FG
14084 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14085 tout(cct) << "ll_fsyncdir" << std::endl;
f67539c2 14086 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14087
f67539c2 14088 std::scoped_lock lock(client_lock);
7c673cae
FG
14089 return _fsync(dirp->inode.get(), false);
14090}
14091
14092int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14093{
11fdf7f2 14094 ceph_assert(!(flags & O_CREAT));
7c673cae 14095
f67539c2
TL
14096 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14097 if (!mref_reader.is_state_satisfied())
14098 return -CEPHFS_ENOTCONN;
181888fb 14099
7c673cae
FG
14100 vinodeno_t vino = _get_vino(in);
14101
14102 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14103 tout(cct) << "ll_open" << std::endl;
14104 tout(cct) << vino.ino.val << std::endl;
14105 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14106
f67539c2
TL
14107 std::scoped_lock lock(client_lock);
14108
7c673cae 14109 int r;
11fdf7f2 14110 if (!fuse_default_permissions) {
7c673cae
FG
14111 r = may_open(in, flags, perms);
14112 if (r < 0)
14113 goto out;
14114 }
14115
14116 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14117
14118 out:
14119 Fh *fhptr = fhp ? *fhp : NULL;
14120 if (fhptr) {
14121 ll_unclosed_fh_set.insert(fhptr);
14122 }
f67539c2 14123 tout(cct) << (uintptr_t)fhptr << std::endl;
7c673cae
FG
14124 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14125 " = " << r << " (" << fhptr << ")" << dendl;
14126 return r;
14127}
14128
14129int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14130 int flags, InodeRef *in, int caps, Fh **fhp,
14131 const UserPerm& perms)
14132{
14133 *fhp = NULL;
14134
14135 vinodeno_t vparent = _get_vino(parent);
14136
1adf2230 14137 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
14138 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14139 << ", gid " << perms.gid() << dendl;
14140 tout(cct) << "ll_create" << std::endl;
14141 tout(cct) << vparent.ino.val << std::endl;
14142 tout(cct) << name << std::endl;
14143 tout(cct) << mode << std::endl;
14144 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14145
14146 bool created = false;
14147 int r = _lookup(parent, name, caps, in, perms);
14148
14149 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 14150 return -CEPHFS_EEXIST;
7c673cae 14151
f67539c2 14152 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
11fdf7f2 14153 if (!fuse_default_permissions) {
7c673cae
FG
14154 r = may_create(parent, perms);
14155 if (r < 0)
14156 goto out;
14157 }
14158 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
f67539c2 14159 perms, "");
7c673cae
FG
14160 if (r < 0)
14161 goto out;
14162 }
14163
14164 if (r < 0)
14165 goto out;
14166
11fdf7f2 14167 ceph_assert(*in);
7c673cae
FG
14168
14169 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14170 if (!created) {
11fdf7f2 14171 if (!fuse_default_permissions) {
7c673cae
FG
14172 r = may_open(in->get(), flags, perms);
14173 if (r < 0) {
14174 if (*fhp) {
14175 int release_r = _release_fh(*fhp);
11fdf7f2 14176 ceph_assert(release_r == 0); // during create, no async data ops should have happened
7c673cae
FG
14177 }
14178 goto out;
14179 }
14180 }
14181 if (*fhp == NULL) {
14182 r = _open(in->get(), flags, mode, fhp, perms);
14183 if (r < 0)
14184 goto out;
14185 }
14186 }
14187
14188out:
14189 if (*fhp) {
14190 ll_unclosed_fh_set.insert(*fhp);
14191 }
14192
14193 ino_t ino = 0;
14194 if (r >= 0) {
14195 Inode *inode = in->get();
14196 if (use_faked_inos())
14197 ino = inode->faked_ino;
14198 else
14199 ino = inode->ino;
14200 }
14201
f67539c2 14202 tout(cct) << (uintptr_t)*fhp << std::endl;
7c673cae 14203 tout(cct) << ino << std::endl;
1adf2230 14204 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
14205 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14206 *fhp << " " << hex << ino << dec << ")" << dendl;
14207
14208 return r;
14209}
14210
14211int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14212 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14213 const UserPerm& perms)
14214{
f67539c2
TL
14215 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14216 if (!mref_reader.is_state_satisfied())
14217 return -CEPHFS_ENOTCONN;
7c673cae 14218
f67539c2
TL
14219 std::scoped_lock lock(client_lock);
14220 InodeRef in;
181888fb 14221
7c673cae
FG
14222 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14223 fhp, perms);
14224 if (r >= 0) {
11fdf7f2 14225 ceph_assert(in);
7c673cae
FG
14226
14227 // passing an Inode in outp requires an additional ref
14228 if (outp) {
14229 _ll_get(in.get());
14230 *outp = in.get();
14231 }
14232 fill_stat(in, attr);
14233 } else {
14234 attr->st_ino = 0;
14235 }
14236
14237 return r;
14238}
14239
14240int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14241 int oflags, Inode **outp, Fh **fhp,
14242 struct ceph_statx *stx, unsigned want, unsigned lflags,
14243 const UserPerm& perms)
14244{
14245 unsigned caps = statx_to_mask(lflags, want);
f67539c2
TL
14246 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14247 if (!mref_reader.is_state_satisfied())
14248 return -CEPHFS_ENOTCONN;
7c673cae 14249
f67539c2
TL
14250 std::scoped_lock lock(client_lock);
14251 InodeRef in;
7c673cae
FG
14252
14253 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14254 if (r >= 0) {
11fdf7f2 14255 ceph_assert(in);
7c673cae
FG
14256
14257 // passing an Inode in outp requires an additional ref
14258 if (outp) {
14259 _ll_get(in.get());
14260 *outp = in.get();
14261 }
14262 fill_statx(in, caps, stx);
14263 } else {
14264 stx->stx_ino = 0;
14265 stx->stx_mask = 0;
14266 }
14267
14268 return r;
14269}
14270
14271loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14272{
f67539c2
TL
14273 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14274 if (!mref_reader.is_state_satisfied())
14275 return -CEPHFS_ENOTCONN;
14276
7c673cae
FG
14277 tout(cct) << "ll_lseek" << std::endl;
14278 tout(cct) << offset << std::endl;
14279 tout(cct) << whence << std::endl;
14280
f67539c2 14281 std::scoped_lock lock(client_lock);
7c673cae
FG
14282 return _lseek(fh, offset, whence);
14283}
14284
14285int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14286{
f67539c2
TL
14287 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14288 if (!mref_reader.is_state_satisfied())
14289 return -CEPHFS_ENOTCONN;
14290
7c673cae
FG
14291 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14292 tout(cct) << "ll_read" << std::endl;
f67539c2 14293 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14294 tout(cct) << off << std::endl;
14295 tout(cct) << len << std::endl;
14296
11fdf7f2
TL
14297 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14298 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14299 std::scoped_lock lock(client_lock);
14300
f6b5b4d7
TL
14301 int r = _read(fh, off, len, bl);
14302 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14303 << dendl;
14304 return r;
7c673cae
FG
14305}
14306
14307int Client::ll_read_block(Inode *in, uint64_t blockid,
14308 char *buf,
14309 uint64_t offset,
14310 uint64_t length,
14311 file_layout_t* layout)
14312{
f67539c2
TL
14313 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14314 if (!mref_reader.is_state_satisfied())
14315 return -CEPHFS_ENOTCONN;
181888fb 14316
b32b8144 14317 vinodeno_t vino = _get_vino(in);
7c673cae
FG
14318 object_t oid = file_object_t(vino.ino, blockid);
14319 C_SaferCond onfinish;
14320 bufferlist bl;
14321
14322 objecter->read(oid,
14323 object_locator_t(layout->pool_id),
14324 offset,
14325 length,
14326 vino.snapid,
14327 &bl,
14328 CEPH_OSD_FLAG_READ,
14329 &onfinish);
14330
7c673cae 14331 int r = onfinish.wait();
7c673cae 14332 if (r >= 0) {
9f95a23c 14333 bl.begin().copy(bl.length(), buf);
7c673cae
FG
14334 r = bl.length();
14335 }
14336
14337 return r;
14338}
14339
14340/* It appears that the OSD doesn't return success unless the entire
14341 buffer was written, return the write length on success. */
14342
14343int Client::ll_write_block(Inode *in, uint64_t blockid,
14344 char* buf, uint64_t offset,
14345 uint64_t length, file_layout_t* layout,
14346 uint64_t snapseq, uint32_t sync)
14347{
7c673cae 14348 vinodeno_t vino = ll_get_vino(in);
7c673cae 14349 int r = 0;
11fdf7f2 14350 std::unique_ptr<C_SaferCond> onsafe = nullptr;
f67539c2
TL
14351
14352 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14353 if (!mref_reader.is_state_satisfied())
14354 return -CEPHFS_ENOTCONN;
14355
7c673cae 14356 if (length == 0) {
f67539c2 14357 return -CEPHFS_EINVAL;
7c673cae
FG
14358 }
14359 if (true || sync) {
14360 /* if write is stable, the epilogue is waiting on
14361 * flock */
11fdf7f2 14362 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
14363 }
14364 object_t oid = file_object_t(vino.ino, blockid);
14365 SnapContext fakesnap;
11fdf7f2
TL
14366 ceph::bufferlist bl;
14367 if (length > 0) {
14368 bl.push_back(buffer::copy(buf, length));
14369 }
7c673cae
FG
14370
14371 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14372 << dendl;
14373
14374 fakesnap.seq = snapseq;
14375
14376 /* lock just in time */
7c673cae
FG
14377 objecter->write(oid,
14378 object_locator_t(layout->pool_id),
14379 offset,
14380 length,
14381 fakesnap,
14382 bl,
14383 ceph::real_clock::now(),
14384 0,
11fdf7f2 14385 onsafe.get());
7c673cae 14386
11fdf7f2
TL
14387 if (nullptr != onsafe) {
14388 r = onsafe->wait();
7c673cae
FG
14389 }
14390
14391 if (r < 0) {
14392 return r;
14393 } else {
14394 return length;
14395 }
14396}
14397
14398int Client::ll_commit_blocks(Inode *in,
14399 uint64_t offset,
14400 uint64_t length)
14401{
7c673cae
FG
14402 /*
14403 BarrierContext *bctx;
b32b8144 14404 vinodeno_t vino = _get_vino(in);
7c673cae
FG
14405 uint64_t ino = vino.ino;
14406
14407 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14408 << offset << " to " << length << dendl;
14409
14410 if (length == 0) {
f67539c2 14411 return -CEPHFS_EINVAL;
7c673cae
FG
14412 }
14413
f67539c2 14414 std::scoped_lock lock(client_lock);
7c673cae
FG
14415 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14416 if (p != barriers.end()) {
14417 barrier_interval civ(offset, offset + length);
14418 p->second->commit_barrier(civ);
14419 }
14420 */
14421 return 0;
14422}
14423
14424int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14425{
7c673cae
FG
14426 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14427 "~" << len << dendl;
14428 tout(cct) << "ll_write" << std::endl;
f67539c2 14429 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14430 tout(cct) << off << std::endl;
14431 tout(cct) << len << std::endl;
14432
f67539c2
TL
14433 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14434 if (!mref_reader.is_state_satisfied())
14435 return -CEPHFS_ENOTCONN;
181888fb 14436
11fdf7f2
TL
14437 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14438 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14439 std::scoped_lock lock(client_lock);
14440
7c673cae
FG
14441 int r = _write(fh, off, len, data, NULL, 0);
14442 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14443 << dendl;
14444 return r;
14445}
14446
11fdf7f2
TL
14447int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14448{
f67539c2
TL
14449 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14450 if (!mref_reader.is_state_satisfied())
14451 return -CEPHFS_ENOTCONN;
14452
20effc67
TL
14453 std::scoped_lock cl(client_lock);
14454 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
11fdf7f2
TL
14455}
14456
14457int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14458{
f67539c2
TL
14459 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14460 if (!mref_reader.is_state_satisfied())
14461 return -CEPHFS_ENOTCONN;
14462
20effc67
TL
14463 std::scoped_lock cl(client_lock);
14464 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
11fdf7f2
TL
14465}
14466
7c673cae
FG
14467int Client::ll_flush(Fh *fh)
14468{
f67539c2
TL
14469 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14470 if (!mref_reader.is_state_satisfied())
14471 return -CEPHFS_ENOTCONN;
14472
7c673cae
FG
14473 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14474 tout(cct) << "ll_flush" << std::endl;
f67539c2 14475 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14476
f67539c2 14477 std::scoped_lock lock(client_lock);
7c673cae
FG
14478 return _flush(fh);
14479}
14480
14481int Client::ll_fsync(Fh *fh, bool syncdataonly)
14482{
f67539c2
TL
14483 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14484 if (!mref_reader.is_state_satisfied())
14485 return -CEPHFS_ENOTCONN;
14486
7c673cae
FG
14487 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14488 tout(cct) << "ll_fsync" << std::endl;
f67539c2 14489 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14490
f67539c2 14491 std::scoped_lock lock(client_lock);
7c673cae
FG
14492 int r = _fsync(fh, syncdataonly);
14493 if (r) {
14494 // If we're returning an error, clear it from the FH
14495 fh->take_async_err();
14496 }
14497 return r;
14498}
14499
28e407b8
AA
14500int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14501{
f67539c2
TL
14502 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14503 if (!mref_reader.is_state_satisfied())
14504 return -CEPHFS_ENOTCONN;
14505
28e407b8
AA
14506 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14507 tout(cct) << "ll_sync_inode" << std::endl;
f67539c2 14508 tout(cct) << (uintptr_t)in << std::endl;
28e407b8 14509
f67539c2 14510 std::scoped_lock lock(client_lock);
28e407b8
AA
14511 return _fsync(in, syncdataonly);
14512}
14513
7c673cae
FG
14514int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14515{
f67539c2
TL
14516 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14517
7c673cae 14518 if (offset < 0 || length <= 0)
f67539c2 14519 return -CEPHFS_EINVAL;
7c673cae
FG
14520
14521 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
f67539c2 14522 return -CEPHFS_EOPNOTSUPP;
7c673cae
FG
14523
14524 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
f67539c2 14525 return -CEPHFS_EOPNOTSUPP;
7c673cae
FG
14526
14527 Inode *in = fh->inode.get();
14528
14529 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14530 !(mode & FALLOC_FL_PUNCH_HOLE)) {
f67539c2 14531 return -CEPHFS_ENOSPC;
7c673cae
FG
14532 }
14533
14534 if (in->snapid != CEPH_NOSNAP)
f67539c2 14535 return -CEPHFS_EROFS;
7c673cae
FG
14536
14537 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 14538 return -CEPHFS_EBADF;
7c673cae
FG
14539
14540 uint64_t size = offset + length;
14541 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14542 size > in->size &&
11fdf7f2 14543 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
f67539c2 14544 return -CEPHFS_EDQUOT;
7c673cae
FG
14545 }
14546
14547 int have;
f6b5b4d7 14548 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
7c673cae
FG
14549 if (r < 0)
14550 return r;
14551
11fdf7f2 14552 std::unique_ptr<C_SaferCond> onuninline = nullptr;
7c673cae
FG
14553 if (mode & FALLOC_FL_PUNCH_HOLE) {
14554 if (in->inline_version < CEPH_INLINE_NONE &&
14555 (have & CEPH_CAP_FILE_BUFFER)) {
14556 bufferlist bl;
9f95a23c 14557 auto inline_iter = in->inline_data.cbegin();
7c673cae
FG
14558 int len = in->inline_data.length();
14559 if (offset < len) {
14560 if (offset > 0)
9f95a23c 14561 inline_iter.copy(offset, bl);
7c673cae
FG
14562 int size = length;
14563 if (offset + size > len)
14564 size = len - offset;
14565 if (size > 0)
14566 bl.append_zero(size);
9f95a23c
TL
14567 if (offset + size < len) {
14568 inline_iter += size;
14569 inline_iter.copy(len - offset - size, bl);
14570 }
7c673cae
FG
14571 in->inline_data = bl;
14572 in->inline_version++;
14573 }
91327a77 14574 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14575 in->change_attr++;
28e407b8 14576 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
14577 } else {
14578 if (in->inline_version < CEPH_INLINE_NONE) {
11fdf7f2
TL
14579 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14580 uninline_data(in, onuninline.get());
7c673cae
FG
14581 }
14582
11fdf7f2 14583 C_SaferCond onfinish("Client::_punch_hole flock");
7c673cae 14584
7c673cae
FG
14585 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14586
14587 _invalidate_inode_cache(in, offset, length);
14588 filer->zero(in->ino, &in->layout,
14589 in->snaprealm->get_snap_context(),
14590 offset, length,
14591 ceph::real_clock::now(),
11fdf7f2 14592 0, true, &onfinish);
91327a77 14593 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14594 in->change_attr++;
28e407b8 14595 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 14596
9f95a23c 14597 client_lock.unlock();
11fdf7f2 14598 onfinish.wait();
9f95a23c 14599 client_lock.lock();
f67539c2 14600 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
7c673cae
FG
14601 }
14602 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14603 uint64_t size = offset + length;
14604 if (size > in->size) {
14605 in->size = size;
91327a77 14606 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14607 in->change_attr++;
28e407b8 14608 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 14609
11fdf7f2 14610 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
7c673cae 14611 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
14612 } else if (is_max_size_approaching(in)) {
14613 check_caps(in, 0);
7c673cae
FG
14614 }
14615 }
14616 }
14617
11fdf7f2 14618 if (nullptr != onuninline) {
9f95a23c 14619 client_lock.unlock();
11fdf7f2 14620 int ret = onuninline->wait();
9f95a23c 14621 client_lock.lock();
7c673cae 14622
f67539c2 14623 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
7c673cae
FG
14624 in->inline_data.clear();
14625 in->inline_version = CEPH_INLINE_NONE;
28e407b8 14626 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
14627 check_caps(in, 0);
14628 } else
11fdf7f2 14629 r = ret;
7c673cae
FG
14630 }
14631
14632 put_cap_ref(in, CEPH_CAP_FILE_WR);
14633 return r;
14634}
7c673cae 14635
11fdf7f2 14636int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 14637{
f67539c2
TL
14638 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14639 if (!mref_reader.is_state_satisfied())
14640 return -CEPHFS_ENOTCONN;
14641
11fdf7f2
TL
14642 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14643 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
f67539c2 14644 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14645
f67539c2 14646 std::scoped_lock lock(client_lock);
7c673cae
FG
14647 return _fallocate(fh, mode, offset, length);
14648}
14649
14650int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14651{
f67539c2
TL
14652 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14653 if (!mref_reader.is_state_satisfied())
14654 return -CEPHFS_ENOTCONN;
7c673cae 14655
f67539c2 14656 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
181888fb 14657
f67539c2 14658 std::scoped_lock lock(client_lock);
7c673cae
FG
14659 Fh *fh = get_filehandle(fd);
14660 if (!fh)
f67539c2 14661 return -CEPHFS_EBADF;
7c673cae
FG
14662#if defined(__linux__) && defined(O_PATH)
14663 if (fh->flags & O_PATH)
f67539c2 14664 return -CEPHFS_EBADF;
7c673cae
FG
14665#endif
14666 return _fallocate(fh, mode, offset, length);
14667}
14668
14669int Client::ll_release(Fh *fh)
14670{
f67539c2
TL
14671 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14672 if (!mref_reader.is_state_satisfied())
14673 return -CEPHFS_ENOTCONN;
91327a77 14674
11fdf7f2 14675 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 14676 dendl;
11fdf7f2 14677 tout(cct) << __func__ << " (fh)" << std::endl;
f67539c2
TL
14678 tout(cct) << (uintptr_t)fh << std::endl;
14679
14680 std::scoped_lock lock(client_lock);
7c673cae
FG
14681
14682 if (ll_unclosed_fh_set.count(fh))
14683 ll_unclosed_fh_set.erase(fh);
14684 return _release_fh(fh);
14685}
14686
14687int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14688{
f67539c2
TL
14689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14690 if (!mref_reader.is_state_satisfied())
14691 return -CEPHFS_ENOTCONN;
7c673cae
FG
14692
14693 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
f67539c2 14694 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
181888fb 14695
f67539c2 14696 std::scoped_lock lock(client_lock);
7c673cae
FG
14697 return _getlk(fh, fl, owner);
14698}
14699
14700int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14701{
f67539c2
TL
14702 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14703 if (!mref_reader.is_state_satisfied())
14704 return -CEPHFS_ENOTCONN;
7c673cae 14705
11fdf7f2 14706 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14707 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14708
f67539c2 14709 std::scoped_lock lock(client_lock);
7c673cae
FG
14710 return _setlk(fh, fl, owner, sleep);
14711}
14712
14713int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14714{
f67539c2
TL
14715 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14716 if (!mref_reader.is_state_satisfied())
14717 return -CEPHFS_ENOTCONN;
7c673cae 14718
11fdf7f2 14719 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14720 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14721
f67539c2 14722 std::scoped_lock lock(client_lock);
7c673cae
FG
14723 return _flock(fh, cmd, owner);
14724}
14725
b32b8144
FG
14726int Client::set_deleg_timeout(uint32_t timeout)
14727{
f67539c2 14728 std::scoped_lock lock(client_lock);
b32b8144
FG
14729
14730 /*
f67539c2 14731 * The whole point is to prevent blocklisting so we must time out the
b32b8144
FG
14732 * delegation before the session autoclose timeout kicks in.
14733 */
14734 if (timeout >= mdsmap->get_session_autoclose())
f67539c2 14735 return -CEPHFS_EINVAL;
b32b8144
FG
14736
14737 deleg_timeout = timeout;
14738 return 0;
14739}
14740
14741int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14742{
f67539c2 14743 int ret = -CEPHFS_EINVAL;
b32b8144 14744
f67539c2
TL
14745 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14746 if (!mref_reader.is_state_satisfied())
14747 return -CEPHFS_ENOTCONN;
b32b8144 14748
f67539c2 14749 std::scoped_lock lock(client_lock);
b32b8144
FG
14750
14751 Inode *inode = fh->inode.get();
14752
14753 switch(cmd) {
14754 case CEPH_DELEGATION_NONE:
14755 inode->unset_deleg(fh);
14756 ret = 0;
14757 break;
14758 default:
14759 try {
14760 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 14761 } catch (std::bad_alloc&) {
f67539c2 14762 ret = -CEPHFS_ENOMEM;
b32b8144
FG
14763 }
14764 break;
14765 }
14766 return ret;
14767}
14768
7c673cae
FG
14769class C_Client_RequestInterrupt : public Context {
14770private:
14771 Client *client;
14772 MetaRequest *req;
14773public:
14774 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14775 req->get();
14776 }
14777 void finish(int r) override {
f67539c2 14778 std::scoped_lock l(client->client_lock);
11fdf7f2 14779 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
14780 client->_interrupt_filelock(req);
14781 client->put_request(req);
14782 }
14783};
14784
14785void Client::ll_interrupt(void *d)
14786{
14787 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
14788 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14789 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
14790 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14791}
14792
14793// =========================================
14794// layout
14795
14796// expose file layouts
14797
14798int Client::describe_layout(const char *relpath, file_layout_t *lp,
14799 const UserPerm& perms)
14800{
f67539c2
TL
14801 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14802 if (!mref_reader.is_state_satisfied())
14803 return -CEPHFS_ENOTCONN;
7c673cae 14804
f67539c2 14805 std::scoped_lock lock(client_lock);
181888fb 14806
7c673cae
FG
14807 filepath path(relpath);
14808 InodeRef in;
14809 int r = path_walk(path, &in, perms);
14810 if (r < 0)
14811 return r;
14812
14813 *lp = in->layout;
14814
11fdf7f2 14815 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
14816 return 0;
14817}
14818
14819int Client::fdescribe_layout(int fd, file_layout_t *lp)
14820{
f67539c2
TL
14821 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14822 if (!mref_reader.is_state_satisfied())
14823 return -CEPHFS_ENOTCONN;
7c673cae 14824
f67539c2 14825 std::scoped_lock lock(client_lock);
181888fb 14826
7c673cae
FG
14827 Fh *f = get_filehandle(fd);
14828 if (!f)
f67539c2 14829 return -CEPHFS_EBADF;
7c673cae
FG
14830 Inode *in = f->inode.get();
14831
14832 *lp = in->layout;
14833
11fdf7f2 14834 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
14835 return 0;
14836}
14837
d2e6a577
FG
14838int64_t Client::get_default_pool_id()
14839{
f67539c2
TL
14840 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14841 if (!mref_reader.is_state_satisfied())
14842 return -CEPHFS_ENOTCONN;
181888fb 14843
f67539c2 14844 std::scoped_lock lock(client_lock);
181888fb 14845
d2e6a577
FG
14846 /* first data pool is the default */
14847 return mdsmap->get_first_data_pool();
14848}
7c673cae
FG
14849
14850// expose osdmap
14851
14852int64_t Client::get_pool_id(const char *pool_name)
14853{
f67539c2
TL
14854 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14855 if (!mref_reader.is_state_satisfied())
14856 return -CEPHFS_ENOTCONN;
181888fb 14857
f67539c2 14858 std::scoped_lock lock(client_lock);
181888fb 14859
7c673cae
FG
14860 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14861 pool_name);
14862}
14863
14864string Client::get_pool_name(int64_t pool)
14865{
f67539c2
TL
14866 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14867 if (!mref_reader.is_state_satisfied())
181888fb
FG
14868 return string();
14869
f67539c2
TL
14870 std::scoped_lock lock(client_lock);
14871
7c673cae
FG
14872 return objecter->with_osdmap([pool](const OSDMap& o) {
14873 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14874 });
14875}
14876
14877int Client::get_pool_replication(int64_t pool)
14878{
f67539c2
TL
14879 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14880 if (!mref_reader.is_state_satisfied())
14881 return -CEPHFS_ENOTCONN;
181888fb 14882
f67539c2 14883 std::scoped_lock lock(client_lock);
181888fb 14884
7c673cae 14885 return objecter->with_osdmap([pool](const OSDMap& o) {
f67539c2 14886 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
7c673cae
FG
14887 });
14888}
14889
14890int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14891{
f67539c2
TL
14892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14893 if (!mref_reader.is_state_satisfied())
14894 return -CEPHFS_ENOTCONN;
7c673cae 14895
f67539c2 14896 std::scoped_lock lock(client_lock);
181888fb 14897
7c673cae
FG
14898 Fh *f = get_filehandle(fd);
14899 if (!f)
f67539c2 14900 return -CEPHFS_EBADF;
7c673cae
FG
14901 Inode *in = f->inode.get();
14902
14903 vector<ObjectExtent> extents;
14904 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 14905 ceph_assert(extents.size() == 1);
7c673cae
FG
14906
14907 objecter->with_osdmap([&](const OSDMap& o) {
14908 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14909 o.pg_to_acting_osds(pg, osds);
14910 });
14911
14912 if (osds.empty())
f67539c2 14913 return -CEPHFS_EINVAL;
7c673cae
FG
14914
14915 /*
14916 * Return the remainder of the extent (stripe unit)
14917 *
14918 * If length = 1 is passed to Striper::file_to_extents we get a single
14919 * extent back, but its length is one so we still need to compute the length
14920 * to the end of the stripe unit.
14921 *
14922 * If length = su then we may get 1 or 2 objects back in the extents vector
14923 * which would have to be examined. Even then, the offsets are local to the
14924 * object, so matching up to the file offset is extra work.
14925 *
14926 * It seems simpler to stick with length = 1 and manually compute the
14927 * remainder.
14928 */
14929 if (len) {
14930 uint64_t su = in->layout.stripe_unit;
14931 *len = su - (off % su);
14932 }
14933
14934 return 0;
14935}
14936
14937int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14938{
f67539c2
TL
14939 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14940 if (!mref_reader.is_state_satisfied())
14941 return -CEPHFS_ENOTCONN;
181888fb 14942
f67539c2 14943 std::scoped_lock lock(client_lock);
181888fb 14944
7c673cae 14945 if (id < 0)
f67539c2 14946 return -CEPHFS_EINVAL;
7c673cae
FG
14947 return objecter->with_osdmap([&](const OSDMap& o) {
14948 return o.crush->get_full_location_ordered(id, path);
14949 });
14950}
14951
14952int Client::get_file_stripe_address(int fd, loff_t offset,
14953 vector<entity_addr_t>& address)
14954{
f67539c2
TL
14955 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14956 if (!mref_reader.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN;
7c673cae 14958
f67539c2 14959 std::scoped_lock lock(client_lock);
181888fb 14960
7c673cae
FG
14961 Fh *f = get_filehandle(fd);
14962 if (!f)
f67539c2 14963 return -CEPHFS_EBADF;
7c673cae
FG
14964 Inode *in = f->inode.get();
14965
14966 // which object?
14967 vector<ObjectExtent> extents;
14968 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
14969 in->truncate_size, extents);
11fdf7f2 14970 ceph_assert(extents.size() == 1);
7c673cae
FG
14971
14972 // now we have the object and its 'layout'
14973 return objecter->with_osdmap([&](const OSDMap& o) {
14974 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14975 vector<int> osds;
14976 o.pg_to_acting_osds(pg, osds);
14977 if (osds.empty())
f67539c2 14978 return -CEPHFS_EINVAL;
7c673cae 14979 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 14980 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
14981 address.push_back(addr);
14982 }
14983 return 0;
14984 });
14985}
14986
14987int Client::get_osd_addr(int osd, entity_addr_t& addr)
14988{
f67539c2
TL
14989 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14990 if (!mref_reader.is_state_satisfied())
14991 return -CEPHFS_ENOTCONN;
181888fb 14992
f67539c2 14993 std::scoped_lock lock(client_lock);
181888fb 14994
7c673cae
FG
14995 return objecter->with_osdmap([&](const OSDMap& o) {
14996 if (!o.exists(osd))
f67539c2 14997 return -CEPHFS_ENOENT;
7c673cae 14998
11fdf7f2 14999 addr = o.get_addrs(osd).front();
7c673cae
FG
15000 return 0;
15001 });
15002}
15003
15004int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15005 loff_t length, loff_t offset)
15006{
f67539c2
TL
15007 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15008 if (!mref_reader.is_state_satisfied())
15009 return -CEPHFS_ENOTCONN;
7c673cae 15010
f67539c2 15011 std::scoped_lock lock(client_lock);
181888fb 15012
7c673cae
FG
15013 Fh *f = get_filehandle(fd);
15014 if (!f)
f67539c2 15015 return -CEPHFS_EBADF;
7c673cae
FG
15016 Inode *in = f->inode.get();
15017
15018 // map to a list of extents
15019 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15020
11fdf7f2 15021 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
15022 return 0;
15023}
15024
15025
f67539c2 15026/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
7c673cae
FG
15027int Client::get_local_osd()
15028{
f67539c2
TL
15029 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15030 if (!mref_reader.is_state_satisfied())
15031 return -CEPHFS_ENOTCONN;
181888fb 15032
f67539c2 15033 std::scoped_lock lock(client_lock);
181888fb 15034
7c673cae
FG
15035 objecter->with_osdmap([this](const OSDMap& o) {
15036 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 15037 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
15038 local_osd_epoch = o.get_epoch();
15039 }
15040 });
15041 return local_osd;
15042}
15043
15044
15045
15046
15047
15048
15049// ===============================
15050
15051void Client::ms_handle_connect(Connection *con)
15052{
11fdf7f2 15053 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15054}
15055
15056bool Client::ms_handle_reset(Connection *con)
15057{
11fdf7f2 15058 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15059 return false;
15060}
15061
15062void Client::ms_handle_remote_reset(Connection *con)
15063{
f67539c2 15064 std::scoped_lock lock(client_lock);
11fdf7f2 15065 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15066 switch (con->get_peer_type()) {
15067 case CEPH_ENTITY_TYPE_MDS:
15068 {
15069 // kludge to figure out which mds this is; fixme with a Connection* state
15070 mds_rank_t mds = MDS_RANK_NONE;
20effc67 15071 MetaSessionRef s = NULL;
11fdf7f2 15072 for (auto &p : mds_sessions) {
b3b6e05e 15073 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
11fdf7f2 15074 mds = p.first;
20effc67 15075 s = p.second;
7c673cae
FG
15076 }
15077 }
15078 if (mds >= 0) {
20effc67 15079 ceph_assert(s != NULL);
7c673cae
FG
15080 switch (s->state) {
15081 case MetaSession::STATE_CLOSING:
15082 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
20effc67 15083 _closed_mds_session(s.get());
7c673cae
FG
15084 break;
15085
15086 case MetaSession::STATE_OPENING:
15087 {
15088 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15089 list<Context*> waiters;
15090 waiters.swap(s->waiting_for_open);
20effc67
TL
15091 _closed_mds_session(s.get());
15092 auto news = _get_or_open_mds_session(mds);
7c673cae
FG
15093 news->waiting_for_open.swap(waiters);
15094 }
15095 break;
15096
15097 case MetaSession::STATE_OPEN:
15098 {
f67539c2 15099 objecter->maybe_request_map(); /* to check if we are blocklisted */
f6b5b4d7 15100 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
7c673cae 15101 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
20effc67 15102 _closed_mds_session(s.get());
7c673cae
FG
15103 } else {
15104 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15105 s->state = MetaSession::STATE_STALE;
15106 }
15107 }
15108 break;
15109
15110 case MetaSession::STATE_NEW:
15111 case MetaSession::STATE_CLOSED:
15112 default:
15113 break;
15114 }
15115 }
15116 }
15117 break;
15118 }
15119}
15120
15121bool Client::ms_handle_refused(Connection *con)
15122{
11fdf7f2 15123 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15124 return false;
15125}
15126
7c673cae
FG
15127Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15128{
11fdf7f2
TL
15129 Inode *quota_in = root_ancestor;
15130 SnapRealm *realm = in->snaprealm;
15131 while (realm) {
15132 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15133 if (realm->ino != in->ino) {
15134 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15135 if (p == inode_map.end())
15136 break;
7c673cae 15137
11fdf7f2
TL
15138 if (p->second->quota.is_enable()) {
15139 quota_in = p->second;
15140 break;
7c673cae 15141 }
7c673cae 15142 }
11fdf7f2 15143 realm = realm->pparent;
7c673cae 15144 }
11fdf7f2
TL
15145 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15146 return quota_in;
7c673cae
FG
15147}
15148
15149/**
15150 * Traverse quota ancestors of the Inode, return true
15151 * if any of them passes the passed function
15152 */
15153bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15154 std::function<bool (const Inode &in)> test)
15155{
15156 while (true) {
11fdf7f2 15157 ceph_assert(in != NULL);
7c673cae
FG
15158 if (test(*in)) {
15159 return true;
15160 }
15161
15162 if (in == root_ancestor) {
15163 // We're done traversing, drop out
15164 return false;
15165 } else {
15166 // Continue up the tree
15167 in = get_quota_root(in, perms);
15168 }
15169 }
15170
15171 return false;
15172}
15173
15174bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15175{
15176 return check_quota_condition(in, perms,
15177 [](const Inode &in) {
15178 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15179 });
15180}
15181
15182bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 15183 const UserPerm& perms)
7c673cae
FG
15184{
15185 return check_quota_condition(in, perms,
11fdf7f2 15186 [&new_bytes](const Inode &in) {
7c673cae
FG
15187 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15188 > in.quota.max_bytes;
15189 });
15190}
15191
11fdf7f2 15192bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 15193{
9f95a23c
TL
15194 ceph_assert(in->size >= in->reported_size);
15195 const uint64_t size = in->size - in->reported_size;
11fdf7f2 15196 return check_quota_condition(in, perms,
9f95a23c 15197 [&size](const Inode &in) {
11fdf7f2
TL
15198 if (in.quota.max_bytes) {
15199 if (in.rstat.rbytes >= in.quota.max_bytes) {
15200 return true;
15201 }
15202
11fdf7f2 15203 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
11fdf7f2
TL
15204 return (space >> 4) < size;
15205 } else {
15206 return false;
15207 }
15208 });
7c673cae
FG
15209}
15210
// Bit flags recorded per (pool, namespace) in pool_perms by
// check_pool_perm(): check status plus read/write access results.
enum {
  POOL_CHECKED  = 1 << 0,   // permission probe completed
  POOL_CHECKING = 1 << 1,   // probe in flight; waiters block on this
  POOL_READ     = 1 << 2,   // read access confirmed
  POOL_WRITE    = 1 << 3,   // write access confirmed
};
15217
/**
 * Verify that this client's OSD caps allow the requested access to the
 * data pool backing a regular file, caching the answer per
 * (pool id, pool namespace).
 *
 * The probe issues a stat (read check) and an exclusive create (write
 * check) against the file's first object; EPERM from either op means the
 * corresponding access is missing, while success (or ENOENT/EEXIST,
 * which prove the op was allowed to reach the object) confirms it.
 * Results are memoized in pool_perms, and concurrent callers block on
 * waiting_for_pool_perm while a probe is in flight.
 *
 * Preconditions: client_lock held (it is dropped around the OSD round
 * trips and re-taken).
 *
 * @param in    inode whose layout selects the pool/namespace to check
 * @param need  CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if access is allowed (or checking is disabled/skipped),
 *         -CEPHFS_EPERM if a needed access is missing,
 *         -CEPHFS_EIO if the probe failed with an unexpected error
 */
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in-flight so concurrent callers wait (see loop above).
    pool_perms[perm_key] = POOL_CHECKING;

    // First object of the file: "<ino in hex>.00000000".
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a plain stat, discarding the results.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
                     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create, so an existing object yields EEXIST
    // (still proof of write access) rather than being modified.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
                     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock for the OSD round trips.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // ENOENT means the stat was permitted but the object doesn't exist yet.
    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // EEXIST means the create was permitted but the object already exists.
    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Cache the verdict and wake anyone who piled up behind the probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
15326
15327int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15328{
15329 if (acl_type == POSIX_ACL) {
15330 if (in->xattrs.count(ACL_EA_ACCESS)) {
15331 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15332
15333 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15334 }
15335 }
f67539c2 15336 return -CEPHFS_EAGAIN;
7c673cae
FG
15337}
15338
/**
 * Rewrite the inode's POSIX access ACL to stay consistent with a new
 * file mode (the ACL mask/owner entries must track chmod).
 *
 * @param in     inode being chmod'ed
 * @param mode   the new mode bits
 * @param perms  credentials for the getattr/setxattr MDS requests
 * @return 0 on success (including "no ACL present"), negative error
 *         from _getattr / posix_acl_access_chmod / _do_setxattr
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Refresh xattrs; force a fetch when we have never seen any xattr
  // version for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy: posix_acl_access_chmod edits the buffer in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // No access ACL on this inode; plain mode bits suffice.
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
15364
/**
 * Compute the ACL xattrs a new file/directory should be created with,
 * based on the parent directory's default ACL, and adjust *mode
 * accordingly (ACL inheritance replaces the umask in that case).
 *
 * @param dir        parent directory
 * @param mode       in/out: requested mode; may be masked by the
 *                   inherited ACL or by the umask callback
 * @param xattrs_bl  out: encoded map of xattrs to set at create time
 * @param perms      credentials for the getattr MDS request
 * @return number of xattrs encoded into xattrs_bl (>= 0), or a
 *         negative error code
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Refresh the parent's xattrs; force a fetch if we have never seen an
  // xattr version for it.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      // Copy: posix_acl_inherit_mode may rewrite both the ACL buffer
      // and *mode.
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0 presumably means an inherited access ACL remains after
	// mode folding -- confirm against posix_acl_inherit_mode().
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // New directories also inherit the parent's default ACL verbatim.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via the callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
15412
15413void Client::set_filer_flags(int flags)
15414{
f67539c2 15415 std::scoped_lock l(client_lock);
11fdf7f2 15416 ceph_assert(flags == 0 ||
7c673cae
FG
15417 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15418 objecter->add_global_op_flags(flags);
15419}
15420
15421void Client::clear_filer_flags(int flags)
15422{
f67539c2 15423 std::scoped_lock l(client_lock);
11fdf7f2 15424 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
15425 objecter->clear_global_op_flag(flags);
15426}
15427
11fdf7f2
TL
// called before mount
/**
 * Record the uuid this client instance advertises to the MDS (consumed
 * by the session-reclaim protocol; start_reclaim() compares against
 * metadata["uuid"]).  Existing sessions are closed so they can be
 * re-established with the new metadata.
 */
void Client::set_uuid(const std::string& uuid)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);
  ceph_assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}
15440
// called before mount. 0 means infinite
/**
 * Advertise a session timeout to the MDS via the client metadata sent
 * at session open.
 */
void Client::set_session_timeout(unsigned timeout)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);

  metadata["timeout"] = stringify(timeout);
}
15451
// called before mount
/**
 * Reclaim the MDS sessions of a previous (dead) client instance that
 * advertised @uuid via set_uuid().
 *
 * Walks every in-MDS rank, opening a session where needed, and sends an
 * MClientReclaim to each; replies are handled by
 * handle_client_reclaim_reply(), which fills in reclaim_target_addrs /
 * reclaim_osd_epoch / reclaim_errno and wakes us.  Unless
 * CEPH_RECLAIM_RESET was requested, we then wait for the reclaim OSD
 * epoch and use the OSDMap blocklist to confirm that the target session
 * was actually killed before declaring success.
 *
 * @param uuid     uuid of the instance whose state is being reclaimed
 * @param flags    CEPH_RECLAIM_* flags (e.g. CEPH_RECLAIM_RESET)
 * @param fs_name  filesystem to subscribe the mdsmap for
 * @return 0 on success; -CEPHFS_ENOTCONN, -CEPHFS_EINVAL,
 *         -CEPHFS_EPERM, -CEPHFS_EOPNOTSUPP, -CEPHFS_ENOENT,
 *         -CEPHFS_ENOTRECOVERABLE or reclaim_errno on failure
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // Refuse to reclaim our own uuid.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Need a real mdsmap before we can enumerate ranks.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: mds is only advanced once the rank's reclaim succeeded; all
  // other paths loop back and retry (or return an error).
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply().
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  // A pure reset does not need the blocklist verification below.
  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  // Remember the uuid being reclaimed; finish_reclaim() adopts it.
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15552
/**
 * Finish (or abort) a reclaim started by start_reclaim(): reset every
 * session's reclaim state; if a reclaim was actually in progress
 * ("reclaiming_uuid" is set), also notify each MDS that the reclaim is
 * complete and adopt the reclaimed uuid as this client's own.
 */
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    // No reclaim in progress; just clear per-session state.
    for (auto &p : mds_sessions)
      p.second->reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second->reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second->con->send_message2(std::move(m));
  }

  // Take over the reclaimed identity.
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
15571
/**
 * Handle the MDS response to an MClientReclaim sent by start_reclaim().
 * Records the per-session reclaim result (and, on success, the target's
 * addrs and the OSD epoch to wait for), then wakes the thread blocked
 * on waiting_for_reclaim.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // Track the highest epoch any MDS asked us to wait for.
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
15597
7c673cae
FG
15598/**
15599 * This is included in cap release messages, to cause
15600 * the MDS to wait until this OSD map epoch. It is necessary
15601 * in corner cases where we cancel RADOS ops, so that
15602 * nobody else tries to do IO to the same objects in
15603 * the same epoch as the cancelled ops.
15604 */
15605void Client::set_cap_epoch_barrier(epoch_t e)
15606{
15607 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15608 cap_epoch_barrier = e;
15609}
15610
// Config-observer interface: the option names whose runtime changes we
// want to be notified about via handle_conf_change().
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    "client_oc_size",
    "client_oc_max_objects",
    "client_oc_max_dirty",
    "client_oc_target_dirty",
    "client_oc_max_dirty_age",
    NULL // sentinel terminating the list
  };
  return keys;
}
15628
11fdf7f2 15629void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
15630 const std::set <std::string> &changed)
15631{
f67539c2 15632 std::scoped_lock lock(client_lock);
7c673cae 15633
181888fb 15634 if (changed.count("client_cache_mid")) {
7c673cae
FG
15635 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15636 }
15637 if (changed.count("client_acl_type")) {
15638 acl_type = NO_ACL;
15639 if (cct->_conf->client_acl_type == "posix_acl")
15640 acl_type = POSIX_ACL;
15641 }
f67539c2
TL
15642 if (changed.count("client_oc_size")) {
15643 objectcacher->set_max_size(cct->_conf->client_oc_size);
15644 }
15645 if (changed.count("client_oc_max_objects")) {
15646 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15647 }
15648 if (changed.count("client_oc_max_dirty")) {
15649 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15650 }
15651 if (changed.count("client_oc_target_dirty")) {
15652 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15653 }
15654 if (changed.count("client_oc_max_dirty_age")) {
15655 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15656 }
7c673cae
FG
15657}
15658
7c673cae
FG
// intrusive_ptr refcount hook for Inode: take a reference.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
f67539c2 15663
7c673cae
FG
// intrusive_ptr refcount hook for Inode: drop a reference via the owning
// client (put_inode presumably frees the inode when the last ref goes).
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15668
15669mds_rank_t Client::_get_random_up_mds() const
15670{
9f95a23c 15671 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
15672
15673 std::set<mds_rank_t> up;
15674 mdsmap->get_up_mds_set(up);
15675
15676 if (up.empty())
15677 return MDS_RANK_NONE;
15678 std::set<mds_rank_t>::const_iterator p = up.begin();
15679 for (int n = rand() % up.size(); n; n--)
15680 ++p;
15681 return *p;
15682}
15683
15684
f67539c2
TL
/**
 * A Client that constructs and owns its own Objecter (the base Client
 * ctor is handed a freshly allocated one; see ~StandaloneClient for the
 * matching delete).
 */
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
15692
StandaloneClient::~StandaloneClient()
{
  // We allocated the objecter in our ctor, so we free it; clear the
  // pointer so nothing dereferences it afterwards.
  delete objecter;
  objecter = nullptr;
}
15698
/**
 * Bring up a standalone client: wire the objecter and ourselves into the
 * messenger, initialize the monitor client, then finish generic Client
 * initialization.
 *
 * @return 0 on success, or the monclient init error (after unwinding
 *         the partially constructed state)
 */
int StandaloneClient::init()
{
  // Only the first caller may perform initialization.
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    // Unlock before tearing down the components we already started.
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
15735
void StandaloneClient::shutdown()
{
  // Tear down in reverse order of init(): generic client state first,
  // then the services it was using.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}