git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
f67539c2 26#ifndef _WIN32
7c673cae 27#include <sys/utsname.h>
f67539c2 28#endif
7c673cae
FG
29#include <sys/uio.h>
30
31#include <boost/lexical_cast.hpp>
32#include <boost/fusion/include/std_pair.hpp>
33
f67539c2
TL
34#include "common/async/waiter.h"
35
36#if defined(__FreeBSD__) || defined(_WIN32)
7c673cae
FG
37#define XATTR_CREATE 0x1
38#define XATTR_REPLACE 0x2
39#else
40#include <sys/xattr.h>
41#endif
42
43#if defined(__linux__)
44#include <linux/falloc.h>
45#endif
46
47#include <sys/statvfs.h>
48
49#include "common/config.h"
50#include "common/version.h"
f67539c2 51#include "common/async/blocked_completion.h"
7c673cae 52
11fdf7f2
TL
53#include "mon/MonClient.h"
54
55#include "messages/MClientCaps.h"
56#include "messages/MClientLease.h"
57#include "messages/MClientQuota.h"
58#include "messages/MClientReclaim.h"
59#include "messages/MClientReclaimReply.h"
7c673cae 60#include "messages/MClientReconnect.h"
11fdf7f2 61#include "messages/MClientReply.h"
7c673cae
FG
62#include "messages/MClientRequest.h"
63#include "messages/MClientRequestForward.h"
11fdf7f2 64#include "messages/MClientSession.h"
7c673cae 65#include "messages/MClientSnap.h"
f67539c2 66#include "messages/MClientMetrics.h"
7c673cae 67#include "messages/MCommandReply.h"
7c673cae
FG
68#include "messages/MFSMap.h"
69#include "messages/MFSMapUser.h"
11fdf7f2
TL
70#include "messages/MMDSMap.h"
71#include "messages/MOSDMap.h"
7c673cae
FG
72
73#include "mds/flock.h"
11fdf7f2 74#include "mds/cephfs_features.h"
7c673cae
FG
75#include "osd/OSDMap.h"
76#include "osdc/Filer.h"
77
78#include "common/Cond.h"
7c673cae
FG
79#include "common/perf_counters.h"
80#include "common/admin_socket.h"
81#include "common/errno.h"
82#include "include/str_list.h"
83
84#define dout_subsys ceph_subsys_client
85
86#include "include/lru.h"
87#include "include/compat.h"
88#include "include/stringify.h"
f67539c2 89#include "include/random.h"
7c673cae
FG
90
91#include "Client.h"
92#include "Inode.h"
93#include "Dentry.h"
b32b8144 94#include "Delegation.h"
7c673cae
FG
95#include "Dir.h"
96#include "ClientSnapRealm.h"
97#include "Fh.h"
98#include "MetaSession.h"
99#include "MetaRequest.h"
100#include "ObjecterWriteback.h"
101#include "posix_acl.h"
102
11fdf7f2 103#include "include/ceph_assert.h"
7c673cae
FG
104#include "include/stat.h"
105
e306af50 106#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
107
108#if HAVE_GETGROUPLIST
109#include <grp.h>
110#include <pwd.h>
111#include <unistd.h>
112#endif
113
114#undef dout_prefix
115#define dout_prefix *_dout << "client." << whoami << " "
116
117#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119// FreeBSD fails to define this
120#ifndef O_DSYNC
121#define O_DSYNC 0x0
122#endif
123// Darwin fails to define this
124#ifndef O_RSYNC
125#define O_RSYNC 0x0
126#endif
127
128#ifndef O_DIRECT
129#define O_DIRECT 0x0
130#endif
131
f67539c2
TL
132// Windows doesn't define those values. While the Posix compatibilty layer
133// doesn't support those values, the Windows native functions do provide
134// similar flags. Special care should be taken if we're going to use those
135// flags in ceph-dokan. The current values are no-ops, while propagating
136// them to the rest of the code might cause the Windows functions to reject
137// them as invalid.
138#ifndef O_NOFOLLOW
139#define O_NOFOLLOW 0x0
140#endif
141
142#ifndef O_SYNC
143#define O_SYNC 0x0
144#endif
145
7c673cae
FG
146#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
b3b6e05e
TL
148#ifndef S_IXUGO
149#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150#endif
151
adb31ebb
TL
152using namespace TOPNSPC::common;
153
f67539c2
TL
154namespace bs = boost::system;
155namespace ca = ceph::async;
156
7c673cae
FG
157void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
158{
159 Client *client = static_cast<Client*>(p);
160 client->flush_set_callback(oset);
161}
162
b3b6e05e
TL
163bool Client::is_reserved_vino(vinodeno_t &vino) {
164 if (MDS_IS_PRIVATE_INO(vino.ino)) {
165 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
166 return true;
167 }
168 return false;
169}
170
7c673cae
FG
171
172// -------------
173
174Client::CommandHook::CommandHook(Client *client) :
175 m_client(client)
176{
177}
178
9f95a23c
TL
179int Client::CommandHook::call(
180 std::string_view command,
181 const cmdmap_t& cmdmap,
182 Formatter *f,
183 std::ostream& errss,
184 bufferlist& out)
7c673cae 185{
7c673cae 186 f->open_object_section("result");
9f95a23c 187 {
f67539c2 188 std::scoped_lock l{m_client->client_lock};
9f95a23c
TL
189 if (command == "mds_requests")
190 m_client->dump_mds_requests(f);
adb31ebb
TL
191 else if (command == "mds_sessions") {
192 bool cap_dump = false;
193 cmd_getval(cmdmap, "cap_dump", cap_dump);
194 m_client->dump_mds_sessions(f, cap_dump);
195 } else if (command == "dump_cache")
9f95a23c
TL
196 m_client->dump_cache(f);
197 else if (command == "kick_stale_sessions")
198 m_client->_kick_stale_sessions();
199 else if (command == "status")
200 m_client->dump_status(f);
201 else
202 ceph_abort_msg("bad command registered");
203 }
7c673cae 204 f->close_section();
9f95a23c 205 return 0;
7c673cae
FG
206}
207
208
209// -------------
210
b3b6e05e
TL
211int Client::get_fd_inode(int fd, InodeRef *in) {
212 int r = 0;
213 if (fd == CEPHFS_AT_FDCWD) {
214 *in = cwd;
215 } else {
216 Fh *f = get_filehandle(fd);
217 if (!f) {
218 r = -CEPHFS_EBADF;
219 } else {
220 *in = f->inode;
221 }
222 }
223 return r;
224}
225
7c673cae
FG
226dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
227 : inode(in), offset(0), next_offset(2),
228 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
229 perms(perms)
230 { }
231
232void Client::_reset_faked_inos()
233{
234 ino_t start = 1024;
235 free_faked_inos.clear();
236 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
237 last_used_faked_ino = 0;
11fdf7f2 238 last_used_faked_root = 0;
f67539c2
TL
239 #ifdef _WIN32
240 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
241 // Windows structures, including Dokan ones, are using 64B identifiers.
242 _use_faked_inos = false;
243 #else
7c673cae 244 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
f67539c2 245 #endif
7c673cae
FG
246}
247
248void Client::_assign_faked_ino(Inode *in)
249{
11fdf7f2
TL
250 if (0 == last_used_faked_ino)
251 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
252 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
253 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 254 last_used_faked_ino = 2048;
7c673cae
FG
255 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
256 }
11fdf7f2 257 ceph_assert(it != free_faked_inos.end());
7c673cae 258 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 259 ceph_assert(it.get_len() > 0);
7c673cae
FG
260 last_used_faked_ino = it.get_start();
261 } else {
262 ++last_used_faked_ino;
11fdf7f2 263 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
264 }
265 in->faked_ino = last_used_faked_ino;
266 free_faked_inos.erase(in->faked_ino);
267 faked_ino_map[in->faked_ino] = in->vino();
268}
269
11fdf7f2
TL
270/*
271 * In the faked mode, if you export multiple subdirectories,
272 * you will see that the inode numbers of the exported subdirectories
273 * are the same. so we distinguish the mount point by reserving
274 * the "fake ids" between "1024~2048" and combining the last
275 * 10bits(0x3ff) of the "root inodes".
276*/
277void Client::_assign_faked_root(Inode *in)
278{
279 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
280 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
281 last_used_faked_root = 0;
282 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
283 }
284 assert(it != free_faked_inos.end());
285 vinodeno_t inode_info = in->vino();
286 uint64_t inode_num = (uint64_t)inode_info.ino;
287 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
288 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
289 assert(it.get_start() + it.get_len() > last_used_faked_root);
290
291 in->faked_ino = last_used_faked_root;
292 free_faked_inos.erase(in->faked_ino);
293 faked_ino_map[in->faked_ino] = in->vino();
294}
295
7c673cae
FG
296void Client::_release_faked_ino(Inode *in)
297{
298 free_faked_inos.insert(in->faked_ino);
299 faked_ino_map.erase(in->faked_ino);
300}
301
302vinodeno_t Client::_map_faked_ino(ino_t ino)
303{
304 vinodeno_t vino;
305 if (ino == 1)
306 vino = root->vino();
307 else if (faked_ino_map.count(ino))
308 vino = faked_ino_map[ino];
309 else
310 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 311 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
312 return vino;
313}
314
315vinodeno_t Client::map_faked_ino(ino_t ino)
316{
f67539c2 317 std::scoped_lock lock(client_lock);
7c673cae
FG
318 return _map_faked_ino(ino);
319}
320
321// cons/des
322
323Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
f67539c2
TL
324 : Dispatcher(m->cct->get()),
325 timer(m->cct, timer_lock, false),
11fdf7f2
TL
326 messenger(m),
327 monclient(mc),
328 objecter(objecter_),
329 whoami(mc->get_global_id()),
f67539c2
TL
330 mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
331 initialize_state(CLIENT_NEW, "Client::initstate_lock"),
332 cct_deleter{m->cct, [](CephContext *p) {p->put();}},
7c673cae
FG
333 async_ino_invalidator(m->cct),
334 async_dentry_invalidator(m->cct),
335 interrupt_finisher(m->cct),
336 remount_finisher(m->cct),
e306af50 337 async_ino_releasor(m->cct),
7c673cae 338 objecter_finisher(m->cct),
11fdf7f2
TL
339 m_command_hook(this),
340 fscid(0)
7c673cae
FG
341{
342 _reset_faked_inos();
7c673cae 343
7c673cae
FG
344 user_id = cct->_conf->client_mount_uid;
345 group_id = cct->_conf->client_mount_gid;
92f5a8d4
TL
346 fuse_default_permissions = cct->_conf.get_val<bool>(
347 "fuse_default_permissions");
7c673cae 348
7c673cae
FG
349 if (cct->_conf->client_acl_type == "posix_acl")
350 acl_type = POSIX_ACL;
351
7c673cae
FG
352 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
353
354 // file handles
355 free_fd_set.insert(10, 1<<30);
356
357 mdsmap.reset(new MDSMap);
358
359 // osd interfaces
360 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
361 &client_lock));
362 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
363 client_flush_set_callback, // all commit callback
364 (void*)this,
365 cct->_conf->client_oc_size,
366 cct->_conf->client_oc_max_objects,
367 cct->_conf->client_oc_max_dirty,
368 cct->_conf->client_oc_target_dirty,
369 cct->_conf->client_oc_max_dirty_age,
370 true));
7c673cae
FG
371}
372
373
374Client::~Client()
375{
9f95a23c 376 ceph_assert(ceph_mutex_is_not_locked(client_lock));
7c673cae 377
f67539c2
TL
378 // If the task is crashed or aborted and doesn't
379 // get any chance to run the umount and shutdow.
380 {
381 std::scoped_lock l{client_lock};
382 tick_thread_stopped = true;
383 upkeep_cond.notify_one();
384 }
385
386 if (upkeeper.joinable())
387 upkeeper.join();
388
31f18b77
FG
389 // It is necessary to hold client_lock, because any inode destruction
390 // may call into ObjectCacher, which asserts that it's lock (which is
391 // client_lock) is held.
f67539c2 392 std::scoped_lock l{client_lock};
7c673cae
FG
393 tear_down_cache();
394}
395
396void Client::tear_down_cache()
397{
398 // fd's
f67539c2
TL
399 for (auto &[fd, fh] : fd_map) {
400 ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
7c673cae
FG
401 _release_fh(fh);
402 }
403 fd_map.clear();
404
405 while (!opened_dirs.empty()) {
406 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 407 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
408 _closedir(dirp);
409 }
410
411 // caps!
412 // *** FIXME ***
413
414 // empty lru
7c673cae 415 trim_cache();
11fdf7f2 416 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
417
418 // close root ino
11fdf7f2 419 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae 420 if (root && inode_map.size() == 1 + root_parents.size()) {
b3b6e05e 421 root.reset();
7c673cae
FG
422 }
423
11fdf7f2 424 ceph_assert(inode_map.empty());
7c673cae
FG
425}
426
427inodeno_t Client::get_root_ino()
428{
f67539c2 429 std::scoped_lock l(client_lock);
7c673cae
FG
430 if (use_faked_inos())
431 return root->faked_ino;
432 else
433 return root->ino;
434}
435
436Inode *Client::get_root()
437{
f67539c2 438 std::scoped_lock l(client_lock);
7c673cae 439 root->ll_get();
b3b6e05e 440 return root.get();
7c673cae
FG
441}
442
443
444// debug crapola
445
446void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
447{
448 filepath path;
449 in->make_long_path(path);
450 ldout(cct, 1) << "dump_inode: "
451 << (disconnected ? "DISCONNECTED ":"")
452 << "inode " << in->ino
453 << " " << path
b3b6e05e 454 << " ref " << in->get_nref()
f67539c2 455 << " " << *in << dendl;
7c673cae
FG
456
457 if (f) {
458 f->open_object_section("inode");
459 f->dump_stream("path") << path;
460 if (disconnected)
461 f->dump_int("disconnected", 1);
462 in->dump(f);
463 f->close_section();
464 }
465
466 did.insert(in);
467 if (in->dir) {
468 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
469 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
470 it != in->dir->dentries.end();
471 ++it) {
472 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
473 if (f) {
474 f->open_object_section("dentry");
475 it->second->dump(f);
476 f->close_section();
477 }
478 if (it->second->inode)
479 dump_inode(f, it->second->inode.get(), did, false);
480 }
481 }
482}
483
484void Client::dump_cache(Formatter *f)
485{
486 set<Inode*> did;
487
11fdf7f2 488 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
489
490 if (f)
491 f->open_array_section("cache");
492
493 if (root)
b3b6e05e 494 dump_inode(f, root.get(), did, true);
7c673cae
FG
495
496 // make a second pass to catch anything disconnected
497 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
498 it != inode_map.end();
499 ++it) {
500 if (did.count(it->second))
501 continue;
502 dump_inode(f, it->second, did, true);
503 }
504
505 if (f)
506 f->close_section();
507}
508
509void Client::dump_status(Formatter *f)
510{
9f95a23c 511 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
512
513 ldout(cct, 1) << __func__ << dendl;
514
515 const epoch_t osd_epoch
516 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
517
518 if (f) {
519 f->open_object_section("metadata");
520 for (const auto& kv : metadata)
521 f->dump_string(kv.first.c_str(), kv.second);
522 f->close_section();
523
524 f->dump_int("dentry_count", lru.lru_get_size());
525 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
526 f->dump_int("id", get_nodeid().v);
11fdf7f2 527 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 528 f->dump_object("inst", inst);
11fdf7f2
TL
529 f->dump_object("addr", inst.addr);
530 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
531 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
532 f->dump_int("inode_count", inode_map.size());
533 f->dump_int("mds_epoch", mdsmap->get_epoch());
534 f->dump_int("osd_epoch", osd_epoch);
535 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f67539c2 536 f->dump_bool("blocklisted", blocklisted);
adb31ebb 537 f->dump_string("fs_name", mdsmap->get_fs_name());
7c673cae
FG
538 }
539}
540
e306af50 541void Client::_pre_init()
7c673cae
FG
542{
543 timer.init();
e306af50
TL
544
545 objecter_finisher.start();
546 filer.reset(new Filer(objecter, &objecter_finisher));
f67539c2 547 objecter->enable_blocklist_events();
e306af50 548
7c673cae 549 objectcacher->start();
e306af50
TL
550}
551
552int Client::init()
553{
f67539c2
TL
554 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
555 ceph_assert(iref_writer.is_first_writer());
556
e306af50 557 _pre_init();
9f95a23c 558 {
f67539c2 559 std::scoped_lock l{client_lock};
9f95a23c
TL
560 messenger->add_dispatcher_tail(this);
561 }
7c673cae 562 _finish_init();
f67539c2 563 iref_writer.update_state(CLIENT_INITIALIZED);
7c673cae
FG
564 return 0;
565}
566
567void Client::_finish_init()
568{
9f95a23c 569 {
f67539c2 570 std::scoped_lock l{client_lock};
9f95a23c
TL
571 // logger
572 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
573 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
574 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
575 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
576 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
577 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
578 logger.reset(plb.create_perf_counters());
579 cct->get_perfcounters_collection()->add(logger.get());
580 }
7c673cae 581
11fdf7f2 582 cct->_conf.add_observer(this);
7c673cae
FG
583
584 AdminSocket* admin_socket = cct->get_admin_socket();
585 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
586 &m_command_hook,
587 "show in-progress mds requests");
588 if (ret < 0) {
589 lderr(cct) << "error registering admin socket command: "
590 << cpp_strerror(-ret) << dendl;
591 }
adb31ebb
TL
592 ret = admin_socket->register_command("mds_sessions "
593 "name=cap_dump,type=CephBool,req=false",
7c673cae
FG
594 &m_command_hook,
595 "show mds session state");
596 if (ret < 0) {
597 lderr(cct) << "error registering admin socket command: "
598 << cpp_strerror(-ret) << dendl;
599 }
600 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
601 &m_command_hook,
602 "show in-memory metadata cache contents");
603 if (ret < 0) {
604 lderr(cct) << "error registering admin socket command: "
605 << cpp_strerror(-ret) << dendl;
606 }
607 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
608 &m_command_hook,
609 "kick sessions that were remote reset");
610 if (ret < 0) {
611 lderr(cct) << "error registering admin socket command: "
612 << cpp_strerror(-ret) << dendl;
613 }
614 ret = admin_socket->register_command("status",
7c673cae
FG
615 &m_command_hook,
616 "show overall client status");
617 if (ret < 0) {
618 lderr(cct) << "error registering admin socket command: "
619 << cpp_strerror(-ret) << dendl;
620 }
7c673cae
FG
621}
622
623void Client::shutdown()
624{
11fdf7f2 625 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
626
627 // If we were not mounted, but were being used for sending
628 // MDS commands, we may have sessions that need closing.
9f95a23c 629 {
f67539c2
TL
630 std::scoped_lock l{client_lock};
631
632 // To make sure the tick thread will be stoppped before
633 // destructing the Client, just in case like the _mount()
634 // failed but didn't not get a chance to stop the tick
635 // thread
636 tick_thread_stopped = true;
637 upkeep_cond.notify_one();
638
9f95a23c
TL
639 _close_sessions();
640 }
11fdf7f2 641 cct->_conf.remove_observer(this);
7c673cae 642
11fdf7f2 643 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
644
645 if (ino_invalidate_cb) {
646 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
647 async_ino_invalidator.wait_for_empty();
648 async_ino_invalidator.stop();
649 }
650
651 if (dentry_invalidate_cb) {
652 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
653 async_dentry_invalidator.wait_for_empty();
654 async_dentry_invalidator.stop();
655 }
656
657 if (switch_interrupt_cb) {
658 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
659 interrupt_finisher.wait_for_empty();
660 interrupt_finisher.stop();
661 }
662
663 if (remount_cb) {
664 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
665 remount_finisher.wait_for_empty();
666 remount_finisher.stop();
667 }
668
e306af50
TL
669 if (ino_release_cb) {
670 ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
671 async_ino_releasor.wait_for_empty();
672 async_ino_releasor.stop();
673 }
674
7c673cae 675 objectcacher->stop(); // outside of client_lock! this does a join.
f67539c2
TL
676
677 /*
678 * We are shuting down the client.
679 *
680 * Just declare the state to CLIENT_NEW to block and fail any
681 * new comming "reader" and then try to wait all the in-flight
682 * "readers" to finish.
683 */
684 RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
685 if (!iref_writer.is_first_writer())
686 return;
687 iref_writer.wait_readers_done();
688
9f95a23c 689 {
f67539c2 690 std::scoped_lock l(timer_lock);
9f95a23c
TL
691 timer.shutdown();
692 }
f67539c2 693
7c673cae
FG
694 objecter_finisher.wait_for_empty();
695 objecter_finisher.stop();
696
697 if (logger) {
698 cct->get_perfcounters_collection()->remove(logger.get());
699 logger.reset();
700 }
701}
702
703
704// ===================
705// metadata cache stuff
706
707void Client::trim_cache(bool trim_kernel_dcache)
708{
181888fb
FG
709 uint64_t max = cct->_conf->client_cache_size;
710 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
711 unsigned last = 0;
712 while (lru.lru_get_size() != last) {
713 last = lru.lru_get_size();
714
f67539c2 715 if (!is_unmounting() && lru.lru_get_size() <= max) break;
7c673cae
FG
716
717 // trim!
31f18b77 718 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
719 if (!dn)
720 break; // done
f67539c2 721
7c673cae
FG
722 trim_dentry(dn);
723 }
724
181888fb 725 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
726 _invalidate_kernel_dcache();
727
728 // hose root?
b3b6e05e 729 if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
7c673cae 730 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
b3b6e05e 731 root.reset();
7c673cae
FG
732 }
733}
734
735void Client::trim_cache_for_reconnect(MetaSession *s)
736{
737 mds_rank_t mds = s->mds_num;
11fdf7f2 738 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
739
740 int trimmed = 0;
741 list<Dentry*> skipped;
742 while (lru.lru_get_size() > 0) {
743 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
744 if (!dn)
745 break;
746
747 if ((dn->inode && dn->inode->caps.count(mds)) ||
748 dn->dir->parent_inode->caps.count(mds)) {
749 trim_dentry(dn);
750 trimmed++;
751 } else
752 skipped.push_back(dn);
753 }
754
755 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
756 lru.lru_insert_mid(*p);
757
11fdf7f2 758 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
759 << " trimmed " << trimmed << " dentries" << dendl;
760
761 if (s->caps.size() > 0)
762 _invalidate_kernel_dcache();
763}
764
765void Client::trim_dentry(Dentry *dn)
766{
767 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
768 << " in dir "
769 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
770 << dendl;
771 if (dn->inode) {
772 Inode *diri = dn->dir->parent_inode;
7c673cae
FG
773 clear_dir_complete_and_ordered(diri, true);
774 }
775 unlink(dn, false, false); // drop dir, drop dentry
776}
777
778
1adf2230
AA
779void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
780 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 781{
7c673cae
FG
782 uint64_t prior_size = in->size;
783
7c673cae
FG
784 if (truncate_seq > in->truncate_seq ||
785 (truncate_seq == in->truncate_seq && size > in->size)) {
786 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
787 in->size = size;
788 in->reported_size = size;
789 if (truncate_seq != in->truncate_seq) {
790 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
791 << truncate_seq << dendl;
792 in->truncate_seq = truncate_seq;
793 in->oset.truncate_seq = truncate_seq;
794
795 // truncate cached file data
796 if (prior_size > size) {
797 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
798 }
799 }
800
801 // truncate inline data
802 if (in->inline_version < CEPH_INLINE_NONE) {
803 uint32_t len = in->inline_data.length();
804 if (size < len)
805 in->inline_data.splice(size, len - size);
806 }
807 }
808 if (truncate_seq >= in->truncate_seq &&
809 in->truncate_size != truncate_size) {
810 if (in->is_file()) {
811 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
812 << truncate_size << dendl;
813 in->truncate_size = truncate_size;
814 in->oset.truncate_size = truncate_size;
815 } else {
816 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
817 }
818 }
1adf2230
AA
819}
820
821void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
822 utime_t ctime, utime_t mtime, utime_t atime)
823{
824 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
825 << " ctime " << ctime << " mtime " << mtime << dendl;
826
827 if (time_warp_seq > in->time_warp_seq)
828 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
829 << " is higher than local time_warp_seq "
830 << in->time_warp_seq << dendl;
831
832 int warn = false;
7c673cae
FG
833 // be careful with size, mtime, atime
834 if (issued & (CEPH_CAP_FILE_EXCL|
835 CEPH_CAP_FILE_WR|
836 CEPH_CAP_FILE_BUFFER|
837 CEPH_CAP_AUTH_EXCL|
838 CEPH_CAP_XATTR_EXCL)) {
839 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
840 if (ctime > in->ctime)
841 in->ctime = ctime;
842 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
843 //the mds updated times, so take those!
844 in->mtime = mtime;
845 in->atime = atime;
846 in->time_warp_seq = time_warp_seq;
847 } else if (time_warp_seq == in->time_warp_seq) {
848 //take max times
849 if (mtime > in->mtime)
850 in->mtime = mtime;
851 if (atime > in->atime)
852 in->atime = atime;
853 } else if (issued & CEPH_CAP_FILE_EXCL) {
854 //ignore mds values as we have a higher seq
855 } else warn = true;
856 } else {
857 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
858 if (time_warp_seq >= in->time_warp_seq) {
859 in->ctime = ctime;
860 in->mtime = mtime;
861 in->atime = atime;
862 in->time_warp_seq = time_warp_seq;
863 } else warn = true;
864 }
865 if (warn) {
866 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
867 << time_warp_seq << " is lower than local time_warp_seq "
868 << in->time_warp_seq
869 << dendl;
870 }
871}
872
873void Client::_fragmap_remove_non_leaves(Inode *in)
874{
875 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
876 if (!in->dirfragtree.is_leaf(p->first))
877 in->fragmap.erase(p++);
878 else
879 ++p;
880}
881
882void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
883{
884 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
885 if (p->second == mds)
886 in->fragmap.erase(p++);
887 else
888 ++p;
889}
890
891Inode * Client::add_update_inode(InodeStat *st, utime_t from,
892 MetaSession *session,
893 const UserPerm& request_perms)
894{
895 Inode *in;
896 bool was_new = false;
897 if (inode_map.count(st->vino)) {
898 in = inode_map[st->vino];
11fdf7f2 899 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
900 } else {
901 in = new Inode(this, st->vino, &st->layout);
902 inode_map[st->vino] = in;
903
904 if (use_faked_inos())
905 _assign_faked_ino(in);
906
907 if (!root) {
908 root = in;
11fdf7f2 909 if (use_faked_inos())
b3b6e05e 910 _assign_faked_root(root.get());
7c673cae
FG
911 root_ancestor = in;
912 cwd = root;
f67539c2 913 } else if (is_mounting()) {
7c673cae
FG
914 root_parents[root_ancestor] = in;
915 root_ancestor = in;
916 }
917
918 // immutable bits
919 in->ino = st->vino.ino;
920 in->snapid = st->vino.snapid;
921 in->mode = st->mode & S_IFMT;
922 was_new = true;
923 }
924
925 in->rdev = st->rdev;
926 if (in->is_symlink())
927 in->symlink = st->symlink;
928
7c673cae 929 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
930 bool new_version = false;
931 if (in->version == 0 ||
932 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
933 (in->version & ~1) < st->version))
934 new_version = true;
7c673cae 935
1adf2230
AA
936 int issued;
937 in->caps_issued(&issued);
938 issued |= in->caps_dirty();
939 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 940
1adf2230
AA
941 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
942 !(issued & CEPH_CAP_AUTH_EXCL)) {
943 in->mode = st->mode;
944 in->uid = st->uid;
945 in->gid = st->gid;
946 in->btime = st->btime;
81eedcae 947 in->snap_btime = st->snap_btime;
f67539c2 948 in->snap_metadata = st->snap_metadata;
1adf2230 949 }
7c673cae 950
1adf2230
AA
951 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
952 !(issued & CEPH_CAP_LINK_EXCL)) {
953 in->nlink = st->nlink;
954 }
7c673cae 955
1adf2230
AA
956 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
957 update_inode_file_time(in, issued, st->time_warp_seq,
958 st->ctime, st->mtime, st->atime);
959 }
7c673cae 960
1adf2230
AA
961 if (new_version ||
962 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 963 in->layout = st->layout;
1adf2230
AA
964 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
965 }
7c673cae 966
1adf2230
AA
967 if (in->is_dir()) {
968 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
969 in->dirstat = st->dirstat;
970 }
971 // dir_layout/rstat/quota are not tracked by capability, update them only if
972 // the inode stat is from auth mds
973 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
974 in->dir_layout = st->dir_layout;
975 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
976 in->rstat = st->rstat;
977 in->quota = st->quota;
11fdf7f2 978 in->dir_pin = st->dir_pin;
1adf2230
AA
979 }
980 // move me if/when version reflects fragtree changes.
981 if (in->dirfragtree != st->dirfragtree) {
982 in->dirfragtree = st->dirfragtree;
983 _fragmap_remove_non_leaves(in);
7c673cae 984 }
7c673cae
FG
985 }
986
987 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
988 st->xattrbl.length() &&
989 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
990 auto p = st->xattrbl.cbegin();
991 decode(in->xattrs, p);
7c673cae
FG
992 in->xattr_version = st->xattr_version;
993 }
994
1adf2230
AA
995 if (st->inline_version > in->inline_version) {
996 in->inline_data = st->inline_data;
997 in->inline_version = st->inline_version;
7c673cae
FG
998 }
999
1adf2230
AA
1000 /* always take a newer change attr */
1001 if (st->change_attr > in->change_attr)
1002 in->change_attr = st->change_attr;
1003
1004 if (st->version > in->version)
1005 in->version = st->version;
1006
1007 if (was_new)
1008 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
1009
1010 if (!st->cap.caps)
1011 return in; // as with readdir returning indoes in different snaprealms (no caps!)
1012
7c673cae 1013 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
1014 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
1015 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
1016 st->cap.flags, request_perms);
28e407b8 1017 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 1018 in->max_size = st->max_size;
28e407b8
AA
1019 in->rstat = st->rstat;
1020 }
7c673cae 1021
1adf2230
AA
1022 // setting I_COMPLETE needs to happen after adding the cap
1023 if (in->is_dir() &&
1024 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
1025 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
1026 in->dirstat.nfiles == 0 &&
1027 in->dirstat.nsubdirs == 0) {
1028 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
1029 in->flags |= I_COMPLETE | I_DIR_ORDERED;
1030 if (in->dir) {
1031 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
1032 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
1033 in->dir->readdir_cache.clear();
1034 for (const auto& p : in->dir->dentries) {
1035 unlink(p.second, true, true); // keep dir, keep dentry
1036 }
1037 if (in->dir->dentries.empty())
1038 close_dir(in->dir);
7c673cae 1039 }
7c673cae 1040 }
1adf2230
AA
1041 } else {
1042 in->snap_caps |= st->cap.caps;
7c673cae
FG
1043 }
1044
f67539c2 1045 in->fscrypt = st->fscrypt;
7c673cae
FG
1046 return in;
1047}
1048
1049
1050/*
1051 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1052 */
1053Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
1054 Inode *in, utime_t from, MetaSession *session,
1055 Dentry *old_dentry)
1056{
1057 Dentry *dn = NULL;
1058 if (dir->dentries.count(dname))
1059 dn = dir->dentries[dname];
1060
11fdf7f2 1061 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
1062 << " in dir " << dir->parent_inode->vino() << " dn " << dn
1063 << dendl;
1064
1065 if (dn && dn->inode) {
1066 if (dn->inode->vino() == in->vino()) {
1067 touch_dn(dn);
1068 ldout(cct, 12) << " had dentry " << dname
1069 << " with correct vino " << dn->inode->vino()
1070 << dendl;
1071 } else {
1072 ldout(cct, 12) << " had dentry " << dname
1073 << " with WRONG vino " << dn->inode->vino()
1074 << dendl;
1075 unlink(dn, true, true); // keep dir, keep dentry
1076 }
1077 }
1078
1079 if (!dn || !dn->inode) {
1080 InodeRef tmp_ref(in);
1081 if (old_dentry) {
1082 if (old_dentry->dir != dir) {
1083 Inode *old_diri = old_dentry->dir->parent_inode;
7c673cae
FG
1084 clear_dir_complete_and_ordered(old_diri, false);
1085 }
1086 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
1087 }
1088 Inode *diri = dir->parent_inode;
7c673cae
FG
1089 clear_dir_complete_and_ordered(diri, false);
1090 dn = link(dir, dname, in, dn);
1091 }
1092
1093 update_dentry_lease(dn, dlease, from, session);
1094 return dn;
1095}
1096
1097void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1098{
1099 utime_t dttl = from;
1100 dttl += (float)dlease->duration_ms / 1000.0;
f67539c2
TL
1101
1102 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
7c673cae 1103
11fdf7f2 1104 ceph_assert(dn);
7c673cae 1105
9f95a23c 1106 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
1107 if (dttl > dn->lease_ttl) {
1108 ldout(cct, 10) << "got dentry lease on " << dn->name
1109 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1110 dn->lease_ttl = dttl;
1111 dn->lease_mds = session->mds_num;
1112 dn->lease_seq = dlease->seq;
1113 dn->lease_gen = session->cap_gen;
1114 }
1115 }
1116 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
1117 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1118 dn->mark_primary();
f67539c2 1119 dn->alternate_name = std::move(dlease->alternate_name);
7c673cae
FG
1120}
1121
1122
1123/*
1124 * update MDS location cache for a single inode
1125 */
1126void Client::update_dir_dist(Inode *in, DirStat *dst)
1127{
1128 // auth
1129 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1130 if (dst->auth >= 0) {
1131 in->fragmap[dst->frag] = dst->auth;
1132 } else {
1133 in->fragmap.erase(dst->frag);
1134 }
1135 if (!in->dirfragtree.is_leaf(dst->frag)) {
1136 in->dirfragtree.force_to_leaf(cct, dst->frag);
1137 _fragmap_remove_non_leaves(in);
1138 }
1139
1140 // replicated
f67539c2
TL
1141 in->dir_replicated = !dst->dist.empty();
1142 if (!dst->dist.empty())
1143 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1144 else
1145 in->frag_repmap.erase(dst->frag);
7c673cae
FG
1146}
1147
1148void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1149{
f91f0fd5
TL
1150 if (complete)
1151 diri->dir_release_count++;
1152 else
1153 diri->dir_ordered_count++;
7c673cae
FG
1154 if (diri->flags & I_COMPLETE) {
1155 if (complete) {
1156 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1157 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1158 } else {
1159 if (diri->flags & I_DIR_ORDERED) {
1160 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1161 diri->flags &= ~I_DIR_ORDERED;
1162 }
1163 }
1164 if (diri->dir)
1165 diri->dir->readdir_cache.clear();
1166 }
1167}
1168
1169/*
1170 * insert results from readdir or lssnap into the metadata cache.
1171 */
// Decode the "extra" payload of a readdir/lssnap reply and merge every
// (dentry, lease, inode) record it carries into the client metadata cache,
// updating the dir_result_t cursor (offsets, last_name, readdir cache).
1172void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1173
11fdf7f2 1174 auto& reply = request->reply;
7c673cae 1175 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
// Sessions advertising REPLY_ENCODING decode with the full feature set;
// otherwise fall back to the connection's negotiated features.
1176 uint64_t features;
1177 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1178 features = (uint64_t)-1;
1179 }
1180 else {
1181 features = con->get_features();
1182 }
7c673cae
FG
1183
1184 dir_result_t *dirp = request->dirp;
11fdf7f2 1185 ceph_assert(dirp);
7c673cae
FG
1186
1187 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1188 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
1189 if (!p.end()) {
1190 // snapdir?
1191 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1192 ceph_assert(diri);
7c673cae
FG
1193 diri = open_snapdir(diri);
1194 }
1195
1196 // only open dir if we're actually adding stuff to it!
1197 Dir *dir = diri->open_dir();
11fdf7f2 1198 ceph_assert(dir);
7c673cae
FG
1199
1200 // dirstat
11fdf7f2 1201 DirStat dst(p, features);
7c673cae
FG
1202 __u32 numdn;
1203 __u16 flags;
11fdf7f2
TL
1204 decode(numdn, p);
1205 decode(flags, p);
7c673cae
FG
1206
1207 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1208 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1209
1210 frag_t fg = (unsigned)request->head.args.readdir.frag;
1211 unsigned readdir_offset = dirp->next_offset;
1212 string readdir_start = dirp->last_name;
// offset 2 is the first real entry position; an empty start name is only
// valid when resuming from that initial offset.
11fdf7f2 1213 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
1214
1215 unsigned last_hash = 0;
1216 if (hash_order) {
1217 if (!readdir_start.empty()) {
1218 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1219 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1220 /* mds understands offset_hash */
1221 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1222 }
1223 }
1224
// The MDS may answer for a different fragment than the one requested
// (e.g. after a split/merge); adopt its fragment and, when not in hash
// order, restart the cursor at the fragment's beginning.
1225 if (fg != dst.frag) {
1226 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1227 fg = dst.frag;
1228 if (!hash_order) {
1229 readdir_offset = 2;
1230 readdir_start.clear();
1231 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1232 }
1233 }
1234
1235 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1236 << ", hash_order=" << hash_order
1237 << ", readdir_start " << readdir_start
1238 << ", last_hash " << last_hash
1239 << ", next_offset " << readdir_offset << dendl;
1240
// Starting at the very beginning of the directory (leftmost frag, initial
// offset): snapshot the generation counters so later iterations can tell
// whether the cached listing is still valid.
1241 if (diri->snapid != CEPH_SNAPDIR &&
1242 fg.is_leftmost() && readdir_offset == 2 &&
1243 !(hash_order && last_hash)) {
1244 dirp->release_count = diri->dir_release_count;
1245 dirp->ordered_count = diri->dir_ordered_count;
1246 dirp->start_shared_gen = diri->shared_gen;
1247 dirp->cache_index = 0;
1248 }
1249
1250 dirp->buffer_frag = fg;
1251
1252 _readdir_drop_dirp_buffer(dirp);
1253 dirp->buffer.reserve(numdn);
1254
1255 string dname;
1256 LeaseStat dlease;
// Decode each (name, lease, inode) record and link it into the cache.
1257 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
1258 decode(dname, p);
1259 dlease.decode(p, features);
7c673cae
FG
1260 InodeStat ist(p, features);
1261
1262 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1263
1264 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1265 request->perms);
1266 Dentry *dn;
1267 if (diri->dir->dentries.count(dname)) {
1268 Dentry *olddn = diri->dir->dentries[dname];
1269 if (olddn->inode != in) {
1270 // replace incorrect dentry
1271 unlink(olddn, true, true); // keep dir, dentry
1272 dn = link(dir, dname, in, olddn);
11fdf7f2 1273 ceph_assert(dn == olddn);
7c673cae
FG
1274 } else {
1275 // keep existing dn
1276 dn = olddn;
1277 touch_dn(dn);
1278 }
1279 } else {
1280 // new dn
1281 dn = link(dir, dname, in, NULL);
1282 }
f67539c2 1283 dn->alternate_name = std::move(dlease.alternate_name);
7c673cae
FG
1284
1285 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
// In hash order the per-entry offset restarts at 2 whenever the name hash
// changes; otherwise offsets increase monotonically within the fragment.
1286 if (hash_order) {
1287 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1288 if (hash != last_hash)
1289 readdir_offset = 2;
1290 last_hash = hash;
1291 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1292 } else {
1293 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1294 }
1295 // add to readdir cache
// Only while the generation counters still match the values captured at
// the start of this listing (i.e. the dir was not modified meanwhile).
1296 if (dirp->release_count == diri->dir_release_count &&
1297 dirp->ordered_count == diri->dir_ordered_count &&
1298 dirp->start_shared_gen == diri->shared_gen) {
1299 if (dirp->cache_index == dir->readdir_cache.size()) {
1300 if (i == 0) {
11fdf7f2 1301 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
1302 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1303 }
1304 dir->readdir_cache.push_back(dn);
1305 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1306 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1307 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
1308 else
1309 dir->readdir_cache[dirp->cache_index] = dn;
1310 } else {
11fdf7f2 1311 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
1312 }
1313 dirp->cache_index++;
1314 }
1315 // add to cached result list
f67539c2 1316 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
7c673cae
FG
1317 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1318 }
1319
// Advance the cursor: remember the last name for resumption, and reset the
// offset when this fragment is finished.
1320 if (numdn > 0)
1321 dirp->last_name = dname;
1322 if (end)
1323 dirp->next_offset = 2;
1324 else
1325 dirp->next_offset = readdir_offset;
1326
1327 if (dir->is_empty())
1328 close_dir(dir);
1329 }
1330}
1331
1332/** insert_trace
1333 *
1334 * insert a trace from a MDS reply into the cache.
1335 */
// Merge the trace (dir inode, dentry, target inode) carried by an MDS reply
// into the metadata cache. Returns the target inode, or NULL for unsafe
// duplicates and traceless replies. Also dispatches readdir/lssnap payloads.
1336Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1337{
11fdf7f2 1338 auto& reply = request->reply;
7c673cae
FG
1339 int op = request->get_op();
1340
1341 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1342 << " is_target=" << (int)reply->head.is_target
1343 << " is_dentry=" << (int)reply->head.is_dentry
1344 << dendl;
1345
11fdf7f2 1346 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
// An unsafe reply was already applied for this request; the safe reply's
// trace must be empty and there is nothing more to do.
1347 if (request->got_unsafe) {
1348 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1349 ceph_assert(p.end());
7c673cae
FG
1350 return NULL;
1351 }
1352
// Traceless reply: we cannot trust our cached dentry state for the affected
// name, so invalidate the parent dir's completeness and drop stale dentries
// for successful rename/unlink/rmdir.
1353 if (p.end()) {
1354 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1355
1356 Dentry *d = request->dentry();
1357 if (d) {
1358 Inode *diri = d->dir->parent_inode;
7c673cae
FG
1359 clear_dir_complete_and_ordered(diri, true);
1360 }
1361
1362 if (d && reply->get_result() == 0) {
1363 if (op == CEPH_MDS_OP_RENAME) {
1364 // rename
1365 Dentry *od = request->old_dentry();
1366 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1367 ceph_assert(od);
7c673cae
FG
1368 unlink(od, true, true); // keep dir, dentry
1369 } else if (op == CEPH_MDS_OP_RMDIR ||
1370 op == CEPH_MDS_OP_UNLINK) {
1371 // unlink, rmdir
1372 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1373 unlink(d, true, true); // keep dir, dentry
1374 }
1375 }
1376 return NULL;
1377 }
1378
1379 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
// REPLY_ENCODING-capable sessions decode with the full feature set;
// otherwise use the connection's negotiated features.
1380 uint64_t features;
1381 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1382 features = (uint64_t)-1;
1383 }
1384 else {
1385 features = con->get_features();
1386 }
7c673cae
FG
1387 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1388
1389 // snap trace
1390 SnapRealm *realm = NULL;
1391 if (reply->snapbl.length())
1392 update_snap_trace(reply->snapbl, &realm);
1393
1394 ldout(cct, 10) << " hrm "
1395 << " is_target=" << (int)reply->head.is_target
1396 << " is_dentry=" << (int)reply->head.is_dentry
1397 << dendl;
1398
1399 InodeStat dirst;
1400 DirStat dst;
1401 string dname;
1402 LeaseStat dlease;
1403 InodeStat ist;
1404
// A dentry trace carries: parent dir inode stat, dir frag stat, the dentry
// name, and its lease.
1405 if (reply->head.is_dentry) {
1406 dirst.decode(p, features);
11fdf7f2
TL
1407 dst.decode(p, features);
1408 decode(dname, p);
1409 dlease.decode(p, features);
7c673cae
FG
1410 }
1411
1412 Inode *in = 0;
1413 if (reply->head.is_target) {
1414 ist.decode(p, features);
// Debug-only sanity check: if we asked for xattrs, the reply must carry them.
1415 if (cct->_conf->client_debug_getattr_caps) {
1416 unsigned wanted = 0;
1417 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1418 wanted = request->head.args.getattr.mask;
1419 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1420 wanted = request->head.args.open.mask;
1421
1422 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1423 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1424 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
1425 }
1426
1427 in = add_update_inode(&ist, request->sent_stamp, session,
1428 request->perms);
1429 }
1430
1431 Inode *diri = NULL;
1432 if (reply->head.is_dentry) {
1433 diri = add_update_inode(&dirst, request->sent_stamp, session,
1434 request->perms);
1435 update_dir_dist(diri, &dst); // dir stat info is attached to ..
1436
1437 if (in) {
1438 Dir *dir = diri->open_dir();
1439 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1440 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1441 } else {
// Dentry trace without a target inode: the name is (now) a null dentry.
// Drop any cached inode link and keep a null dentry if a lease was granted.
1442 Dentry *dn = NULL;
1443 if (diri->dir && diri->dir->dentries.count(dname)) {
1444 dn = diri->dir->dentries[dname];
1445 if (dn->inode) {
7c673cae
FG
1446 clear_dir_complete_and_ordered(diri, false);
1447 unlink(dn, true, true); // keep dir, dentry
1448 }
1449 }
1450 if (dlease.duration_ms > 0) {
1451 if (!dn) {
1452 Dir *dir = diri->open_dir();
1453 dn = link(dir, dname, NULL, NULL);
1454 }
1455 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1456 }
1457 }
1458 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1459 op == CEPH_MDS_OP_MKSNAP) {
1460 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1461 // fake it for snap lookup
1462 vinodeno_t vino = ist.vino;
1463 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1464 ceph_assert(inode_map.count(vino));
7c673cae
FG
1465 diri = inode_map[vino];
1466
1467 string dname = request->path.last_dentry();
1468
// No lease information for snap entries: use a zero-duration lease.
1469 LeaseStat dlease;
1470 dlease.duration_ms = 0;
1471
1472 if (in) {
1473 Dir *dir = diri->open_dir();
1474 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1475 } else {
1476 if (diri->dir && diri->dir->dentries.count(dname)) {
1477 Dentry *dn = diri->dir->dentries[dname];
1478 if (dn->inode)
1479 unlink(dn, true, true); // keep dir, dentry
1480 }
1481 }
1482 }
1483
1484 if (in) {
1485 if (op == CEPH_MDS_OP_READDIR ||
1486 op == CEPH_MDS_OP_LSSNAP) {
1487 insert_readdir_results(request, session, in);
1488 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1489 // hack: return parent inode instead
1490 in = diri;
1491 }
1492
1493 if (request->dentry() == NULL && in != request->inode()) {
1494 // pin the target inode if its parent dentry is not pinned
1495 request->set_other_inode(in);
1496 }
1497 }
1498
// Drop the temporary realm reference taken by update_snap_trace.
1499 if (realm)
1500 put_snap_realm(realm);
1501
1502 request->target = in;
1503 return in;
1504}
1505
1506// -------
1507
// Pick the MDS rank to send a request to, preferring (in order): an explicit
// resend target, the dirfrag hash / replica map for hashed dentry ops, the
// caps we hold on the relevant inode, and finally a random up MDS.
// On a fragmap hit, *phash_diri is set to the inode whose fragmap was used.
1508mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1509{
1510 mds_rank_t mds = MDS_RANK_NONE;
1511 __u32 hash = 0;
1512 bool is_hash = false;
1513
1514 Inode *in = NULL;
1515 Dentry *de = NULL;
7c673cae
FG
1516
// A forward/resend overrides all other routing, and is consumed here.
1517 if (req->resend_mds >= 0) {
1518 mds = req->resend_mds;
1519 req->resend_mds = -1;
11fdf7f2 1520 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
1521 goto out;
1522 }
1523
1524 if (cct->_conf->client_use_random_mds)
1525 goto random_mds;
1526
1527 in = req->inode();
1528 de = req->dentry();
// Derive a dentry-name hash from either the request path's first component
// or the request's dentry, to route by directory fragment below.
1529 if (in) {
11fdf7f2 1530 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
1531 if (req->path.depth()) {
1532 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1533 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1534 << " on " << req->path[0]
1535 << " => " << hash << dendl;
1536 is_hash = true;
1537 }
1538 } else if (de) {
1539 if (de->inode) {
1540 in = de->inode.get();
11fdf7f2 1541 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
1542 } else {
1543 in = de->dir->parent_inode;
1544 hash = in->hash_dentry_name(de->name);
11fdf7f2 1545 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1546 << " on " << de->name
1547 << " => " << hash << dendl;
1548 is_hash = true;
1549 }
1550 }
1551 if (in) {
// Snapshot inodes carry no caps/fragmaps of their own: walk up to the
// nearest non-snap ancestor and route via that instead.
1552 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1553 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
1554 while (in->snapid != CEPH_NOSNAP) {
1555 if (in->snapid == CEPH_SNAPDIR)
1556 in = in->snapdir_parent.get();
11fdf7f2 1557 else if (!in->dentries.empty())
7c673cae
FG
1558 /* In most cases there will only be one dentry, so getting it
1559 * will be the correct action. If there are multiple hard links,
1560 * I think the MDS should be able to redirect as needed*/
1561 in = in->get_first_parent()->dir->parent_inode;
1562 else {
1563 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1564 break;
1565 }
1566 }
1567 is_hash = false;
1568 }
1569
11fdf7f2 1570 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
1571 << " hash=" << hash << dendl;
1572
// Route by directory fragment: for requests that don't need the auth MDS,
// pick a random replica from the frag's replica map; otherwise use the
// frag's auth MDS (fragmap) or fall back to our auth cap's session.
f67539c2 1573 if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
7c673cae 1574 frag_t fg = in->dirfragtree[hash];
f67539c2
TL
1575 if (!req->auth_is_best()) {
1576 auto repmapit = in->frag_repmap.find(fg);
1577 if (repmapit != in->frag_repmap.end()) {
1578 auto& repmap = repmapit->second;
1579 auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
1580 mds = repmap.at(r);
1581 }
1582 } else if (in->fragmap.count(fg)) {
7c673cae
FG
1583 mds = in->fragmap[fg];
1584 if (phash_diri)
1585 *phash_diri = in;
91327a77 1586 } else if (in->auth_cap) {
f67539c2 1587 req->send_to_auth = true;
91327a77
AA
1588 mds = in->auth_cap->session->mds_num;
1589 }
1590 if (mds >= 0) {
11fdf7f2 1591 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
1592 goto out;
1593 }
1594 }
1595
// No fragment route: use the auth cap's session when the op prefers the
// auth MDS, else any session we hold caps from.
11fdf7f2
TL
1596 if (in->auth_cap && req->auth_is_best()) {
1597 mds = in->auth_cap->session->mds_num;
1598 } else if (!in->caps.empty()) {
1599 mds = in->caps.begin()->second.session->mds_num;
1600 } else {
7c673cae 1601 goto random_mds;
11fdf7f2
TL
1602 }
1603 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
1604
1605 goto out;
1606 }
1607
1608random_mds:
1609 if (mds < 0) {
1610 mds = _get_random_up_mds();
1611 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1612 }
1613
1614out:
1615 ldout(cct, 20) << "mds is " << mds << dendl;
1616 return mds;
1617}
1618
7c673cae
FG
1619void Client::connect_mds_targets(mds_rank_t mds)
1620{
11fdf7f2
TL
1621 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1622 ceph_assert(mds_sessions.count(mds));
7c673cae 1623 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
f67539c2
TL
1624 for (const auto &rank : info.export_targets) {
1625 if (mds_sessions.count(rank) == 0 &&
1626 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
7c673cae 1627 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
f67539c2
TL
1628 << " export target mds." << rank << dendl;
1629 _open_mds_session(rank);
7c673cae
FG
1630 }
1631 }
1632}
1633
adb31ebb 1634void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1635{
1636 f->dump_int("id", get_nodeid().v);
11fdf7f2 1637 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1638 f->dump_object("inst", inst);
1639 f->dump_stream("inst_str") << inst;
1640 f->dump_stream("addr_str") << inst.addr;
7c673cae 1641 f->open_array_section("sessions");
11fdf7f2 1642 for (const auto &p : mds_sessions) {
7c673cae 1643 f->open_object_section("session");
adb31ebb 1644 p.second.dump(f, cap_dump);
7c673cae
FG
1645 f->close_section();
1646 }
1647 f->close_section();
1648 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1649}
f67539c2 1650
7c673cae
FG
1651void Client::dump_mds_requests(Formatter *f)
1652{
1653 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1654 p != mds_requests.end();
1655 ++p) {
1656 f->open_object_section("request");
1657 p->second->dump(f);
1658 f->close_section();
1659 }
1660}
1661
// Post-process a reply for callers that want the target inode: extract the
// created-ino from the extra payload (if any), and when the reply carried no
// trace, recover the target via lookup/getattr. Returns the (possibly
// updated) result code; fills *ptarget/*pcreated.
9f95a23c 1662int Client::verify_reply_trace(int r, MetaSession *session,
11fdf7f2 1663 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
1664 InodeRef *ptarget, bool *pcreated,
1665 const UserPerm& perms)
1666{
1667 // check whether this request actually did the create, and set created flag
1668 bufferlist extra_bl;
1669 inodeno_t created_ino;
1670 bool got_created_ino = false;
1671 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1672
11fdf7f2 1673 extra_bl = reply->get_extra_bl();
7c673cae 1674 if (extra_bl.length() >= 8) {
// DELEG_INO-capable MDSs encode an openc_response_t; older ones encode a
// bare u64 with the created inode number.
9f95a23c
TL
1675 if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
1676 struct openc_response_t ocres;
1677
1678 decode(ocres, extra_bl);
1679 created_ino = ocres.created_ino;
1680 /*
1681 * The userland cephfs client doesn't have a way to do an async create
1682 * (yet), so just discard delegated_inos for now. Eventually we should
1683 * store them and use them in create calls, even if they are synchronous,
1684 * if only for testing purposes.
1685 */
1686 ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
1687 } else {
1688 // u64 containing number of created ino
1689 decode(created_ino, extra_bl);
1690 }
7c673cae 1691 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
9f95a23c 1692 got_created_ino = true;
7c673cae
FG
1693 }
1694
1695 if (pcreated)
1696 *pcreated = got_created_ino;
1697
// Easy case: insert_trace already resolved the target.
1698 if (request->target) {
1699 *ptarget = request->target;
1700 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1701 } else {
// Next best: we know the created ino and already have it cached.
1702 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1703 (*ptarget) = p->second;
1704 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1705 } else {
1706 // we got a traceless reply, and need to look up what we just
1707 // created. for now, do this by name. someday, do this by the
1708 // ino... which we know! FIXME.
1709 InodeRef target;
1710 Dentry *d = request->dentry();
1711 if (d) {
1712 if (d->dir) {
1713 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1714 << d->dir->parent_inode->ino << "/" << d->name
1715 << " got_ino " << got_created_ino
1716 << " ino " << created_ino
1717 << dendl;
1718 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1719 &target, perms);
1720 } else {
1721 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1722 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
1723 }
1724 } else {
// No dentry to look up by name: refresh the request's inode directly.
1725 Inode *in = request->inode();
1726 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1727 << in->ino << dendl;
1728 r = _getattr(in, request->regetattr_mask, perms, true);
1729 target = in;
1730 }
1731 if (r >= 0) {
1732 // verify ino returned in reply and trace_dist are the same
// Mismatch means the created file was replaced before we could look it
// up again (e.g. an interrupted/retried create) — report EINTR.
1733 if (got_created_ino &&
1734 created_ino.val != target->ino.val) {
1735 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
f67539c2 1736 r = -CEPHFS_EINTR;
7c673cae
FG
1737 }
1738 if (ptarget)
1739 ptarget->swap(target);
1740 }
1741 }
1742 }
1743
1744 return r;
1745}
1746
1747
1748/**
1749 * make a request
1750 *
1751 * Blocking helper to make an MDS request.
1752 *
1753 * If the ptarget flag is set, behavior changes slightly: the caller
1754 * expects to get a pointer to the inode we are creating or operating
1755 * on. As a result, we will follow up any traceless mutation reply
1756 * with a getattr or lookup to transparently handle a traceless reply
1757 * from the MDS (as when the MDS restarts and the client has to replay
1758 * a request).
1759 *
1760 * @param request the MetaRequest to execute
1761 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1762 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1763 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1764 * @param use_mds [optional] prefer a specific mds (-1 for default)
1765 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1766 */
// Blocking MDS request helper: registers the request, loops choosing a
// target MDS / opening a session / (re)sending until a reply arrives or the
// request aborts, then post-processes the reply. See the block comment above
// for parameter semantics. Caller must hold client_lock.
1767int Client::make_request(MetaRequest *request,
1768 const UserPerm& perms,
1769 InodeRef *ptarget, bool *pcreated,
1770 mds_rank_t use_mds,
1771 bufferlist *pdirbl)
1772{
1773 int r = 0;
1774
1775 // assign a unique tid
1776 ceph_tid_t tid = ++last_tid;
1777 request->set_tid(tid);
1778
1779 // and timestamp
1780 request->op_stamp = ceph_clock_now();
1781
1782 // make note
// SETFILELOCK requests can block indefinitely, so they are excluded from
// the oldest-tid tracking reported to the MDS.
1783 mds_requests[tid] = request->get();
1784 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1785 oldest_tid = tid;
1786
1787 request->set_caller_perms(perms);
1788
1789 if (cct->_conf->client_inject_fixed_oldest_tid) {
1790 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1791 request->set_oldest_client_tid(1);
1792 } else {
1793 request->set_oldest_client_tid(oldest_tid);
1794 }
1795
1796 // hack target mds?
1797 if (use_mds >= 0)
1798 request->resend_mds = use_mds;
1799
9f95a23c 1800 MetaSession *session = NULL;
7c673cae
FG
// Send/retry loop: exits when a reply arrives or the request is aborted.
1801 while (1) {
1802 if (request->aborted())
1803 break;
1804
f67539c2
TL
1805 if (blocklisted) {
1806 request->abort(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
1807 break;
1808 }
1809
7c673cae 1810 // set up wait cond
9f95a23c 1811 ceph::condition_variable caller_cond;
7c673cae
FG
1812 request->caller_cond = &caller_cond;
1813
1814 // choose mds
1815 Inode *hash_diri = NULL;
1816 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1817 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
// Target not usable: either it was stopped (retry another rank now) or it
// is transitioning (wait for a new mdsmap), then re-run the loop.
1818 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1819 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1820 if (hash_diri) {
1821 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1822 _fragmap_remove_stopped_mds(hash_diri, mds);
1823 } else {
1824 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1825 request->resend_mds = _get_random_up_mds();
1826 }
1827 } else {
1828 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1829 wait_on_list(waiting_for_mdsmap);
1830 }
1831 continue;
1832 }
1833
1834 // open a session?
7c673cae
FG
1835 if (!have_open_session(mds)) {
1836 session = _get_or_open_mds_session(mds);
f6b5b4d7 1837 if (session->state == MetaSession::STATE_REJECTED) {
f67539c2 1838 request->abort(-CEPHFS_EPERM);
f6b5b4d7
TL
1839 break;
1840 }
7c673cae
FG
1841 // wait
1842 if (session->state == MetaSession::STATE_OPENING) {
1843 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1844 wait_on_context_list(session->waiting_for_open);
7c673cae
FG
1845 continue;
1846 }
1847
1848 if (!have_open_session(mds))
1849 continue;
1850 } else {
11fdf7f2 1851 session = &mds_sessions.at(mds);
7c673cae
FG
1852 }
1853
1854 // send request.
1855 send_request(request, session);
1856
1857 // wait for signal
1858 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1859 request->kick = false;
// Adopt the already-held client_lock so the condvar wait can drop it while
// sleeping; release() afterwards hands ownership back without unlocking.
9f95a23c
TL
1860 std::unique_lock l{client_lock, std::adopt_lock};
1861 caller_cond.wait(l, [request] {
1862 return (request->reply || // reply
1863 request->resend_mds >= 0 || // forward
1864 request->kick);
1865 });
1866 l.release();
1867 request->caller_cond = nullptr;
7c673cae
FG
1868
1869 // did we get a reply?
1870 if (request->reply)
1871 break;
1872 }
1873
// Aborted without a reply: clean up bookkeeping and report the abort code.
1874 if (!request->reply) {
11fdf7f2
TL
1875 ceph_assert(request->aborted());
1876 ceph_assert(!request->got_unsafe);
7c673cae
FG
1877 r = request->get_abort_code();
1878 request->item.remove_myself();
1879 unregister_request(request);
11fdf7f2 1880 put_request(request);
7c673cae
FG
1881 return r;
1882 }
1883
1884 // got it!
11fdf7f2 1885 auto reply = std::move(request->reply);
7c673cae
FG
1886 r = reply->get_result();
1887 if (r >= 0)
1888 request->success = true;
1889
1890 // kick dispatcher (we've got it!)
11fdf7f2 1891 ceph_assert(request->dispatch_cond);
9f95a23c 1892 request->dispatch_cond->notify_all();
7c673cae
FG
1893 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1894 request->dispatch_cond = 0;
1895
// Callers that asked for the target inode may need a follow-up
// lookup/getattr if the reply carried no trace.
1896 if (r >= 0 && ptarget)
9f95a23c 1897 r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);
7c673cae
FG
1898
1899 if (pdirbl)
11fdf7f2 1900 *pdirbl = reply->get_extra_bl();
7c673cae
FG
1901
1902 // -- log times --
1903 utime_t lat = ceph_clock_now();
1904 lat -= request->sent_stamp;
1905 ldout(cct, 20) << "lat " << lat << dendl;
1906 logger->tinc(l_c_lat, lat);
1907 logger->tinc(l_c_reply, lat);
1908
1909 put_request(request);
7c673cae
FG
1910 return r;
1911}
1912
1913void Client::unregister_request(MetaRequest *req)
1914{
1915 mds_requests.erase(req->tid);
1916 if (req->tid == oldest_tid) {
1917 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1918 while (true) {
1919 if (p == mds_requests.end()) {
1920 oldest_tid = 0;
1921 break;
1922 }
1923 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1924 oldest_tid = p->first;
1925 break;
1926 }
1927 ++p;
1928 }
1929 }
1930 put_request(req);
1931}
1932
1933void Client::put_request(MetaRequest *request)
1934{
1935 if (request->_put()) {
1936 int op = -1;
1937 if (request->success)
1938 op = request->get_op();
1939 InodeRef other_in;
1940 request->take_other_inode(&other_in);
1941 delete request;
1942
1943 if (other_in &&
1944 (op == CEPH_MDS_OP_RMDIR ||
1945 op == CEPH_MDS_OP_RENAME ||
1946 op == CEPH_MDS_OP_RMSNAP)) {
1947 _try_to_trim_inode(other_in.get(), false);
1948 }
1949 }
1950}
1951
// Append a cap-release record for this inode to the request, dropping the
// caps in 'drop' (minus dirty/in-use ones) unless any cap in 'unless' is
// issued; 'force' emits a release record even when nothing was dropped.
// Returns 1 if a release record was queued, else 0.
1952int Client::encode_inode_release(Inode *in, MetaRequest *req,
1953 mds_rank_t mds, int drop,
1954 int unless, int force)
1955{
11fdf7f2 1956 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
f67539c2 1957 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1911f103 1958 << ", force:" << force << ")" << dendl;
7c673cae 1959 int released = 0;
11fdf7f2
TL
1960 auto it = in->caps.find(mds);
1961 if (it != in->caps.end()) {
1962 Cap &cap = it->second;
// Never drop caps that are dirty or currently in use.
7c673cae 1963 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1964 if ((drop & cap.issued) &&
1965 !(unless & cap.issued)) {
1911f103 1966 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
11fdf7f2
TL
1967 cap.issued &= ~drop;
1968 cap.implemented &= ~drop;
7c673cae 1969 released = 1;
7c673cae
FG
1970 } else {
1971 released = force;
1972 }
1973 if (released) {
// Recompute wanted caps; if the auth cap no longer wants any file write
// cap, the previously requested max_size is stale.
1911f103
TL
1974 cap.wanted = in->caps_wanted();
1975 if (&cap == in->auth_cap &&
1976 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1977 in->requested_max_size = 0;
1978 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1979 }
// Build the wire-format release record and attach it to the request.
7c673cae
FG
1980 ceph_mds_request_release rel;
1981 rel.ino = in->ino;
11fdf7f2
TL
1982 rel.cap_id = cap.cap_id;
1983 rel.seq = cap.seq;
1984 rel.issue_seq = cap.issue_seq;
1985 rel.mseq = cap.mseq;
1986 rel.caps = cap.implemented;
1987 rel.wanted = cap.wanted;
7c673cae
FG
1988 rel.dname_len = 0;
1989 rel.dname_seq = 0;
1990 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1991 }
1992 }
11fdf7f2 1993 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
1994 << released << dendl;
1995 return released;
1996}
1997
/*
 * Encode a dentry lease release onto `req` for mds `mds`.  First
 * (force-)releases caps on the dentry's parent directory inode; if that
 * appended a release record and we hold a lease on this dentry from the
 * same mds, the dentry name/lease seq is folded into that record and
 * our local lease is invalidated.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // the record just appended by encode_inode_release
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
    // we no longer hold a lease on this dentry
    dn->lease_mds = -1;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
2018
2019
/*
 * Encode all requested inode and dentry cap/lease releases for `req`
 * onto its cap_releases list, for the mds the request is being sent to.
 *
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 *
 * Ordering note: inode releases are encoded before dentry releases
 * because encode_dentry_release annotates the record most recently
 * appended by encode_inode_release.
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
		 << req << ", mds " << mds <<dendl;
}
2056
2057bool Client::have_open_session(mds_rank_t mds)
2058{
11fdf7f2
TL
2059 const auto &it = mds_sessions.find(mds);
2060 return it != mds_sessions.end() &&
2061 (it->second.state == MetaSession::STATE_OPEN ||
2062 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
2063}
2064
2065MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
2066{
11fdf7f2
TL
2067 const auto &it = mds_sessions.find(mds);
2068 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 2069 return NULL;
11fdf7f2
TL
2070 } else {
2071 return &it->second;
2072 }
7c673cae
FG
2073}
2074
2075MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
2076{
11fdf7f2
TL
2077 auto it = mds_sessions.find(mds);
2078 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
2079}
2080
2081/**
2082 * Populate a map of strings with client-identifying metadata,
2083 * such as the hostname. Call this once at initialization.
2084 */
2085void Client::populate_metadata(const std::string &mount_root)
2086{
2087 // Hostname
f67539c2
TL
2088#ifdef _WIN32
2089 // TODO: move this to compat.h
2090 char hostname[64];
2091 DWORD hostname_sz = 64;
2092 GetComputerNameA(hostname, &hostname_sz);
2093 metadata["hostname"] = hostname;
2094#else
7c673cae
FG
2095 struct utsname u;
2096 int r = uname(&u);
2097 if (r >= 0) {
2098 metadata["hostname"] = u.nodename;
2099 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2100 } else {
2101 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2102 }
f67539c2 2103#endif
7c673cae
FG
2104
2105 metadata["pid"] = stringify(getpid());
2106
2107 // Ceph entity id (the '0' in "client.0")
2108 metadata["entity_id"] = cct->_conf->name.get_id();
2109
2110 // Our mount position
2111 if (!mount_root.empty()) {
2112 metadata["root"] = mount_root;
2113 }
2114
2115 // Ceph version
2116 metadata["ceph_version"] = pretty_version_to_str();
2117 metadata["ceph_sha1"] = git_version_to_str();
2118
2119 // Apply any metadata from the user's configured overrides
2120 std::vector<std::string> tokens;
2121 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2122 for (const auto &i : tokens) {
2123 auto eqpos = i.find("=");
2124 // Throw out anything that isn't of the form "<str>=<str>"
2125 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2126 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2127 continue;
2128 }
2129 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2130 }
2131}
2132
2133/**
2134 * Optionally add or override client metadata fields.
2135 */
2136void Client::update_metadata(std::string const &k, std::string const &v)
2137{
f67539c2
TL
2138 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2139 ceph_assert(iref_reader.is_state_satisfied());
2140
2141 std::scoped_lock l(client_lock);
7c673cae 2142
11fdf7f2
TL
2143 auto it = metadata.find(k);
2144 if (it != metadata.end()) {
7c673cae 2145 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2146 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2147 }
2148
2149 metadata[k] = v;
2150}
2151
2152MetaSession *Client::_open_mds_session(mds_rank_t mds)
2153{
11fdf7f2
TL
2154 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2155 auto addrs = mdsmap->get_addrs(mds);
2156 auto em = mds_sessions.emplace(std::piecewise_construct,
2157 std::forward_as_tuple(mds),
2158 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2159 ceph_assert(em.second); /* not already present */
2160 MetaSession *session = &em.first->second;
7c673cae 2161
9f95a23c 2162 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
11fdf7f2
TL
2163 m->metadata = metadata;
2164 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
f67539c2 2165 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
11fdf7f2 2166 session->con->send_message2(std::move(m));
7c673cae
FG
2167 return session;
2168}
2169
2170void Client::_close_mds_session(MetaSession *s)
2171{
11fdf7f2 2172 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2173 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2174 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2175}
2176
/*
 * Tear down a session that is now closed (or rejected) on the MDS side:
 * mark the state, drop the connection, wake waiters, remove all caps
 * (propagating `err` to them), and kick pending requests.  A REJECTED
 * session is kept in mds_sessions so callers can see the rejection;
 * a CLOSED one is erased at the end.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // a session already CLOSING transitions to CLOSED even when rejected
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // NOTE: erase must be last — it destroys *s.
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2193
/*
 * Dispatch an incoming MClientSession message from an MDS: session
 * open/close/reject, cap renewal and staleness, recall/trim requests,
 * flush acks, and force-readonly.  Messages from connections without a
 * matching session are dropped.
 */
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // refuse the session if the MDS lacks features we require
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	_close_mds_session(session);
	_closed_mds_session(session, -CEPHFS_EPERM, true);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (is_unmounting())
	mount_cond.notify_all();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack for the renewal round we actually sent
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases by bumping the generation and
    // forcing the ttl into the past, then try to renew
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    /*
     * Call the renew caps and flush cap releases just before
     * trimming the caps in case the tick() won't get a chance
     * to run them, which could cause the client to be blocklisted
     * and MDS daemons trying to recall the caps again and
     * again.
     *
     * In most cases it will do nothing, and the new cap releases
     * added by trim_caps() followed will be deferred flushing
     * by tick().
     */
    renew_and_flush_cap_releases();
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush any pending cap release before acking */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      // surface the MDS-provided reason when present
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session, -CEPHFS_EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2298
2299bool Client::_any_stale_sessions() const
2300{
9f95a23c 2301 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2302
11fdf7f2
TL
2303 for (const auto &p : mds_sessions) {
2304 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2305 return true;
2306 }
2307 }
2308
2309 return false;
2310}
2311
2312void Client::_kick_stale_sessions()
2313{
11fdf7f2 2314 ldout(cct, 1) << __func__ << dendl;
7c673cae 2315
11fdf7f2
TL
2316 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2317 MetaSession &s = it->second;
f6b5b4d7
TL
2318 if (s.state == MetaSession::STATE_REJECTED) {
2319 mds_sessions.erase(it++);
2320 continue;
2321 }
11fdf7f2
TL
2322 ++it;
2323 if (s.state == MetaSession::STATE_STALE)
2324 _closed_mds_session(&s);
7c673cae
FG
2325 }
2326}
2327
/*
 * (Re)build the wire request for `request` and send it to `session`'s
 * mds.  Replayed (got_unsafe) requests are flagged as such; otherwise
 * cap releases are encoded and attached (or dropped when
 * drop_cap_releases is set, i.e. before cap reconnect has been sent).
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // resend of an op the MDS already applied: mark as replay
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may change the file layout; pin the osdmap epoch
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first send of this request: record when it went out
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migrate seq in effect when we sent, for ESTALE handling
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2376
/*
 * Construct the wire MClientRequest for a MetaRequest: copies the head,
 * lazily derives the filepath from the inode/dentry when not already
 * set, and attaches data, retry count, forward count, and caller gids.
 */
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry has no inode yet (e.g. create): path of parent + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_alternate_name(request->alternate_name);
  req->set_data(request->data);
  // bump the retry counter on every (re)build
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2415
2416
2417
/*
 * Handle an MClientRequestForward: the mds we sent to is telling us to
 * resend the request to another rank.  Resets the retry counter,
 * records the forward count and new target, and wakes the waiting
 * caller (make_request) to do the actual resend.
 */
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());

  std::scoped_lock cl(client_lock);
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  // NOTE(review): this log prints get_dest_mds() for both the "to" and
  // "resending to" fields; the first was presumably meant to be the
  // forwarding mds — confirm before changing.
  ldout(cct, 10) << __func__ << " tid " << tid
		 << " fwd " << fwd->get_num_fwd()
		 << " to mds." << fwd->get_dest_mds()
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;

  // detach from the old session and retarget at the new rank
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->notify_all();
}
2454
2455bool Client::is_dir_operation(MetaRequest *req)
2456{
2457 int op = req->get_op();
2458 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2459 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2460 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2461 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2462 return true;
2463 return false;
2464}
2465
11fdf7f2 2466void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2467{
2468 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
f67539c2
TL
2469
2470 std::scoped_lock cl(client_lock);
7c673cae
FG
2471 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2472 if (!session) {
7c673cae
FG
2473 return;
2474 }
2475
2476 ceph_tid_t tid = reply->get_tid();
2477 bool is_safe = reply->is_safe();
2478
2479 if (mds_requests.count(tid) == 0) {
11fdf7f2 2480 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2481 << " safe is:" << is_safe << dendl;
7c673cae
FG
2482 return;
2483 }
2484 MetaRequest *request = mds_requests.at(tid);
2485
11fdf7f2 2486 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2487 << " tid " << tid << dendl;
2488
2489 if (request->got_unsafe && !is_safe) {
2490 //duplicate response
2491 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2492 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2493 return;
2494 }
2495
f67539c2 2496 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
7c673cae
FG
2497 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2498 << " from mds." << request->mds << dendl;
2499 request->send_to_auth = true;
2500 request->resend_mds = choose_target_mds(request);
2501 Inode *in = request->inode();
11fdf7f2 2502 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2503 if (request->resend_mds >= 0 &&
2504 request->resend_mds == request->mds &&
2505 (in == NULL ||
11fdf7f2
TL
2506 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2507 request->sent_on_mseq == it->second.mseq)) {
2508 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae 2509 } else {
9f95a23c 2510 request->caller_cond->notify_all();
7c673cae
FG
2511 return;
2512 }
7c673cae
FG
2513 }
2514
11fdf7f2 2515 ceph_assert(!request->reply);
7c673cae
FG
2516 request->reply = reply;
2517 insert_trace(request, session);
2518
2519 // Handle unsafe reply
2520 if (!is_safe) {
2521 request->got_unsafe = true;
2522 session->unsafe_requests.push_back(&request->unsafe_item);
2523 if (is_dir_operation(request)) {
2524 Inode *dir = request->inode();
11fdf7f2 2525 ceph_assert(dir);
7c673cae
FG
2526 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2527 }
2528 if (request->target) {
2529 InodeRef &in = request->target;
2530 in->unsafe_ops.push_back(&request->unsafe_target_item);
2531 }
2532 }
2533
2534 // Only signal the caller once (on the first reply):
2535 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2536 if (!is_safe || !request->got_unsafe) {
9f95a23c 2537 ceph::condition_variable cond;
7c673cae
FG
2538 request->dispatch_cond = &cond;
2539
2540 // wake up waiter
11fdf7f2 2541 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2542 request->caller_cond->notify_all();
7c673cae
FG
2543
2544 // wake for kick back
9f95a23c
TL
2545 std::unique_lock l{client_lock, std::adopt_lock};
2546 cond.wait(l, [tid, request, &cond, this] {
2547 if (request->dispatch_cond) {
2548 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2549 << tid << " " << &cond << dendl;
2550 }
2551 return !request->dispatch_cond;
2552 });
2553 l.release();
7c673cae
FG
2554 }
2555
2556 if (is_safe) {
2557 // the filesystem change is committed to disk
2558 // we're done, clean up
2559 if (request->got_unsafe) {
2560 request->unsafe_item.remove_myself();
2561 request->unsafe_dir_item.remove_myself();
2562 request->unsafe_target_item.remove_myself();
2563 signal_cond_list(request->waitfor_safe);
2564 }
2565 request->item.remove_myself();
2566 unregister_request(request);
2567 }
f67539c2 2568 if (is_unmounting())
9f95a23c 2569 mount_cond.notify_all();
7c673cae
FG
2570}
2571
/*
 * React to an OSD "full" condition for `pool` (-1 means the whole
 * cluster): cancel outstanding writes with ENOSPC, purge their cached
 * dirty data so it is not re-issued, and raise a cap epoch barrier.
 */
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
		<< "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
		    << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-CEPHFS_ENOSPC);
    }
  }

  // raise the cap epoch barrier so the MDS knows which epoch we acted on
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2610
/*
 * Process a new OSDMap: detect whether this client has just been
 * blocklisted (aborting sessions and cancelling writes if so), notice
 * when a blocklist has been lifted, and handle cluster-wide or per-pool
 * "full" flags.
 */
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blocklists;

  std::scoped_lock cl(client_lock);
  objecter->consume_blocklist_events(&new_blocklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blocklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blocklisted) {
    // check each of our addrs against the new blocklist entries
    for (auto a : myaddrs.v) {
      // blocklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blocklists.count(a)) {
	new_blocklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blocklists.count(a)) {
	  new_blocklist = true;
	  break;
	}
      }
    }
  }
  if (new_blocklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
	});
    lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
    blocklisted = true;

    _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemptively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);

  }

  if (blocklisted) {
    // Handle case where we were blocklisted but no longer are
    blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blocklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blocklisted client
  // until this client is not blocklisted.
  if (blocklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full: cancel everything
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2698
2699
2700// ------------------------
2701// incoming messages
2702
2703
/*
 * Messenger entry point: route an incoming message to the matching
 * handler by type.  Returns false for message types we do not handle
 * (so another dispatcher can claim them).  While unmounting, each
 * dispatched message is followed by a cache-trim pass that pokes
 * unmount() whenever the cache actually shrank.
 */
bool Client::ms_dispatch2(const MessageRef &m)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied()) {
    // not initialized (or already shut down): drop everything
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only command replies from MDS daemons are ours
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  std::scoped_lock cl(client_lock);
  if (is_unmounting()) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    uint64_t size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2786
11fdf7f2 2787void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae 2788{
f67539c2 2789 std::scoped_lock cl(client_lock);
7c673cae 2790 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2791
2792 signal_cond_list(waiting_for_fsmap);
2793
2794 monclient->sub_got("fsmap", fsmap->get_epoch());
2795}
2796
11fdf7f2 2797void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae 2798{
f67539c2 2799 std::scoped_lock cl(client_lock);
7c673cae
FG
2800 fsmap_user.reset(new FSMapUser);
2801 *fsmap_user = m->get_fsmap();
7c673cae
FG
2802
2803 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2804 signal_cond_list(waiting_for_fsmap);
2805}
2806
f67539c2
TL
/*
 * Cancel all in-flight MDS admin commands whose target GID is missing
 * or laggy in `newmap`: record an explanatory error string, drop the
 * connection, complete the callback with -CEPHFS_ETIMEDOUT, and remove
 * the ops from the command table.
 */
void Client::cancel_commands(const MDSMap& newmap)
{
  std::vector<ceph_tid_t> cancel_ops;

  std::scoped_lock cmd_lock(command_lock);
  auto &commands = command_table.get_commands();
  for (const auto &[tid, op] : commands) {
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
      // defer erase until after iteration to avoid invalidating it
      cancel_ops.push_back(tid);
      if (op.outs) {
	std::ostringstream ss;
	ss << "MDS " << op_mds_gid << " went away";
	*(op.outs) = ss.str();
      }
      /*
       * No need to take client_lock around con->mark_down() here,
       * because the con has its own lock.
       */
      op.con->mark_down();
      if (op.on_finish)
	op.on_finish->complete(-CEPHFS_ETIMEDOUT);
    }
  }

  for (const auto &tid : cancel_ops)
    command_table.erase(tid);
}
2838
// Handle a new MDSMap from the monitor.
//
// Ignores stale epochs; otherwise swaps in the new map and walks every
// open MetaSession, reconciling its state with the per-rank state in
// the new map (mark down dead ranks, reconnect to restarted/relocated
// ranks, kick requests and caps when a rank becomes active).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // cancel_commands() takes command_lock; drop client_lock while we
  // decode the map and cancel, then re-take it before the swap below.
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After the swap, _mdsmap holds the OLD map and mdsmap the new one.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // advance first: _closed_mds_session() below may erase this entry
    ++p;

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        // treat a restarted daemon as if we had no prior state for it
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        // rank went past RECONNECT while we weren't connected; our caps
        // are gone, so the session is useless now
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // rank no longer exists in the cluster at all
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2916
// Send our cap/snaprealm state to an MDS that entered RECONNECT.
//
// Resets per-session and per-cap sequence state, replays unsafe
// requests, and builds one (or, with the MULTI_RECONNECT feature,
// several size-limited) MClientReconnect messages describing every cap
// we hold from this rank.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // If multiple reconnect messages are allowed, flush the current
      // one once it approaches half of INT_MAX bytes and start a new one.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        // stale cap: claim only the PIN cap until the MDS reissues
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),  // ino
                 in->caps_wanted(),  // wanted
                 cap.issued,  // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // describe each snaprealm at most once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3010
3011
3012void Client::kick_requests(MetaSession *session)
3013{
11fdf7f2 3014 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3015 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3016 p != mds_requests.end();
3017 ++p) {
31f18b77
FG
3018 MetaRequest *req = p->second;
3019 if (req->got_unsafe)
3020 continue;
3021 if (req->aborted()) {
3022 if (req->caller_cond) {
3023 req->kick = true;
9f95a23c 3024 req->caller_cond->notify_all();
31f18b77 3025 }
7c673cae 3026 continue;
31f18b77
FG
3027 }
3028 if (req->retry_attempt > 0)
7c673cae 3029 continue; // new requests only
31f18b77 3030 if (req->mds == session->mds_num) {
7c673cae
FG
3031 send_request(p->second, session);
3032 }
3033 }
3034}
3035
3036void Client::resend_unsafe_requests(MetaSession *session)
3037{
3038 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3039 !iter.end();
3040 ++iter)
3041 send_request(*iter, session);
3042
3043 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3044 // process completed requests in clientreplay stage.
3045 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3046 p != mds_requests.end();
3047 ++p) {
3048 MetaRequest *req = p->second;
3049 if (req->got_unsafe)
3050 continue;
31f18b77
FG
3051 if (req->aborted())
3052 continue;
7c673cae
FG
3053 if (req->retry_attempt == 0)
3054 continue; // old requests only
3055 if (req->mds == session->mds_num)
3056 send_request(req, session, true);
3057 }
3058}
3059
3060void Client::wait_unsafe_requests()
3061{
3062 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2
TL
3063 for (const auto &p : mds_sessions) {
3064 const MetaSession &s = p.second;
3065 if (!s.unsafe_requests.empty()) {
3066 MetaRequest *req = s.unsafe_requests.back();
7c673cae
FG
3067 req->get();
3068 last_unsafe_reqs.push_back(req);
3069 }
3070 }
3071
3072 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3073 p != last_unsafe_reqs.end();
3074 ++p) {
3075 MetaRequest *req = *p;
3076 if (req->unsafe_item.is_on_list())
3077 wait_on_list(req->waitfor_safe);
3078 put_request(req);
3079 }
3080}
3081
3082void Client::kick_requests_closed(MetaSession *session)
3083{
11fdf7f2 3084 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3085 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3086 p != mds_requests.end(); ) {
3087 MetaRequest *req = p->second;
3088 ++p;
3089 if (req->mds == session->mds_num) {
3090 if (req->caller_cond) {
3091 req->kick = true;
9f95a23c 3092 req->caller_cond->notify_all();
7c673cae
FG
3093 }
3094 req->item.remove_myself();
3095 if (req->got_unsafe) {
11fdf7f2 3096 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 3097 req->unsafe_item.remove_myself();
eafe8130
TL
3098 if (is_dir_operation(req)) {
3099 Inode *dir = req->inode();
3100 assert(dir);
f67539c2 3101 dir->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3102 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3103 << dir->ino << " " << req->get_tid() << dendl;
3104 req->unsafe_dir_item.remove_myself();
3105 }
3106 if (req->target) {
3107 InodeRef &in = req->target;
f67539c2 3108 in->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3109 lderr(cct) << "kick_requests_closed drop req of inode : "
3110 << in->ino << " " << req->get_tid() << dendl;
3111 req->unsafe_target_item.remove_myself();
3112 }
7c673cae
FG
3113 signal_cond_list(req->waitfor_safe);
3114 unregister_request(req);
3115 }
3116 }
3117 }
11fdf7f2
TL
3118 ceph_assert(session->requests.empty());
3119 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
3120}
3121
3122
3123
3124
3125/************
3126 * leases
3127 */
3128
3129void Client::got_mds_push(MetaSession *s)
3130{
3131 s->seq++;
3132 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3133 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 3134 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
3135 }
3136}
3137
// Handle a dentry-lease revocation from an MDS.
//
// Only CEPH_MDS_LEASE_REVOKE is expected here. If we still know the
// inode and dentry, invalidate the local lease; in every case a
// CEPH_MDS_LEASE_RELEASE reply is sent back.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname << dendl;
      goto revoke;
    }
    // drop the local lease on this dentry
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  {
    // always acknowledge the revoke, even if we no longer hold the lease
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3181
// Drop n references from an inode; when only the inode_map reference
// would remain, tear the inode down completely (caps, object cache,
// faked ino, and finally the map's own reference).
void Client::_put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  int left = in->get_nref();
  // caller must hold at least n refs plus the inode_map's ref
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (root == nullptr) {
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    // drop the inode_map's reference; this destroys the inode
    in->iput();
  }
}
3210
f67539c2
TL
3211void Client::delay_put_inodes(bool wakeup)
3212{
3213 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3214
3215 std::map<Inode*,int> release;
3216 {
3217 std::scoped_lock dl(delay_i_lock);
3218 release.swap(delay_i_release);
3219 }
3220
3221 if (release.empty())
3222 return;
3223
3224 for (auto &[in, cnt] : release)
3225 _put_inode(in, cnt);
3226
3227 if (wakeup)
3228 mount_cond.notify_all();
3229}
3230
3231void Client::put_inode(Inode *in, int n)
3232{
3233 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3234
3235 std::scoped_lock dl(delay_i_lock);
3236 delay_i_release[in] += n;
3237}
3238
7c673cae
FG
3239void Client::close_dir(Dir *dir)
3240{
3241 Inode *in = dir->parent_inode;
11fdf7f2
TL
3242 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3243 ceph_assert(dir->is_empty());
3244 ceph_assert(in->dir == dir);
3245 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3246 if (!in->dentries.empty())
7c673cae
FG
3247 in->get_first_parent()->put(); // unpin dentry
3248
3249 delete in->dir;
3250 in->dir = 0;
3251 put_inode(in); // unpin inode
3252}
3253
 /**
 * Link an inode into a directory under the given name.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the dentry (newly created if dn was NULL). If the inode is a
 * directory that already had a parent dentry, that old link is severed
 * first, since directories can have only one parent.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // a pre-created dentry must not already point at an inode
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      // must not be re-linking the very same dir/name pair
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3294
// Detach a dentry from its inode and, unless keepdentry is set, from
// its directory as well. A local InodeRef keeps the inode alive for
// the duration of the call. With keepdir unset, an emptied Dir is
// closed afterwards.
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the dentry around but invalidate its lease
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3325
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // pins the inode until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Called with the flush result; on failure, log and record the error
  // on the inode so a later fsync/close can report it.
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
                            << " 0x" << std::hex << inode->ino << std::dec
                            << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3347
3348
3349/****
3350 * caps
3351 */
3352
3353void Client::get_cap_ref(Inode *in, int cap)
3354{
3355 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3356 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3357 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
b3b6e05e 3358 in->iget();
7c673cae
FG
3359 }
3360 if ((cap & CEPH_CAP_FILE_CACHE) &&
3361 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3362 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
b3b6e05e 3363 in->iget();
7c673cae
FG
3364 }
3365 in->get_cap_ref(cap);
3366}
3367
// Drop cap references; when the last reference of a cap bit goes away,
// finish pending cap snaps, wake waiters, release deferred caps and
// undo the extra inode pins taken in get_cap_ref().
void Client::put_cap_ref(Inode *in, int cap)
{
  // 'last' holds the cap bits whose refcount just hit zero
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we no longer hold and that are not even issued any more
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      // matches the iget() calls made for first BUFFER/CACHE refs
      put_inode(in, put_nref);
  }
}
3401
// get caps for a given file handle -- the inode should have @need caps
// issued by the mds and @want caps not revoked (or not under revocation).
// this routine blocks till the cap requirement is satisfied. also account
// (track) for capability hit when required (when cap requirement succeedes).
//
// Returns 0 with *phave set (and cap refs taken via get_cap_ref) on
// success, or a negative CEPHFS error (-EBADF, -EIO, -EROFS, or an
// error from check_pool_perm/_renew_caps).
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // a write handle from a previous fd generation is stale
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // ask the MDS for more max_size headroom when the write end
        // approaches or exceeds what we were granted
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        // dirty snapshot data must be flushed before new writes
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        // succeed only if nothing we want is mid-revocation
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped; re-request them from the MDS if it no
      // longer records what we need as wanted
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3510
3511int Client::get_caps_used(Inode *in)
3512{
3513 unsigned used = in->caps_used();
3514 if (!(used & CEPH_CAP_FILE_CACHE) &&
3515 !objectcacher->set_is_empty(&in->oset))
3516 used |= CEPH_CAP_FILE_CACHE;
3517 return used;
3518}
3519
3520void Client::cap_delay_requeue(Inode *in)
3521{
11fdf7f2 3522 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3523 in->hold_caps_until = ceph_clock_now();
3524 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3525 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3526}
3527
// Build and send a CEPH_CAP_OP_UPDATE message for one cap: trims the
// locally issued/implemented sets to what we intend to retain, then
// reports sizes, times, dirty xattrs and flush state to the MDS.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  // never retain something that is being revoked
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // only ship xattrs when we are flushing the xattr EXCL cap
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS a cap snap flush is still outstanding, if one is
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3645
31f18b77
FG
3646static bool is_max_size_approaching(Inode *in)
3647{
3648 /* mds will adjust max size according to the reported size */
3649 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3650 return false;
3651 if (in->size >= in->max_size)
3652 return true;
3653 /* half of previous max_size increment has been used */
3654 if (in->max_size > in->reported_size &&
3655 (in->size << 1) >= in->max_size + in->reported_size)
3656 return true;
3657 return false;
3658}
7c673cae 3659
11fdf7f2
TL
3660static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3661{
3662 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3663 return used;
3664 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3665 return used;
3666
3667 if (issued & CEPH_CAP_FILE_LAZYIO) {
3668 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3669 used &= ~CEPH_CAP_FILE_CACHE;
3670 used |= CEPH_CAP_FILE_LAZYIO;
3671 }
3672 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3673 used &= ~CEPH_CAP_FILE_BUFFER;
3674 used |= CEPH_CAP_FILE_LAZYIO;
3675 }
3676 } else {
3677 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3678 used &= ~CEPH_CAP_FILE_CACHE;
3679 used |= CEPH_CAP_FILE_LAZYIO;
3680 }
3681 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3682 used &= ~CEPH_CAP_FILE_BUFFER;
3683 used |= CEPH_CAP_FILE_LAZYIO;
3684 }
3685 }
3686 return used;
3687}
3688
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // decide which caps we would like to keep hold of
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try dropping cached data if a CACHE/LAZYIO revoke is pending and
  // nothing is buffered
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    // non-auth caps don't account for usage covered by the auth cap
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need more max_size headroom from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3836
3837
3838void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3839{
3840 int used = get_caps_used(in);
3841 int dirty = in->caps_dirty();
11fdf7f2 3842 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3843
3844 if (in->cap_snaps.size() &&
3845 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3846 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3847 return;
3848 } else if (in->caps_dirty() ||
3849 (used & CEPH_CAP_FILE_WR) ||
3850 (dirty & CEPH_CAP_ANY_WR)) {
3851 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3852 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3853 CapSnap &capsnap = capsnapem.first->second;
3854 capsnap.context = old_snapc;
3855 capsnap.issued = in->caps_issued();
3856 capsnap.dirty = in->caps_dirty();
f67539c2 3857
7c673cae 3858 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
f67539c2 3859
7c673cae
FG
3860 capsnap.uid = in->uid;
3861 capsnap.gid = in->gid;
3862 capsnap.mode = in->mode;
3863 capsnap.btime = in->btime;
3864 capsnap.xattrs = in->xattrs;
3865 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3866 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3867 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
f67539c2 3868
7c673cae 3869 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3870 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3871 capsnap.writing = 1;
3872 } else {
3873 finish_cap_snap(in, capsnap, used);
3874 }
3875 } else {
11fdf7f2 3876 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3877 }
3878}
3879
3880void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3881{
11fdf7f2 3882 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3883 capsnap.size = in->size;
3884 capsnap.mtime = in->mtime;
3885 capsnap.atime = in->atime;
3886 capsnap.ctime = in->ctime;
3887 capsnap.time_warp_seq = in->time_warp_seq;
3888 capsnap.change_attr = in->change_attr;
7c673cae
FG
3889 capsnap.dirty |= in->caps_dirty();
3890
11fdf7f2
TL
3891 /* Only reset it if it wasn't set before */
3892 if (capsnap.cap_dirtier_uid == -1) {
3893 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3894 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3895 }
3896
7c673cae
FG
3897 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3898 capsnap.inline_data = in->inline_data;
3899 capsnap.inline_version = in->inline_version;
3900 }
3901
3902 if (used & CEPH_CAP_FILE_BUFFER) {
f67539c2 3903 capsnap.writing = 1;
11fdf7f2 3904 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
3905 << " WRBUFFER, delaying" << dendl;
3906 } else {
3907 capsnap.dirty_data = 0;
3908 flush_snaps(in);
3909 }
3910}
3911
eafe8130
TL
3912void Client::send_flush_snap(Inode *in, MetaSession *session,
3913 snapid_t follows, CapSnap& capsnap)
3914{
9f95a23c
TL
3915 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3916 in->ino, in->snaprealm->ino, 0,
3917 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
3918 m->caller_uid = capsnap.cap_dirtier_uid;
3919 m->caller_gid = capsnap.cap_dirtier_gid;
3920
3921 m->set_client_tid(capsnap.flush_tid);
3922 m->head.snap_follows = follows;
3923
3924 m->head.caps = capsnap.issued;
3925 m->head.dirty = capsnap.dirty;
3926
3927 m->head.uid = capsnap.uid;
3928 m->head.gid = capsnap.gid;
3929 m->head.mode = capsnap.mode;
3930 m->btime = capsnap.btime;
3931
3932 m->size = capsnap.size;
3933
3934 m->head.xattr_version = capsnap.xattr_version;
3935 encode(capsnap.xattrs, m->xattrbl);
3936
3937 m->ctime = capsnap.ctime;
3938 m->btime = capsnap.btime;
3939 m->mtime = capsnap.mtime;
3940 m->atime = capsnap.atime;
3941 m->time_warp_seq = capsnap.time_warp_seq;
3942 m->change_attr = capsnap.change_attr;
3943
3944 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3945 m->inline_version = in->inline_version;
3946 m->inline_data = in->inline_data;
3947 }
3948
3949 ceph_assert(!session->flushing_caps_tids.empty());
3950 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3951
3952 session->con->send_message2(std::move(m));
3953}
3954
3955void Client::flush_snaps(Inode *in)
7c673cae 3956{
eafe8130 3957 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
11fdf7f2 3958 ceph_assert(in->cap_snaps.size());
7c673cae
FG
3959
3960 // pick auth mds
11fdf7f2 3961 ceph_assert(in->auth_cap);
7c673cae 3962 MetaSession *session = in->auth_cap->session;
7c673cae
FG
3963
3964 for (auto &p : in->cap_snaps) {
3965 CapSnap &capsnap = p.second;
eafe8130
TL
3966 // only do new flush
3967 if (capsnap.flush_tid > 0)
3968 continue;
7c673cae
FG
3969
3970 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3971 << " follows " << p.first
3972 << " size " << capsnap.size
3973 << " mtime " << capsnap.mtime
3974 << " dirty_data=" << capsnap.dirty_data
3975 << " writing=" << capsnap.writing
3976 << " on " << *in << dendl;
3977 if (capsnap.dirty_data || capsnap.writing)
eafe8130 3978 break;
f67539c2 3979
eafe8130
TL
3980 capsnap.flush_tid = ++last_flush_tid;
3981 session->flushing_caps_tids.insert(capsnap.flush_tid);
3982 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3983 if (!in->flushing_cap_item.is_on_list())
3984 session->flushing_caps.push_back(&in->flushing_cap_item);
7c673cae 3985
eafe8130 3986 send_flush_snap(in, session, p.first, capsnap);
7c673cae
FG
3987 }
3988}
3989
9f95a23c 3990void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 3991{
9f95a23c 3992 ceph::condition_variable cond;
7c673cae 3993 ls.push_back(&cond);
9f95a23c
TL
3994 std::unique_lock l{client_lock, std::adopt_lock};
3995 cond.wait(l);
3996 l.release();
7c673cae
FG
3997 ls.remove(&cond);
3998}
3999
9f95a23c 4000void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 4001{
9f95a23c
TL
4002 for (auto cond : ls) {
4003 cond->notify_all();
4004 }
7c673cae
FG
4005}
4006
4007void Client::wait_on_context_list(list<Context*>& ls)
4008{
9f95a23c 4009 ceph::condition_variable cond;
7c673cae
FG
4010 bool done = false;
4011 int r;
9f95a23c
TL
4012 ls.push_back(new C_Cond(cond, &done, &r));
4013 std::unique_lock l{client_lock, std::adopt_lock};
4014 cond.wait(l, [&done] { return done;});
4015 l.release();
7c673cae
FG
4016}
4017
4018void Client::signal_context_list(list<Context*>& ls)
4019{
4020 while (!ls.empty()) {
4021 ls.front()->complete(0);
4022 ls.pop_front();
4023 }
4024}
4025
a8e16298 4026void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 4027{
11fdf7f2
TL
4028 for (const auto &cap : s->caps) {
4029 auto &in = cap->inode;
a8e16298 4030 if (reconnect) {
11fdf7f2
TL
4031 in.requested_max_size = 0;
4032 in.wanted_max_size = 0;
a8e16298
TL
4033 } else {
4034 if (cap->gen < s->cap_gen) {
4035 // mds did not re-issue stale cap.
4036 cap->issued = cap->implemented = CEPH_CAP_PIN;
4037 // make sure mds knows what we want.
11fdf7f2
TL
4038 if (in.caps_file_wanted() & ~cap->wanted)
4039 in.flags |= I_CAP_DROPPED;
a8e16298
TL
4040 }
4041 }
11fdf7f2 4042 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4043 }
4044}
4045
4046
4047// flush dirty data (from objectcache)
4048
4049class C_Client_CacheInvalidate : public Context {
4050private:
4051 Client *client;
4052 vinodeno_t ino;
4053 int64_t offset, length;
4054public:
4055 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4056 client(c), offset(off), length(len) {
4057 if (client->use_faked_inos())
4058 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4059 else
4060 ino = in->vino();
4061 }
4062 void finish(int r) override {
4063 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
9f95a23c 4064 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
4065 client->_async_invalidate(ino, offset, length);
4066 }
4067};
4068
4069void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4070{
f67539c2
TL
4071 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4072 if (!mref_reader.is_state_satisfied())
7c673cae 4073 return;
f67539c2 4074
11fdf7f2 4075 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
4076 ino_invalidate_cb(callback_handle, ino, off, len);
4077}
4078
4079void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4080
4081 if (ino_invalidate_cb)
4082 // we queue the invalidate, which calls the callback and decrements the ref
4083 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4084}
4085
4086void Client::_invalidate_inode_cache(Inode *in)
4087{
11fdf7f2 4088 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
4089
4090 // invalidate our userspace inode cache
94b18763 4091 if (cct->_conf->client_oc) {
7c673cae 4092 objectcacher->release_set(&in->oset);
94b18763
FG
4093 if (!objectcacher->set_is_empty(&in->oset))
4094 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4095 }
7c673cae
FG
4096
4097 _schedule_invalidate_callback(in, 0, 0);
4098}
4099
4100void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4101{
11fdf7f2 4102 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
4103
4104 // invalidate our userspace inode cache
4105 if (cct->_conf->client_oc) {
4106 vector<ObjectExtent> ls;
4107 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 4108 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
4109 }
4110
4111 _schedule_invalidate_callback(in, off, len);
4112}
4113
4114bool Client::_release(Inode *in)
4115{
4116 ldout(cct, 20) << "_release " << *in << dendl;
4117 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4118 _invalidate_inode_cache(in);
4119 return true;
4120 }
4121 return false;
4122}
4123
4124bool Client::_flush(Inode *in, Context *onfinish)
4125{
4126 ldout(cct, 10) << "_flush " << *in << dendl;
4127
4128 if (!in->oset.dirty_or_tx) {
4129 ldout(cct, 10) << " nothing to flush" << dendl;
4130 onfinish->complete(0);
4131 return true;
4132 }
4133
4134 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 4135 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
4136 objectcacher->purge_set(&in->oset);
4137 if (onfinish) {
f67539c2 4138 onfinish->complete(-CEPHFS_ENOSPC);
7c673cae
FG
4139 }
4140 return true;
4141 }
4142
4143 return objectcacher->flush_set(&in->oset, onfinish);
4144}
4145
4146void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4147{
f67539c2 4148 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
4149 if (!in->oset.dirty_or_tx) {
4150 ldout(cct, 10) << " nothing to flush" << dendl;
4151 return;
4152 }
4153
11fdf7f2 4154 C_SaferCond onflush("Client::_flush_range flock");
7c673cae 4155 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
11fdf7f2 4156 offset, size, &onflush);
7c673cae
FG
4157 if (!ret) {
4158 // wait for flush
9f95a23c 4159 client_lock.unlock();
11fdf7f2 4160 onflush.wait();
9f95a23c 4161 client_lock.lock();
7c673cae
FG
4162 }
4163}
4164
4165void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4166{
f67539c2
TL
4167 // std::scoped_lock l(client_lock);
4168 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
7c673cae 4169 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 4170 ceph_assert(in);
7c673cae
FG
4171 _flushed(in);
4172}
4173
4174void Client::_flushed(Inode *in)
4175{
4176 ldout(cct, 10) << "_flushed " << *in << dendl;
4177
4178 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4179}
4180
4181
4182
4183// checks common to add_update_cap, handle_cap_grant
11fdf7f2 4184void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
4185{
4186 unsigned had = in->caps_issued();
4187
4188 if ((issued & CEPH_CAP_FILE_CACHE) &&
4189 !(had & CEPH_CAP_FILE_CACHE))
4190 in->cache_gen++;
4191
f91f0fd5
TL
4192 if ((issued & CEPH_CAP_FILE_SHARED) !=
4193 (had & CEPH_CAP_FILE_SHARED)) {
4194 if (issued & CEPH_CAP_FILE_SHARED)
4195 in->shared_gen++;
7c673cae
FG
4196 if (in->is_dir())
4197 clear_dir_complete_and_ordered(in, true);
4198 }
4199}
4200
4201void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
a8e16298
TL
4202 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4203 inodeno_t realm, int flags, const UserPerm& cap_perms)
7c673cae 4204{
11fdf7f2
TL
4205 if (!in->is_any_caps()) {
4206 ceph_assert(in->snaprealm == 0);
4207 in->snaprealm = get_snap_realm(realm);
4208 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4209 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4210 } else {
4211 ceph_assert(in->snaprealm);
4212 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4213 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4214 in->snaprealm_item.remove_myself();
4215 auto oldrealm = in->snaprealm;
4216 in->snaprealm = get_snap_realm(realm);
4217 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4218 put_snap_realm(oldrealm);
4219 }
4220 }
4221
7c673cae 4222 mds_rank_t mds = mds_session->mds_num;
11fdf7f2
TL
4223 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4224 Cap &cap = capem.first->second;
4225 if (!capem.second) {
4226 if (cap.gen < mds_session->cap_gen)
4227 cap.issued = cap.implemented = CEPH_CAP_PIN;
7c673cae
FG
4228
4229 /*
4230 * auth mds of the inode changed. we received the cap export
4231 * message, but still haven't received the cap import message.
4232 * handle_cap_export() updated the new auth MDS' cap.
4233 *
4234 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4235 * a message that was send before the cap import message. So
4236 * don't remove caps.
4237 */
11fdf7f2 4238 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
92f5a8d4
TL
4239 if (&cap != in->auth_cap)
4240 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4241
11fdf7f2
TL
4242 ceph_assert(cap.cap_id == cap_id);
4243 seq = cap.seq;
4244 mseq = cap.mseq;
4245 issued |= cap.issued;
7c673cae
FG
4246 flags |= CEPH_CAP_FLAG_AUTH;
4247 }
f67539c2
TL
4248 } else {
4249 inc_pinned_icaps();
7c673cae
FG
4250 }
4251
11fdf7f2 4252 check_cap_issue(in, issued);
7c673cae
FG
4253
4254 if (flags & CEPH_CAP_FLAG_AUTH) {
11fdf7f2 4255 if (in->auth_cap != &cap &&
7c673cae
FG
4256 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4257 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
11fdf7f2 4258 ldout(cct, 10) << __func__ << " changing auth cap: "
7c673cae
FG
4259 << "add myself to new auth MDS' flushing caps list" << dendl;
4260 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4261 }
11fdf7f2 4262 in->auth_cap = &cap;
7c673cae
FG
4263 }
4264 }
4265
11fdf7f2
TL
4266 unsigned old_caps = cap.issued;
4267 cap.cap_id = cap_id;
4268 cap.issued = issued;
4269 cap.implemented |= issued;
4270 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4271 cap.wanted = wanted;
a8e16298 4272 else
11fdf7f2
TL
4273 cap.wanted |= wanted;
4274 cap.seq = seq;
4275 cap.issue_seq = seq;
4276 cap.mseq = mseq;
4277 cap.gen = mds_session->cap_gen;
4278 cap.latest_perms = cap_perms;
4279 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4280 << " from mds." << mds
4281 << " on " << *in
4282 << dendl;
4283
4284 if ((issued & ~old_caps) && in->auth_cap == &cap) {
7c673cae 4285 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
4286 for (auto &p : in->caps) {
4287 if (&p.second == &cap)
7c673cae 4288 continue;
11fdf7f2 4289 if (p.second.implemented & ~p.second.issued & issued) {
7c673cae
FG
4290 check_caps(in, CHECK_CAPS_NODELAY);
4291 break;
4292 }
4293 }
4294 }
4295
4296 if (issued & ~old_caps)
4297 signal_cond_list(in->waitfor_caps);
4298}
4299
4300void Client::remove_cap(Cap *cap, bool queue_release)
4301{
11fdf7f2 4302 auto &in = cap->inode;
7c673cae
FG
4303 MetaSession *session = cap->session;
4304 mds_rank_t mds = cap->session->mds_num;
4305
11fdf7f2 4306 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
7c673cae
FG
4307
4308 if (queue_release) {
4309 session->enqueue_cap_release(
11fdf7f2 4310 in.ino,
7c673cae
FG
4311 cap->cap_id,
4312 cap->issue_seq,
4313 cap->mseq,
4314 cap_epoch_barrier);
f67539c2
TL
4315 } else {
4316 dec_pinned_icaps();
7c673cae
FG
4317 }
4318
f67539c2 4319
11fdf7f2
TL
4320 if (in.auth_cap == cap) {
4321 if (in.flushing_cap_item.is_on_list()) {
7c673cae 4322 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
11fdf7f2 4323 in.flushing_cap_item.remove_myself();
7c673cae 4324 }
11fdf7f2 4325 in.auth_cap = NULL;
7c673cae 4326 }
11fdf7f2
TL
4327 size_t n = in.caps.erase(mds);
4328 ceph_assert(n == 1);
7c673cae
FG
4329 cap = nullptr;
4330
11fdf7f2
TL
4331 if (!in.is_any_caps()) {
4332 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4333 in.snaprealm_item.remove_myself();
4334 put_snap_realm(in.snaprealm);
4335 in.snaprealm = 0;
7c673cae
FG
4336 }
4337}
4338
4339void Client::remove_all_caps(Inode *in)
4340{
4341 while (!in->caps.empty())
11fdf7f2 4342 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4343}
4344
f6b5b4d7 4345void Client::remove_session_caps(MetaSession *s, int err)
7c673cae 4346{
11fdf7f2 4347 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
7c673cae
FG
4348
4349 while (s->caps.size()) {
4350 Cap *cap = *s->caps.begin();
11fdf7f2 4351 InodeRef in(&cap->inode);
eafe8130 4352 bool dirty_caps = false;
7c673cae 4353 if (in->auth_cap == cap) {
7c673cae
FG
4354 dirty_caps = in->dirty_caps | in->flushing_caps;
4355 in->wanted_max_size = 0;
4356 in->requested_max_size = 0;
f6b5b4d7
TL
4357 if (in->has_any_filelocks())
4358 in->flags |= I_ERROR_FILELOCK;
7c673cae 4359 }
f6b5b4d7 4360 auto caps = cap->implemented;
a8e16298
TL
4361 if (cap->wanted | cap->issued)
4362 in->flags |= I_CAP_DROPPED;
7c673cae 4363 remove_cap(cap, false);
eafe8130 4364 in->cap_snaps.clear();
7c673cae 4365 if (dirty_caps) {
11fdf7f2 4366 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
7c673cae
FG
4367 if (in->flushing_caps) {
4368 num_flushing_caps--;
4369 in->flushing_cap_tids.clear();
4370 }
4371 in->flushing_caps = 0;
28e407b8 4372 in->mark_caps_clean();
11fdf7f2 4373 put_inode(in.get());
7c673cae 4374 }
f6b5b4d7
TL
4375 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4376 if (caps && !in->caps_issued_mask(caps, true)) {
f67539c2 4377 if (err == -CEPHFS_EBLOCKLISTED) {
f6b5b4d7
TL
4378 if (in->oset.dirty_or_tx) {
4379 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4380 in->set_async_err(err);
4381 }
4382 objectcacher->purge_set(&in->oset);
4383 } else {
4384 objectcacher->release_set(&in->oset);
4385 }
4386 _schedule_invalidate_callback(in.get(), 0, 0);
4387 }
4388
a8e16298 4389 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4390 }
4391 s->flushing_caps_tids.clear();
9f95a23c 4392 sync_cond.notify_all();
7c673cae
FG
4393}
4394
91327a77 4395int Client::_do_remount(bool retry_on_error)
b32b8144 4396{
adb31ebb 4397 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4398
b32b8144
FG
4399 errno = 0;
4400 int r = remount_cb(callback_handle);
91327a77
AA
4401 if (r == 0) {
4402 retries_on_invalidate = 0;
4403 } else {
b32b8144
FG
4404 int e = errno;
4405 client_t whoami = get_nodeid();
4406 if (r == -1) {
4407 lderr(cct) <<
4408 "failed to remount (to trim kernel dentries): "
4409 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4410 } else {
4411 lderr(cct) <<
4412 "failed to remount (to trim kernel dentries): "
4413 "return code = " << r << dendl;
4414 }
91327a77 4415 bool should_abort =
11fdf7f2
TL
4416 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4417 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4418 !(retry_on_error && (++retries_on_invalidate < max_retries));
f67539c2 4419 if (should_abort && !is_unmounting()) {
b32b8144
FG
4420 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4421 ceph_abort();
4422 }
4423 }
4424 return r;
4425}
4426
7c673cae
FG
4427class C_Client_Remount : public Context {
4428private:
4429 Client *client;
4430public:
4431 explicit C_Client_Remount(Client *c) : client(c) {}
4432 void finish(int r) override {
11fdf7f2 4433 ceph_assert(r == 0);
91327a77 4434 client->_do_remount(true);
7c673cae
FG
4435 }
4436};
4437
4438void Client::_invalidate_kernel_dcache()
4439{
f67539c2
TL
4440 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4441 if (!mref_reader.is_state_satisfied())
7c673cae 4442 return;
f67539c2 4443
94b18763
FG
4444 if (can_invalidate_dentries) {
4445 if (dentry_invalidate_cb && root->dir) {
4446 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4447 p != root->dir->dentries.end();
4448 ++p) {
4449 if (p->second->inode)
4450 _schedule_invalidate_dentry_callback(p->second, false);
4451 }
7c673cae
FG
4452 }
4453 } else if (remount_cb) {
4454 // Hacky:
4455 // when remounting a file system, linux kernel trims all unused dentries in the fs
4456 remount_finisher.queue(new C_Client_Remount(this));
4457 }
4458}
4459
91327a77
AA
4460void Client::_trim_negative_child_dentries(InodeRef& in)
4461{
4462 if (!in->is_dir())
4463 return;
4464
4465 Dir* dir = in->dir;
4466 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4467 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4468 Dentry *dn = p->second;
4469 ++p;
11fdf7f2 4470 ceph_assert(!dn->inode);
91327a77
AA
4471 if (dn->lru_is_expireable())
4472 unlink(dn, true, false); // keep dir, drop dentry
4473 }
4474 if (dir->dentries.empty()) {
4475 close_dir(dir);
4476 }
4477 }
4478
4479 if (in->flags & I_SNAPDIR_OPEN) {
4480 InodeRef snapdir = open_snapdir(in.get());
4481 _trim_negative_child_dentries(snapdir);
4482 }
4483}
4484
e306af50
TL
4485class C_Client_CacheRelease : public Context {
4486private:
4487 Client *client;
4488 vinodeno_t ino;
4489public:
4490 C_Client_CacheRelease(Client *c, Inode *in) :
4491 client(c) {
4492 if (client->use_faked_inos())
4493 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4494 else
4495 ino = in->vino();
4496 }
4497 void finish(int r) override {
4498 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4499 client->_async_inode_release(ino);
4500 }
4501};
4502
4503void Client::_async_inode_release(vinodeno_t ino)
4504{
f67539c2
TL
4505 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4506 if (!mref_reader.is_state_satisfied())
e306af50 4507 return;
f67539c2 4508
e306af50
TL
4509 ldout(cct, 10) << __func__ << " " << ino << dendl;
4510 ino_release_cb(callback_handle, ino);
4511}
4512
4513void Client::_schedule_ino_release_callback(Inode *in) {
4514
4515 if (ino_release_cb)
4516 // we queue the invalidate, which calls the callback and decrements the ref
4517 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4518}
4519
28e407b8 4520void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4521{
4522 mds_rank_t mds = s->mds_num;
28e407b8 4523 size_t caps_size = s->caps.size();
11fdf7f2 4524 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4525 << " caps " << caps_size << dendl;
4526
28e407b8
AA
4527 uint64_t trimmed = 0;
4528 auto p = s->caps.begin();
4529 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4530 * looking at from getting deleted during traversal. */
7c673cae
FG
4531 while ((caps_size - trimmed) > max && !p.end()) {
4532 Cap *cap = *p;
11fdf7f2 4533 InodeRef in(&cap->inode);
7c673cae
FG
4534
4535 // Increment p early because it will be invalidated if cap
4536 // is deleted inside remove_cap
4537 ++p;
4538
4539 if (in->caps.size() > 1 && cap != in->auth_cap) {
4540 int mine = cap->issued | cap->implemented;
4541 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4542 // disposable non-auth cap
b32b8144 4543 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4544 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4545 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4546 trimmed++;
4547 }
4548 } else {
4549 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4550 _trim_negative_child_dentries(in);
7c673cae 4551 bool all = true;
11fdf7f2
TL
4552 auto q = in->dentries.begin();
4553 while (q != in->dentries.end()) {
4554 Dentry *dn = *q;
4555 ++q;
7c673cae
FG
4556 if (dn->lru_is_expireable()) {
4557 if (can_invalidate_dentries &&
b3b6e05e 4558 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
7c673cae
FG
4559 // Only issue one of these per DN for inodes in root: handle
4560 // others more efficiently by calling for root-child DNs at
4561 // the end of this function.
4562 _schedule_invalidate_dentry_callback(dn, true);
4563 }
28e407b8
AA
4564 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4565 to_trim.insert(dn);
7c673cae
FG
4566 } else {
4567 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4568 all = false;
4569 }
4570 }
b3b6e05e 4571 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
f91f0fd5
TL
4572 _schedule_ino_release_callback(in.get());
4573 }
b3b6e05e 4574 if (all && in->ino != CEPH_INO_ROOT) {
7c673cae
FG
4575 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4576 trimmed++;
4577 }
4578 }
4579 }
28e407b8
AA
4580 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4581 for (const auto &dn : to_trim) {
4582 trim_dentry(dn);
4583 }
4584 to_trim.clear();
7c673cae 4585
b32b8144 4586 caps_size = s->caps.size();
11fdf7f2 4587 if (caps_size > (size_t)max)
7c673cae
FG
4588 _invalidate_kernel_dcache();
4589}
4590
4591void Client::force_session_readonly(MetaSession *s)
4592{
4593 s->readonly = true;
4594 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4595 auto &in = (*p)->inode;
4596 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4597 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4598 }
4599}
4600
7c673cae
FG
4601int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4602{
4603 MetaSession *session = in->auth_cap->session;
4604
4605 int flushing = in->dirty_caps;
11fdf7f2 4606 ceph_assert(flushing);
7c673cae
FG
4607
4608 ceph_tid_t flush_tid = ++last_flush_tid;
4609 in->flushing_cap_tids[flush_tid] = flushing;
4610
4611 if (!in->flushing_caps) {
11fdf7f2 4612 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4613 num_flushing_caps++;
4614 } else {
11fdf7f2 4615 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4616 }
4617
4618 in->flushing_caps |= flushing;
28e407b8 4619 in->mark_caps_clean();
7c673cae
FG
4620
4621 if (!in->flushing_cap_item.is_on_list())
4622 session->flushing_caps.push_back(&in->flushing_cap_item);
4623 session->flushing_caps_tids.insert(flush_tid);
4624
4625 *ptid = flush_tid;
4626 return flushing;
4627}
4628
4629void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4630{
4631 for (auto &p : in->cap_snaps) {
4632 CapSnap &capsnap = p.second;
4633 if (capsnap.flush_tid > 0) {
4634 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4635 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4636 }
4637 }
4638 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4639 it != in->flushing_cap_tids.end();
4640 ++it) {
4641 old_s->flushing_caps_tids.erase(it->first);
4642 new_s->flushing_caps_tids.insert(it->first);
4643 }
4644 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4645}
4646
4647/*
4648 * Flush all caps back to the MDS. Because the callers generally wait on the
4649 * result of this function (syncfs and umount cases), we set
4650 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4651 */
4652void Client::flush_caps_sync()
4653{
4654 ldout(cct, 10) << __func__ << dendl;
28e407b8 4655 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4656 while (!p.end()) {
4657 unsigned flags = CHECK_CAPS_NODELAY;
4658 Inode *in = *p;
4659
4660 ++p;
28e407b8
AA
4661 delayed_list.pop_front();
4662 if (p.end() && dirty_list.empty())
7c673cae
FG
4663 flags |= CHECK_CAPS_SYNCHRONOUS;
4664 check_caps(in, flags);
4665 }
4666
4667 // other caps, too
28e407b8 4668 p = dirty_list.begin();
7c673cae
FG
4669 while (!p.end()) {
4670 unsigned flags = CHECK_CAPS_NODELAY;
4671 Inode *in = *p;
4672
4673 ++p;
4674 if (p.end())
4675 flags |= CHECK_CAPS_SYNCHRONOUS;
4676 check_caps(in, flags);
4677 }
4678}
4679
7c673cae
FG
4680void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4681{
4682 while (in->flushing_caps) {
4683 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
11fdf7f2 4684 ceph_assert(it != in->flushing_cap_tids.end());
7c673cae
FG
4685 if (it->first > want)
4686 break;
11fdf7f2 4687 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
7c673cae
FG
4688 << ccap_string(it->second) << " want " << want
4689 << " last " << it->first << dendl;
4690 wait_on_list(in->waitfor_caps);
4691 }
4692}
4693
// Block until no session has an outstanding cap-flush tid <= want.
// sync_cond is notified by the flush-ack handlers.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // client_lock is already held by the caller: adopt it into a
      // unique_lock so sync_cond can atomically unlock/relock around the
      // wait, then release() ownership so we don't unlock on scope exit.
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      // sessions/tids may have changed while we slept; rescan from scratch
      goto retry;
    }
  }
}
4714
// Re-send every pending cap flush (normal and capsnap) for this inode to
// its auth MDS, e.g. after reconnect.  Ordering with capsnap flushes is
// preserved via MClientCaps::FLAG_PENDING_CAPSNAP.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // Find the newest capsnap flush tid; normal cap flushes with smaller
  // tids must tell the MDS a capsnap flush is still pending behind them.
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    // entries whose value (dirty-cap bits) is 0 are capsnap flushes
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // normal cap flush: re-send with its original tid and dirty bits
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // capsnap flush: cap_snaps and the zero-value tids advance in lockstep
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4748
7c673cae
FG
4749void Client::kick_flushing_caps(MetaSession *session)
4750{
4751 mds_rank_t mds = session->mds_num;
11fdf7f2 4752 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4753
4754 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4755 Inode *in = *p;
eafe8130
TL
4756 if (in->flags & I_KICK_FLUSH) {
4757 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4758 kick_flushing_caps(in, session);
4759 }
7c673cae 4760 }
7c673cae
FG
4761}
4762
// Called at the start of MDS reconnect, before send_reconnect().  For
// inodes whose flushing caps were revoked, the flush must be re-sent now
// (with reset sequence numbers) so the MDS processes it before handing
// those caps to another client; otherwise the reflush is deferred to
// kick_flushing_caps() by setting I_KICK_FLUSH.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      // nothing revoked: defer to the normal (post-reconnect) kick
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4790
7c673cae
FG
4791void SnapRealm::build_snap_context()
4792{
4793 set<snapid_t> snaps;
4794 snapid_t max_seq = seq;
4795
4796 // start with prior_parents?
4797 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4798 snaps.insert(prior_parent_snaps[i]);
4799
4800 // current parent's snaps
4801 if (pparent) {
4802 const SnapContext& psnapc = pparent->get_snap_context();
4803 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4804 if (psnapc.snaps[i] >= parent_since)
4805 snaps.insert(psnapc.snaps[i]);
4806 if (psnapc.seq > max_seq)
4807 max_seq = psnapc.seq;
4808 }
4809
4810 // my snaps
4811 for (unsigned i=0; i<my_snaps.size(); i++)
4812 snaps.insert(my_snaps[i]);
4813
4814 // ok!
4815 cached_snap_context.seq = max_seq;
4816 cached_snap_context.snaps.resize(0);
4817 cached_snap_context.snaps.reserve(snaps.size());
4818 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4819 cached_snap_context.snaps.push_back(*p);
4820}
4821
4822void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4823{
4824 list<SnapRealm*> q;
4825 q.push_back(realm);
4826
4827 while (!q.empty()) {
4828 realm = q.front();
4829 q.pop_front();
4830
11fdf7f2 4831 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4832 realm->invalidate_cache();
4833
4834 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4835 p != realm->pchildren.end();
4836 ++p)
4837 q.push_back(*p);
4838 }
4839}
4840
4841SnapRealm *Client::get_snap_realm(inodeno_t r)
4842{
4843 SnapRealm *realm = snap_realms[r];
4844 if (!realm)
4845 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4846 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4847 realm->nref++;
4848 return realm;
4849}
4850
4851SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4852{
4853 if (snap_realms.count(r) == 0) {
11fdf7f2 4854 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4855 return NULL;
4856 }
4857 SnapRealm *realm = snap_realms[r];
11fdf7f2 4858 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4859 realm->nref++;
4860 return realm;
4861}
4862
// Drop one reference on `realm`.  On the last reference, remove it from
// snap_realms, detach from (and unref) its parent, and free it.  The
// parent unref may recurse and free ancestors too.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4876
4877bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4878{
4879 if (realm->parent != parent) {
11fdf7f2 4880 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4881 << " " << realm->parent << " -> " << parent << dendl;
4882 realm->parent = parent;
4883 if (realm->pparent) {
4884 realm->pparent->pchildren.erase(realm);
4885 put_snap_realm(realm->pparent);
4886 }
4887 realm->pparent = get_snap_realm(parent);
4888 realm->pparent->pchildren.insert(realm);
4889 return true;
4890 }
4891 return false;
4892}
4893
4894static bool has_new_snaps(const SnapContext& old_snapc,
4895 const SnapContext& new_snapc)
4896{
4897 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4898}
4899
4900
// Apply a snap trace (a sequence of encoded SnapRealmInfo) from the MDS.
// For each realm whose seq advanced we (optionally, if `flush`) snapshot
// the *old* snap context of the realm and its children so dirty caps can
// be written back against pre-update snap info, then update the realm and
// invalidate cached contexts.  If realm_ret is non-NULL the first realm
// in the trace is returned with a reference held; otherwise it is put.
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  // realms whose seq advanced, mapped to their *pre-update* snap context
  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());  // takes a ref

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
	       << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;  // extra ref held until the flush loop below
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
	       << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;  // keep the ref for the caller / final put
    else
      put_snap_realm(realm);
  }

  // queue cap snaps against the saved (old) snap contexts where new
  // snapshots appeared, then drop the extra refs taken above
  for (auto &[realm, snapc] : dirty_realms) {
    // if there are new snaps ?
    if (has_new_snaps(snapc, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      for (auto&& in : realm->inodes_with_caps) {
	queue_cap_snap(in, snapc);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  // NOTE(review): assumes bl decodes at least one realm; with an empty
  // trace and realm_ret == NULL this would pass NULL to put_snap_realm()
  // and dereference it — confirm callers never pass an empty trace.
  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4987
// Handle an MClientSnap message from an MDS.  For a SPLIT op, inodes and
// child realms listed in the message are moved from their current realm
// into the newly split-off realm (remembering each inode's old snap
// context so new snapshots can be detected); then the embedded snap trace
// is applied, and finally moved inodes are re-linked and queued for snap
// writeback where needed.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;  // stale message from a connection we no longer track
  }

  got_mds_push(session);

  // inode -> its pre-split snap context, for the re-link pass below
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// an inode already in a newer realm must not be pulled backwards
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
		   << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // on DESTROY don't flush caps against the (now-gone) old snap info
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-link the moved inodes into the split realm and queue snap
    // writeback for any that gained new snapshots
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
5060
11fdf7f2 5061void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
5062{
5063 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5064
5065 std::scoped_lock cl(client_lock);
7c673cae
FG
5066 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5067 if (!session) {
7c673cae
FG
5068 return;
5069 }
5070
5071 got_mds_push(session);
5072
11fdf7f2 5073 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
5074
5075 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5076 if (inode_map.count(vino)) {
5077 Inode *in = NULL;
5078 in = inode_map[vino];
5079
5080 if (in) {
5081 in->quota = m->quota;
5082 in->rstat = m->rstat;
5083 }
5084 }
7c673cae
FG
5085}
5086
11fdf7f2 5087void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
5088{
5089 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5090
5091 std::scoped_lock cl(client_lock);
7c673cae
FG
5092 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5093 if (!session) {
7c673cae
FG
5094 return;
5095 }
5096
5097 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5098 // Pause RADOS operations until we see the required epoch
5099 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5100 }
5101
5102 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5103 // Record the barrier so that we will transmit it to MDS when releasing
5104 set_cap_epoch_barrier(m->osd_epoch_barrier);
5105 }
5106
5107 got_mds_push(session);
5108
11fdf7f2 5109 Inode *in;
7c673cae 5110 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
5111 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5112 in = it->second;
5113 } else {
7c673cae 5114 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 5115 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
5116 session->enqueue_cap_release(
5117 m->get_ino(),
5118 m->get_cap_id(),
5119 m->get_seq(),
5120 m->get_mseq(),
5121 cap_epoch_barrier);
5122 } else {
11fdf7f2 5123 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 5124 }
7c673cae
FG
5125
5126 // in case the mds is waiting on e.g. a revocation
5127 flush_cap_releases();
5128 return;
5129 }
5130
5131 switch (m->get_op()) {
11fdf7f2
TL
5132 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
5133 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
5134 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
7c673cae
FG
5135 }
5136
11fdf7f2
TL
5137 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5138 Cap &cap = in->caps.at(mds);
7c673cae 5139
11fdf7f2
TL
5140 switch (m->get_op()) {
5141 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
5142 case CEPH_CAP_OP_IMPORT:
5143 case CEPH_CAP_OP_REVOKE:
5144 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
5145 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
5146 }
5147 } else {
5148 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5149 return;
7c673cae
FG
5150 }
5151}
5152
// Handle CEPH_CAP_OP_IMPORT: the sending MDS is now authoritative for
// this inode's caps.  Remember the old cap from the peer (exporting) MDS,
// apply the snap trace, install/refresh the auth cap, drop the peer cap,
// and if we hold the auth cap re-kick any pending flushes.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // carry the old cap's credentials over to the imported cap
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // retire the cap held on the exporting MDS (if it still matches)
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // our previously requested max_size is meaningless to the new auth MDS
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5195
// Handle CEPH_CAP_OP_EXPORT: the sending MDS is migrating this inode's
// cap to a peer MDS.  If we already hold a (stale) cap from the peer,
// fold the exported cap into it; otherwise install a fresh cap on the
// peer session.  Either way the cap on the exporting MDS is removed.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// migrating to a specific peer cap
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // only merge if the peer's cap is older than the migration point
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    // pending flushes must follow the auth cap to the new session
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: the cap is simply being dropped by the MDS
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5240
11fdf7f2 5241void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5242{
5243 mds_rank_t mds = session->mds_num;
11fdf7f2 5244 ceph_assert(in->caps.count(mds));
7c673cae 5245
11fdf7f2 5246 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
5247 << " size " << in->size << " -> " << m->get_size()
5248 << dendl;
5249
1adf2230
AA
5250 int issued;
5251 in->caps_issued(&issued);
5252 issued |= in->caps_dirty();
5253 update_inode_file_size(in, issued, m->get_size(),
5254 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
5255}
5256
11fdf7f2 5257void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5258{
5259 ceph_tid_t flush_ack_tid = m->get_client_tid();
5260 int dirty = m->get_dirty();
5261 int cleaned = 0;
5262 int flushed = 0;
5263
11fdf7f2
TL
5264 auto it = in->flushing_cap_tids.begin();
5265 if (it->first < flush_ack_tid) {
5266 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5267 << " got unexpected flush ack tid " << flush_ack_tid
5268 << " expected is " << it->first << dendl;
5269 }
5270 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5271 if (!it->second) {
5272 // cap snap
5273 ++it;
5274 continue;
5275 }
7c673cae
FG
5276 if (it->first == flush_ack_tid)
5277 cleaned = it->second;
5278 if (it->first <= flush_ack_tid) {
5279 session->flushing_caps_tids.erase(it->first);
5280 in->flushing_cap_tids.erase(it++);
5281 ++flushed;
5282 continue;
5283 }
5284 cleaned &= ~it->second;
5285 if (!cleaned)
5286 break;
5287 ++it;
5288 }
5289
11fdf7f2 5290 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5291 << " cleaned " << ccap_string(cleaned) << " on " << *in
5292 << " with " << ccap_string(dirty) << dendl;
5293
5294 if (flushed) {
5295 signal_cond_list(in->waitfor_caps);
5296 if (session->flushing_caps_tids.empty() ||
5297 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5298 sync_cond.notify_all();
7c673cae
FG
5299 }
5300
5301 if (!dirty) {
5302 in->cap_dirtier_uid = -1;
5303 in->cap_dirtier_gid = -1;
5304 }
5305
5306 if (!cleaned) {
5307 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5308 } else {
5309 if (in->flushing_caps) {
5310 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5311 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5312 in->flushing_caps &= ~cleaned;
5313 if (in->flushing_caps == 0) {
5314 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5315 num_flushing_caps--;
eafe8130 5316 if (in->flushing_cap_tids.empty())
7c673cae
FG
5317 in->flushing_cap_item.remove_myself();
5318 }
5319 if (!in->caps_dirty())
5320 put_inode(in);
5321 }
5322 }
7c673cae
FG
5323}
5324
5325
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS persisted a snapped-cap
// (CapSnap) flush.  Retire the matching capsnap and its flush tid, then
// wake waiters.  Duplicate acks (e.g. after resends) are tolerated.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older resend of this capsnap; ignore it
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // hold a ref: erasing the capsnap may drop the inode's last ref
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake wait_sync_caps(want) waiters if this session's oldest tid moved
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5358
// Context that delivers a dentry-invalidate upcall via
// Client::_async_dentry_invalidate on the async invalidator thread.
// Snapshots the (possibly faked) inode numbers and name at construction
// time, since the Dentry may be gone by the time finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory vino (faked ino if enabled)
  vinodeno_t ino;     // target inode vino; null ino when del == false
  string name;        // dentry name within dirino
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5386
// Invoke the registered dentry-invalidate callback for (dirino, name).
// Runs on the async invalidator thread without client_lock; bails out
// unless the client is at least mounting.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5397
5398void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5399{
5400 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5401 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5402}
5403
// Try to release the cached state pinning `in` so its refcount can reach
// zero: expire child dentries (recursing into snapped subdirs), close an
// empty Dir, trim an open snapdir, and finally unlink the inode's own
// dentries (optionally scheduling kernel dcache invalidation).
// `ref` tracks the expected remaining references as pins are dropped.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // the Dir held one reference
    }
  }

  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    // trim the .snap directory pinning us
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    // still pinned: drop the inode's own dentries
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5450
// Handle CEPH_CAP_OP_GRANT / REVOKE (and the tail of IMPORT): the MDS is
// changing the set of caps issued to us on `cap`.  Updates cached inode
// metadata covered by the newly shared caps, then either accepts a grant
// or begins releasing revoked caps (flushing/releasing cached file data
// first where needed), and finally re-runs check_caps and wakes waiters.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // a cap from before the session went stale was implicitly revoked
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only take MDS-provided metadata for fields we don't hold EXCL on
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // drop cached reads before acking the revocation
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5606
// POSIX-style permission check for `perms` against `in` for the access
// bits in `want` (MAY_READ/MAY_WRITE/MAY_EXEC).  Returns 0 on success or
// -CEPHFS_EACCES.  Root bypasses mode checks except that exec still
// requires at least one exec bit on the file.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0) {
    // Executable are overridable when there is at least one exec bit set
    if((want & MAY_EXEC) && !(in->mode & S_IXUGO))
      return -CEPHFS_EACCES;
    return 0;
  }

  // non-owner with group bits present: consult the POSIX ACL first;
  // CEPHFS_EAGAIN means "no ACL decision, fall back to mode bits"
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -CEPHFS_EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -CEPHFS_EACCES;
  return 0;
}
5627
// Permission check for xattr access.  Writes to "system." xattrs are
// restricted to root and the file owner; everything else follows the
// normal inode permission rules.  Returns 0 or a negative CEPHFS_ error.
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  // refresh the attributes we are about to check against
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -CEPHFS_EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5646
5647ostream& operator<<(ostream &out, const UserPerm& perm) {
5648 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5649 return out;
5650}
5651
// Validate that `perms` is allowed to apply the setattr described by
// `stx`/`mask` to inode `in`, mirroring POSIX chown/chmod/utimes rules.
// May clear S_ISGID in stx->stx_mode as a side effect (see below).
// Returns 0 if permitted, -CEPHFS_EPERM/-CEPHFS_EACCES otherwise.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncate requires write permission on the file itself
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // default to EPERM for the ownership/mode/time checks below
  r = -CEPHFS_EPERM;
  // chown: only root may change the owner (a no-op "change" to the same
  // uid by the owner is allowed)
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: the owner may switch to a group they belong to; root may do anything
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod: root or owner only
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // if the caller is not in the file's (new) group, silently drop the
    // setgid bit — same behavior as the kernel's setattr_prepare()
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owner may only set times to "now" (utimes(NULL) semantics),
      // and then only with write permission; explicit timestamps are EPERM
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5708
// Check whether `perms` may open inode `in` with the given open(2) flags.
// Maps O_ACCMODE/O_TRUNC to MAY_* bits, rejects symlinks (-CEPHFS_ELOOP)
// and directory writes (-CEPHFS_EISDIR), then defers to inode_permission().
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  // translate the access mode into MAY_* permission bits
  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  // O_TRUNC modifies the file even with O_RDONLY
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      // opening a symlink itself is never allowed
      r = -CEPHFS_ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
	r = -CEPHFS_EISDIR;
	goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5745
5746int Client::may_lookup(Inode *dir, const UserPerm& perms)
5747{
181888fb 5748 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5749 int r = _getattr_for_perm(dir, perms);
5750 if (r < 0)
5751 goto out;
5752
5753 r = inode_permission(dir, perms, MAY_EXEC);
5754out:
5755 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5756 return r;
5757}
5758
5759int Client::may_create(Inode *dir, const UserPerm& perms)
5760{
181888fb 5761 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5762 int r = _getattr_for_perm(dir, perms);
5763 if (r < 0)
5764 goto out;
5765
5766 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5767out:
5768 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5769 return r;
5770}
5771
5772int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5773{
181888fb 5774 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5775 int r = _getattr_for_perm(dir, perms);
5776 if (r < 0)
5777 goto out;
5778
5779 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5780 if (r < 0)
5781 goto out;
5782
f67539c2 5783 /* 'name == NULL' means rmsnap w/o permission checks */
7c673cae
FG
5784 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5785 InodeRef otherin;
5786 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5787 if (r < 0)
5788 goto out;
5789 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
f67539c2 5790 r = -CEPHFS_EPERM;
7c673cae
FG
5791 }
5792out:
5793 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5794 return r;
5795}
5796
f67539c2
TL
5797int Client::may_delete(const char *relpath, const UserPerm& perms) {
5798 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5799
5800 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5801 if (!mref_reader.is_state_satisfied())
5802 return -ENOTCONN;
5803
5804 filepath path(relpath);
5805 string name = path.last_dentry();
5806 path.pop_dentry();
5807 InodeRef dir;
5808
5809 std::scoped_lock lock(client_lock);
5810 int r = path_walk(path, &dir, perms);
5811 if (r < 0)
5812 return r;
5813 if (cct->_conf->client_permissions) {
5814 int r = may_delete(dir.get(), name.c_str(), perms);
5815 if (r < 0)
5816 return r;
5817 }
5818
5819 return 0;
5820}
5821
7c673cae
FG
// Check whether `perms` may create a hard link to `in`, mirroring the
// kernel's fs.protected_hardlinks policy: root and the owner always may;
// others only for regular, non-setuid, non-"setgid+group-exec" files they
// can both read and write. Returns 0 or -CEPHFS_EPERM/-CEPHFS_EACCES.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // owner and root are always allowed
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -CEPHFS_EPERM;
  // only regular files may be linked by non-owners
  if (!S_ISREG(in->mode))
    goto out;

  // never allow linking setuid binaries
  if (in->mode & S_ISUID)
    goto out;

  // setgid + group-exec means "setgid executable" — also refused
  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5849
// Refresh the attributes needed for a permission check: always the mode;
// additionally the xattrs (which hold the POSIX ACLs) when ACL support is
// enabled. `force` bypasses the cache when we have never seen the xattrs.
int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
{
  int mask = CEPH_STAT_CAP_MODE;
  bool force = false;
  if (acl_type != NO_ACL) {
    mask |= CEPH_STAT_CAP_XATTR;
    force = in->xattr_version == 0;
  }
  return _getattr(in, mask, perms, force);
}
5860
5861vinodeno_t Client::_get_vino(Inode *in)
5862{
5863 /* The caller must hold the client lock */
5864 return vinodeno_t(in->ino, in->snapid);
5865}
5866
7c673cae
FG
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 *
 * Resolution order: role ("fs:rank" / rank) first, then numeric GID, then
 * the '*' wildcard, and finally a daemon name. Requires `fsmap` to already
 * be populated (see fetch_fsmap()). Returns 0 or -CEPHFS_ENOENT.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
                     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
                 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
		     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
5944
5945
/**
 * Authenticate with mon and establish global ID
 *
 * Must be called with client_lock held; the lock is dropped around the
 * (blocking) monclient authentication call and reacquired afterwards.
 * On success, records the assigned global id in `whoami` and stamps the
 * messenger's entity name with it. Returns 0 or a negative error.
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // drop the client lock while blocking on the monitor
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5969
// Fetch the latest FSMap (or the trimmed "fsmap.user" variant when `user`
// is true) from the monitors, blocking until a map at least as new as the
// monitors' current version has arrived. Must be called with client_lock
// held; the lock is dropped around the version query. Returns 0 or error.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);  // mon asked us to retry

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until our cached map catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
6014
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Send an administrative command to the MDS daemon(s) matching `mds_spec`.
 * Laggy daemons are skipped; if all matching daemons are laggy the call
 * fails with -CEPHFS_ENOENT and `outs` explains why. `onfinish` fires once
 * every targeted daemon has replied (see handle_command_reply()).
 * Returns 0 on dispatch, or a negative CEPHFS error.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto& gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -CEPHFS_ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto& target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // command_table is protected by its own lock; drop client_lock to
    // respect the lock ordering (client_lock must not be held under it)
    cl.unlock();
    {
      std::scoped_lock cmd_lock(command_lock);
      // Generate MDSCommandOp state
      auto &op = command_table.start_command();

      op.on_finish = gather.new_sub();
      op.cmd = cmd;
      op.outbl = outbl;
      op.outs = outs;
      op.inbl = inbl;
      op.mds_gid = target_gid;
      op.con = conn;

      ldout(cct, 4) << __func__ << ": new command op to " << target_gid
        << " tid=" << op.tid << cmd << dendl;

      // Construct and send MCommand
      MessageRef m = op.get_message(monclient->get_fsid());
      conn->send_message2(std::move(m));
    }
    cl.lock();
  }
  gather.activate();

  return 0;
}
6107
11fdf7f2 6108void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
6109{
6110 ceph_tid_t const tid = m->get_tid();
6111
6112 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6113
f67539c2 6114 std::scoped_lock cmd_lock(command_lock);
7c673cae
FG
6115 if (!command_table.exists(tid)) {
6116 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
6117 return;
6118 }
6119
6120 auto &op = command_table.get_command(tid);
6121 if (op.outbl) {
11fdf7f2 6122 *op.outbl = m->get_data();
7c673cae
FG
6123 }
6124 if (op.outs) {
6125 *op.outs = m->rs;
6126 }
6127
6128 if (op.on_finish) {
6129 op.on_finish->complete(m->r);
6130 }
6131
6132 command_table.erase(tid);
7c673cae
FG
6133}
6134
6135// -------------------
6136// MOUNT
6137
11fdf7f2 6138int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 6139{
7c673cae
FG
6140 int r = authenticate();
6141 if (r < 0) {
6142 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6143 return r;
6144 }
6145
11fdf7f2
TL
6146 std::string resolved_fs_name;
6147 if (fs_name.empty()) {
9f95a23c
TL
6148 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6149 if (resolved_fs_name.empty())
6150 // Try the backwards compatibility fs name option
6151 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
6152 } else {
6153 resolved_fs_name = fs_name;
6154 }
6155
7c673cae 6156 std::string want = "mdsmap";
11fdf7f2 6157 if (!resolved_fs_name.empty()) {
7c673cae
FG
6158 r = fetch_fsmap(true);
6159 if (r < 0)
6160 return r;
11fdf7f2
TL
6161 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6162 if (fscid == FS_CLUSTER_ID_NONE) {
f67539c2 6163 return -CEPHFS_ENOENT;
11fdf7f2 6164 }
7c673cae
FG
6165
6166 std::ostringstream oss;
11fdf7f2 6167 oss << want << "." << fscid;
7c673cae
FG
6168 want = oss.str();
6169 }
6170 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6171
6172 monclient->sub_want(want, 0, 0);
6173 monclient->renew_subs();
6174
11fdf7f2
TL
6175 return 0;
6176}
6177
// Mount the filesystem: subscribe to the mdsmap, optionally wait for an
// available MDS cluster, then walk from `mount_root` up to the root doing
// GETATTRs so quota/permission info on all ancestors is cached. Idempotent:
// a second concurrent/later call returns 0 immediately.
// Returns 0, CEPH_FUSE_NO_MDS_UP, or a negative CEPHFS error.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // GETATTR the mount point and each ancestor up to the root; EACCES on an
  // ancestor is tolerated (quotas may not work) once the root inode exists
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6274
6275// UNMOUNT
6276
// Close every MDS session: rejected sessions are dropped outright, the rest
// get a close request and we wait (bounded by client_shutdown_timeout) for
// the MDS acks; stragglers are force-closed with -CEPHFS_ETIMEDOUT.
// Called with client_lock held (adopted into a unique_lock for the waits).
void Client::_close_sessions()
{
  // prune sessions the MDS already rejected — nothing to close there
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second.state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // adopt the already-held client_lock so mount_cond can wait on it;
    // l.release() below hands ownership back without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait forever
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	// NOTE(review): `auto session = ...` copies the MetaSession; looks
	// intentional since _closed_mds_session erases the map entry —
	// confirm before changing to a reference.
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(&session, -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6315
31f18b77
FG
6316void Client::flush_mdlog_sync()
6317{
6318 if (mds_requests.empty())
6319 return;
11fdf7f2
TL
6320 for (auto &p : mds_sessions) {
6321 flush_mdlog(&p.second);
31f18b77
FG
6322 }
6323}
6324
// Send a REQUEST_FLUSH_MDLOG session message to one MDS, feature-gated for
// backward compatibility.
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
6335
6336
11fdf7f2
TL
// Abort all in-flight MDS requests with `err` and force-close every MDS
// session. Used when tearing down an aborted/blocklisted mount.
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // wake the thread blocked in make_request()
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session, err);
  }
}
6364
// Tear down the mount. With abort=true (or when blocklisted) all pending
// work is discarded (sessions aborted, dirty data purged); otherwise dirty
// caps and buffered data are flushed to the cluster first. Blocks until all
// in-flight operations drain, the cache empties, and sessions close.
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
        // aborting: throw the cached data away rather than writing it back
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // discard dirty caps; the MDS will never hear about them
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // wait for every cached inode to be released; dump the cache every 5s
  // while stuck to aid debugging
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  mref_writer.update_state(CLIENT_UNMOUNTED);

  ldout(cct, 2) << "unmounted." << dendl;
}
6508
b32b8144
FG
// Public unmount entry point: orderly shutdown (flushes dirty state).
void Client::unmount()
{
  _unmount(false);
}
6513
// Public abort entry point: tear down without flushing (pending writes and
// dirty caps are discarded).
void Client::abort_conn()
{
  _unmount(true);
}
6518
7c673cae
FG
// Send any batched cap-release messages to MDSes that can accept them, and
// drop the pinned-caps accounting for everything sent (or deliberately
// dropped by the injected-failure option).
void Client::flush_cap_releases()
{
  uint64_t nr_caps = 0;

  // send any cap releases
  for (auto &p : mds_sessions) {
    auto &session = p.second;
    if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      nr_caps += session.release->caps.size();
      if (cct->_conf->client_inject_release_failure) {
        // test hook: pretend we sent it (message is discarded below)
        ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
        session.con->send_message2(std::move(session.release));
      }
      session.release.reset();
    }
  }

  if (nr_caps > 0) {
    dec_pinned_icaps(nr_caps);
  }
}
6542
f67539c2 6543void Client::renew_and_flush_cap_releases()
7c673cae 6544{
f67539c2
TL
6545 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6546
6547 if (!mount_aborted && mdsmap->get_epoch()) {
6548 // renew caps?
6549 utime_t el = ceph_clock_now() - last_cap_renew;
6550 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6551 renew_caps();
6552
6553 flush_cap_releases();
7c673cae 6554 }
f67539c2
TL
6555}
6556
// Periodic housekeeping, driven by the upkeep thread: time out a stalled
// mount, renew caps / flush releases, re-check delayed caps, send metrics,
// trim caches, and optionally auto-reconnect a blocklisted client.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  utime_t now = ceph_clock_now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // only the oldest request needs checking — requests are tid-ordered
    MetaRequest *req = mds_requests.begin()->second;

    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    // list is ordered by hold_caps_until, so stop at the first future entry
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // auto-reconnect a blocklisted client at most once per 30 minutes
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + 30 * 60 < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6612
f67539c2
TL
// Launch the upkeep thread that invokes tick() roughly every
// client_tick_interval seconds (stretchable via the debug delay option)
// until tick_thread_stopped is set and upkeep_cond is signalled.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // tolerate waking slightly early (90% of the interval counts as due)
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        // woken early — sleep only for the remainder
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6643
// Top-level metrics hook called from tick(); requires client_lock.
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6653
6654void Client::collect_and_send_global_metrics() {
6655 ldout(cct, 20) << __func__ << dendl;
6656 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6657
6658 if (!have_open_session((mds_rank_t)0)) {
6659 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6660 << dendl;
6661 return;
6662 }
6663 auto session = _get_or_open_mds_session((mds_rank_t)0);
6664 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6665 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6666 return;
6667 }
6668
6669 ClientMetricMessage metric;
6670 std::vector<ClientMetricMessage> message;
6671
6672 // read latency
6673 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6674 message.push_back(metric);
6675
6676 // write latency
6677 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6678 message.push_back(metric);
6679
6680 // metadata latency
6681 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6682 message.push_back(metric);
6683
6684 // cap hit ratio -- nr_caps is unused right now
6685 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6686 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6687 message.push_back(metric);
6688
6689 // dentry lease hit ratio
6690 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6691 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6692 message.push_back(metric);
6693
6694 // opened files
6695 {
6696 auto [opened_files, total_inodes] = get_opened_files_rates();
6697 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6698 }
6699 message.push_back(metric);
6700
6701 // pinned i_caps
6702 {
6703 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6704 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6705 }
6706 message.push_back(metric);
6707
6708 // opened inodes
6709 {
6710 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6711 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6712 }
6713 message.push_back(metric);
6714
6715 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6716}
6717
7c673cae
FG
6718void Client::renew_caps()
6719{
6720 ldout(cct, 10) << "renew_caps()" << dendl;
6721 last_cap_renew = ceph_clock_now();
6722
11fdf7f2
TL
6723 for (auto &p : mds_sessions) {
6724 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6725 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6726 renew_caps(&p.second);
7c673cae
FG
6727 }
6728}
6729
void Client::renew_caps(MetaSession *session)
{
  // Send a RENEWCAPS request to a single MDS.  The monotonically
  // increasing cap_renew_seq lets the reply be matched to this attempt.
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
6737
6738
6739// ===============================================================
6740// high level (POSIXy) interface
6741
6742int Client::_do_lookup(Inode *dir, const string& name, int mask,
6743 InodeRef *target, const UserPerm& perms)
6744{
6745 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6746 MetaRequest *req = new MetaRequest(op);
6747 filepath path;
6748 dir->make_nosnap_relative_path(path);
6749 path.push_dentry(name);
6750 req->set_filepath(path);
6751 req->set_inode(dir);
6752 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6753 mask |= DEBUG_GETATTR_CAPS;
6754 req->head.args.getattr.mask = mask;
6755
11fdf7f2 6756 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6757
6758 int r = make_request(req, perms, target);
11fdf7f2 6759 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6760 return r;
6761}
6762
f67539c2
TL
6763bool Client::_dentry_valid(const Dentry *dn)
6764{
6765 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6766
6767 // is dn lease valid?
6768 utime_t now = ceph_clock_now();
6769 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6770 mds_sessions.count(dn->lease_mds)) {
6771 MetaSession &s = mds_sessions.at(dn->lease_mds);
6772 if (s.cap_ttl > now && s.cap_gen == dn->lease_gen) {
6773 dlease_hit();
6774 return true;
6775 }
6776
6777 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6778 << " vs lease_gen " << dn->lease_gen << dendl;
6779 }
6780
6781 dlease_miss();
6782 return false;
6783}
6784
/*
 * Resolve one path component `dname` inside `dir`, filling *target on
 * success.  Tries hard to answer from the local dcache/caps before
 * falling back to an MDS round trip via _do_lookup().
 *
 * `mask` limits which caps are requested (shared caps only).  If
 * `alternate_name` is non-null it receives the dentry's alternate name
 * once a cached dentry is hit.  Returns 0 or a negative CEPHFS_* error.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  // set after the first MDS lookup so a failed re-check of the cache on
  // the second pass cannot loop forever
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no parent link cached: ask any MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	*target = std::move(tmptarget);
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// fall back to the directory itself on failure
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // the virtual ".snap" directory is synthesized locally
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
		   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    // only trust the cached dentry if its inode (when present) is covered
    // by the requested caps
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
	// touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	// make trim_caps() behave.
	dir->try_touch_cap(dn->lease_mds);
	goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	// a FILE_SHARED cap on the directory with a matching shared_gen
	// also validates the cached dentry
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// null dentry + complete directory => authoritative negative
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -CEPHFS_ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  if (did_lookup_request) {
    // second pass after a successful MDS lookup that still didn't yield a
    // usable cached dentry: *target was filled by _do_lookup(), accept it
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6905
6906int Client::get_or_create(Inode *dir, const char* name,
6907 Dentry **pdn, bool expect_null)
6908{
6909 // lookup
11fdf7f2 6910 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6911 dir->open_dir();
6912 if (dir->dir->dentries.count(name)) {
6913 Dentry *dn = dir->dir->dentries[name];
f67539c2
TL
6914 if (_dentry_valid(dn)) {
6915 if (expect_null)
6916 return -CEPHFS_EEXIST;
7c673cae
FG
6917 }
6918 *pdn = dn;
6919 } else {
6920 // otherwise link up a new one
6921 *pdn = link(dir->dir, name, NULL, NULL);
6922 }
6923
6924 // success
6925 return 0;
6926}
6927
f67539c2
TL
int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
{
  // Public wrapper around path_walk(): requires the client to be at least
  // MOUNTING and holds the client lock for the duration of the walk.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << __func__ << ": " << path << dendl;

  std::scoped_lock lock(client_lock);

  return path_walk(path, wdr, perms, followsym);
}
6940
7c673cae 6941int Client::path_walk(const filepath& origpath, InodeRef *end,
b3b6e05e 6942 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
f67539c2
TL
6943{
6944 walk_dentry_result wdr;
b3b6e05e 6945 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
f67539c2
TL
6946 *end = std::move(wdr.in);
6947 return rc;
6948}
6949
b3b6e05e
TL
/*
 * Walk `origpath` component by component, resolving symlinks along the
 * way, and fill *result with the final inode (and alternate name).
 *
 * The walk starts at root for absolute paths, at `dirinode` when given,
 * otherwise at the CWD.  `mask` adds extra requested caps for the final
 * component only.  Trailing symlinks are followed only when `followsym`;
 * symlinks in the middle of the path are always followed.  Caller must
 * hold client_lock.  Returns 0 or a negative CEPHFS_* error.
 */
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
                      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  // total symlinks expanded so far; bounded by MAXSYMLINKS to catch loops
  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute symlink target restarts the walk at the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7038
7039
7040// namespace ops
7041
f67539c2 7042int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7c673cae 7043{
f67539c2
TL
7044 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7045 if (!mref_reader.is_state_satisfied())
7046 return -CEPHFS_ENOTCONN;
7047
7c673cae
FG
7048 tout(cct) << "link" << std::endl;
7049 tout(cct) << relexisting << std::endl;
7050 tout(cct) << relpath << std::endl;
7051
7052 filepath existing(relexisting);
7053
7054 InodeRef in, dir;
f67539c2
TL
7055
7056 std::scoped_lock lock(client_lock);
7c673cae
FG
7057 int r = path_walk(existing, &in, perm, true);
7058 if (r < 0)
7059 return r;
7060 if (std::string(relpath) == "/") {
f67539c2 7061 r = -CEPHFS_EEXIST;
7c673cae
FG
7062 return r;
7063 }
7064 filepath path(relpath);
7065 string name = path.last_dentry();
7066 path.pop_dentry();
7067
7068 r = path_walk(path, &dir, perm, true);
7069 if (r < 0)
7070 return r;
7071 if (cct->_conf->client_permissions) {
7072 if (S_ISDIR(in->mode)) {
f67539c2 7073 r = -CEPHFS_EPERM;
7c673cae
FG
7074 return r;
7075 }
7076 r = may_hardlink(in.get(), perm);
7077 if (r < 0)
7078 return r;
7079 r = may_create(dir.get(), perm);
7080 if (r < 0)
7081 return r;
7082 }
f67539c2 7083 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7084 return r;
7085}
7086
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  // POSIX unlink(2): unlinkat() relative to the CWD with no flags.
  return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
}
7091
7092int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7c673cae 7093{
f67539c2 7094 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7095 if (!mref_reader.is_state_satisfied()) {
f67539c2 7096 return -CEPHFS_ENOTCONN;
b3b6e05e 7097 }
f67539c2 7098
11fdf7f2 7099 tout(cct) << __func__ << std::endl;
b3b6e05e 7100 tout(cct) << dirfd << std::endl;
7c673cae 7101 tout(cct) << relpath << std::endl;
b3b6e05e 7102 tout(cct) << flags << std::endl;
7c673cae 7103
b3b6e05e
TL
7104 if (std::string(relpath) == "/") {
7105 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7106 }
7c673cae
FG
7107
7108 filepath path(relpath);
7109 string name = path.last_dentry();
7110 path.pop_dentry();
7111 InodeRef dir;
f67539c2
TL
7112
7113 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7114
7115 InodeRef dirinode;
7116 int r = get_fd_inode(dirfd, &dirinode);
7117 if (r < 0) {
7118 return r;
7119 }
7120
7121 r = path_walk(path, &dir, perm, true, 0, dirinode);
7122 if (r < 0) {
7c673cae 7123 return r;
b3b6e05e 7124 }
7c673cae
FG
7125 if (cct->_conf->client_permissions) {
7126 r = may_delete(dir.get(), name.c_str(), perm);
b3b6e05e 7127 if (r < 0) {
7c673cae 7128 return r;
b3b6e05e 7129 }
7c673cae 7130 }
b3b6e05e
TL
7131 if (flags & AT_REMOVEDIR) {
7132 r = _rmdir(dir.get(), name.c_str(), perm);
7133 } else {
7134 r = _unlink(dir.get(), name.c_str(), perm);
7135 }
7136 return r;
7c673cae
FG
7137}
7138
f67539c2 7139int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7c673cae 7140{
f67539c2
TL
7141 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7142 if (!mref_reader.is_state_satisfied())
7143 return -CEPHFS_ENOTCONN;
7144
11fdf7f2 7145 tout(cct) << __func__ << std::endl;
7c673cae
FG
7146 tout(cct) << relfrom << std::endl;
7147 tout(cct) << relto << std::endl;
7148
7149 if (std::string(relfrom) == "/" || std::string(relto) == "/")
f67539c2 7150 return -CEPHFS_EBUSY;
7c673cae
FG
7151
7152 filepath from(relfrom);
7153 filepath to(relto);
7154 string fromname = from.last_dentry();
7155 from.pop_dentry();
7156 string toname = to.last_dentry();
7157 to.pop_dentry();
7158
7159 InodeRef fromdir, todir;
f67539c2
TL
7160
7161 std::scoped_lock lock(client_lock);
7c673cae
FG
7162 int r = path_walk(from, &fromdir, perm);
7163 if (r < 0)
7164 goto out;
7165 r = path_walk(to, &todir, perm);
7166 if (r < 0)
7167 goto out;
7168
7169 if (cct->_conf->client_permissions) {
7170 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7171 if (r < 0)
7172 return r;
7173 r = may_delete(todir.get(), toname.c_str(), perm);
f67539c2 7174 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
7175 return r;
7176 }
f67539c2 7177 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7178out:
7179 return r;
7180}
7181
7182// dirs
7183
f67539c2 7184int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
b3b6e05e
TL
7185{
7186 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7187}
7188
7189int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7190 std::string alternate_name)
7c673cae 7191{
f67539c2
TL
7192 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7193 if (!mref_reader.is_state_satisfied())
7194 return -CEPHFS_ENOTCONN;
7195
11fdf7f2 7196 tout(cct) << __func__ << std::endl;
b3b6e05e 7197 tout(cct) << dirfd << std::endl;
7c673cae
FG
7198 tout(cct) << relpath << std::endl;
7199 tout(cct) << mode << std::endl;
11fdf7f2 7200 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 7201
b3b6e05e 7202 if (std::string(relpath) == "/") {
f67539c2 7203 return -CEPHFS_EEXIST;
b3b6e05e 7204 }
7c673cae
FG
7205
7206 filepath path(relpath);
7207 string name = path.last_dentry();
7208 path.pop_dentry();
7209 InodeRef dir;
f67539c2
TL
7210
7211 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7212
7213 InodeRef dirinode;
7214 int r = get_fd_inode(dirfd, &dirinode);
7215 if (r < 0) {
7c673cae 7216 return r;
b3b6e05e
TL
7217 }
7218
7219 r = path_walk(path, &dir, perm, true, 0, dirinode);
7220 if (r < 0) {
7221 return r;
7222 }
7c673cae
FG
7223 if (cct->_conf->client_permissions) {
7224 r = may_create(dir.get(), perm);
b3b6e05e 7225 if (r < 0) {
7c673cae 7226 return r;
b3b6e05e 7227 }
7c673cae 7228 }
f67539c2 7229 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7c673cae
FG
7230}
7231
/*
 * mkdir -p: create every missing component of relpath (relative to the
 * CWD).  Phase 1 walks as far as the existing path goes; phase 2 creates
 * the remaining components one level at a time.  Returns 0 or a negative
 * CEPHFS_* error.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only ENOENT means "start creating here"; any other error (including 0,
  // i.e. the full path already exists) is returned as-is
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // EEXIST on a non-final component is fine (raced with another
    // creator): look the component up instead
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7288
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  // POSIX rmdir(2): unlinkat() with AT_REMOVEDIR relative to the CWD.
  return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
}
7293
7294int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
f67539c2
TL
7295{
7296 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7297 if (!mref_reader.is_state_satisfied())
7298 return -CEPHFS_ENOTCONN;
7299
11fdf7f2 7300 tout(cct) << __func__ << std::endl;
7c673cae
FG
7301 tout(cct) << relpath << std::endl;
7302 tout(cct) << mode << std::endl;
7303 tout(cct) << rdev << std::endl;
7304
7305 if (std::string(relpath) == "/")
f67539c2 7306 return -CEPHFS_EEXIST;
7c673cae
FG
7307
7308 filepath path(relpath);
7309 string name = path.last_dentry();
7310 path.pop_dentry();
7311 InodeRef dir;
f67539c2
TL
7312
7313 std::scoped_lock lock(client_lock);
7c673cae
FG
7314 int r = path_walk(path, &dir, perms);
7315 if (r < 0)
7316 return r;
7317 if (cct->_conf->client_permissions) {
7318 int r = may_create(dir.get(), perms);
7319 if (r < 0)
7320 return r;
7321 }
7322 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7323}
7324
7325// symlinks
7326
f67539c2 7327int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
b3b6e05e
TL
7328{
7329 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7330}
7331
7332int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7333 std::string alternate_name)
7c673cae 7334{
f67539c2 7335 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7336 if (!mref_reader.is_state_satisfied()) {
f67539c2 7337 return -CEPHFS_ENOTCONN;
b3b6e05e 7338 }
f67539c2 7339
11fdf7f2 7340 tout(cct) << __func__ << std::endl;
7c673cae 7341 tout(cct) << target << std::endl;
b3b6e05e 7342 tout(cct) << dirfd << std::endl;
7c673cae
FG
7343 tout(cct) << relpath << std::endl;
7344
b3b6e05e 7345 if (std::string(relpath) == "/") {
f67539c2 7346 return -CEPHFS_EEXIST;
b3b6e05e 7347 }
7c673cae
FG
7348
7349 filepath path(relpath);
7350 string name = path.last_dentry();
7351 path.pop_dentry();
7352 InodeRef dir;
f67539c2
TL
7353
7354 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7355
7356 InodeRef dirinode;
7357 int r = get_fd_inode(dirfd, &dirinode);
7358 if (r < 0) {
7c673cae 7359 return r;
b3b6e05e
TL
7360 }
7361 r = path_walk(path, &dir, perms, true, 0, dirinode);
7362 if (r < 0) {
7363 return r;
7364 }
7c673cae
FG
7365 if (cct->_conf->client_permissions) {
7366 int r = may_create(dir.get(), perms);
b3b6e05e 7367 if (r < 0) {
7c673cae 7368 return r;
b3b6e05e 7369 }
7c673cae 7370 }
f67539c2 7371 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7c673cae
FG
7372}
7373
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  // POSIX readlink(2): readlinkat() relative to the CWD.
  return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
}
7378
7379int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
f67539c2 7380 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7381 if (!mref_reader.is_state_satisfied()) {
f67539c2 7382 return -CEPHFS_ENOTCONN;
b3b6e05e 7383 }
f67539c2 7384
11fdf7f2 7385 tout(cct) << __func__ << std::endl;
b3b6e05e 7386 tout(cct) << dirfd << std::endl;
7c673cae
FG
7387 tout(cct) << relpath << std::endl;
7388
b3b6e05e 7389 InodeRef dirinode;
f67539c2 7390 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7391 int r = get_fd_inode(dirfd, &dirinode);
7392 if (r < 0) {
7c673cae 7393 return r;
b3b6e05e
TL
7394 }
7395
7396 InodeRef in;
7397 filepath path(relpath);
7398 r = path_walk(path, &in, perms, false, 0, dirinode);
7399 if (r < 0) {
7400 return r;
7401 }
7c673cae
FG
7402
7403 return _readlink(in.get(), buf, size);
7404}
7405
7406int Client::_readlink(Inode *in, char *buf, size_t size)
7407{
7408 if (!in->is_symlink())
f67539c2 7409 return -CEPHFS_EINVAL;
7c673cae
FG
7410
7411 // copy into buf (at most size bytes)
7412 int r = in->symlink.length();
7413 if (r > (int)size)
7414 r = size;
7415 memcpy(buf, in->symlink.c_str(), r);
7416 return r;
7417}
7418
7419
7420// inode stuff
7421
7422int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7423{
94b18763 7424 bool yes = in->caps_issued_mask(mask, true);
7c673cae 7425
11fdf7f2 7426 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
7427 if (yes && !force)
7428 return 0;
7429
7430 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7431 filepath path;
7432 in->make_nosnap_relative_path(path);
7433 req->set_filepath(path);
7434 req->set_inode(in);
7435 req->head.args.getattr.mask = mask;
7436
7437 int res = make_request(req, perms);
11fdf7f2 7438 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
7439 return res;
7440}
7441
/*
 * Apply the attribute changes selected by `mask` (CEPH_SETATTR_* bits)
 * from *stx to the inode.  Whatever can be satisfied locally under
 * exclusive caps is applied to the cached inode and the corresponding
 * caps are dirtied; any remaining bits are sent to the MDS as a SETATTR
 * request (the force_request path).  Returns 0 or a negative CEPHFS_*
 * error; *inp (may be null) receives the resulting inode from the MDS.
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are immutable
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // growing the file must not blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  // make the change locally?
  // Local (cap-dirtying) updates record a single dirtier identity; if a
  // different uid/gid already dirtied this inode we must go synchronous.
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      // no exclusive cap to piggyback on: fall through to the MDS
      mask |= CEPH_SETATTR_CTIME;
  }

  // With Ax (auth-exclusive) caps, ownership/mode/btime changes can be
  // applied purely locally; each handled bit is cleared from `mask`.
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits (07777) come from the caller
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // With Fx (file-exclusive) caps, mtime/atime can be applied locally too.
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything handled locally
    in->change_attr++;
    return 0;
  }

force_request:
  // Whatever is left in `mask` goes to the MDS as a SETATTR request.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7632
7633/* Note that we only care about attrs that setattr cares about */
7634void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7635{
7636 stx->stx_size = st->st_size;
7637 stx->stx_mode = st->st_mode;
7638 stx->stx_uid = st->st_uid;
7639 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7640#ifdef __APPLE__
7641 stx->stx_mtime = st->st_mtimespec;
7642 stx->stx_atime = st->st_atimespec;
f67539c2
TL
7643#elif __WIN32
7644 stx->stx_mtime.tv_sec = st->st_mtime;
7645 stx->stx_atime.tv_sec = st->st_atime;
11fdf7f2 7646#else
7c673cae
FG
7647 stx->stx_mtime = st->st_mtim;
7648 stx->stx_atime = st->st_atim;
11fdf7f2 7649#endif
7c673cae
FG
7650}
7651
7652int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7653 const UserPerm& perms, InodeRef *inp)
7654{
7655 int ret = _do_setattr(in, stx, mask, perms, inp);
7656 if (ret < 0)
7657 return ret;
7658 if (mask & CEPH_SETATTR_MODE)
7659 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7660 return ret;
7661}
7662
7663int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7664 const UserPerm& perms)
7665{
7666 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7667 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7668 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7669 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7670 if (cct->_conf->client_permissions) {
7671 int r = may_setattr(in.get(), stx, mask, perms);
7672 if (r < 0)
7673 return r;
7674 }
7675 return __setattrx(in.get(), stx, mask, perms);
7676}
7677
7678int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7679 const UserPerm& perms)
7680{
7681 struct ceph_statx stx;
7682
7683 stat_to_statx(attr, &stx);
7684 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7685
7686 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7687 mask &= ~CEPH_SETATTR_UID;
7688 }
7689 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7690 mask &= ~CEPH_SETATTR_GID;
7691 }
7692
7c673cae
FG
7693 return _setattrx(in, &stx, mask, perms);
7694}
7695
7696int Client::setattr(const char *relpath, struct stat *attr, int mask,
7697 const UserPerm& perms)
7698{
f67539c2
TL
7699 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7700 if (!mref_reader.is_state_satisfied())
7701 return -CEPHFS_ENOTCONN;
7702
11fdf7f2 7703 tout(cct) << __func__ << std::endl;
7c673cae
FG
7704 tout(cct) << relpath << std::endl;
7705 tout(cct) << mask << std::endl;
7706
7707 filepath path(relpath);
7708 InodeRef in;
f67539c2
TL
7709
7710 std::scoped_lock lock(client_lock);
7c673cae
FG
7711 int r = path_walk(path, &in, perms);
7712 if (r < 0)
7713 return r;
7714 return _setattr(in, attr, mask, perms);
7715}
7716
7717int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7718 const UserPerm& perms, int flags)
7719{
f67539c2
TL
7720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7721 if (!mref_reader.is_state_satisfied())
7722 return -CEPHFS_ENOTCONN;
7723
11fdf7f2 7724 tout(cct) << __func__ << std::endl;
7c673cae
FG
7725 tout(cct) << relpath << std::endl;
7726 tout(cct) << mask << std::endl;
7727
7728 filepath path(relpath);
7729 InodeRef in;
f67539c2
TL
7730
7731 std::scoped_lock lock(client_lock);
7c673cae
FG
7732 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7733 if (r < 0)
7734 return r;
7735 return _setattrx(in, stx, mask, perms);
7736}
7737
7738int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7739{
f67539c2
TL
7740 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7741 if (!mref_reader.is_state_satisfied())
7742 return -CEPHFS_ENOTCONN;
7743
11fdf7f2 7744 tout(cct) << __func__ << std::endl;
7c673cae
FG
7745 tout(cct) << fd << std::endl;
7746 tout(cct) << mask << std::endl;
7747
f67539c2 7748 std::scoped_lock lock(client_lock);
7c673cae
FG
7749 Fh *f = get_filehandle(fd);
7750 if (!f)
f67539c2 7751 return -CEPHFS_EBADF;
7c673cae
FG
7752#if defined(__linux__) && defined(O_PATH)
7753 if (f->flags & O_PATH)
f67539c2 7754 return -CEPHFS_EBADF;
7c673cae
FG
7755#endif
7756 return _setattr(f->inode, attr, mask, perms);
7757}
7758
7759int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7760{
f67539c2
TL
7761 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7762 if (!mref_reader.is_state_satisfied())
7763 return -CEPHFS_ENOTCONN;
7764
11fdf7f2 7765 tout(cct) << __func__ << std::endl;
7c673cae
FG
7766 tout(cct) << fd << std::endl;
7767 tout(cct) << mask << std::endl;
7768
f67539c2 7769 std::scoped_lock lock(client_lock);
7c673cae
FG
7770 Fh *f = get_filehandle(fd);
7771 if (!f)
f67539c2 7772 return -CEPHFS_EBADF;
7c673cae
FG
7773#if defined(__linux__) && defined(O_PATH)
7774 if (f->flags & O_PATH)
f67539c2 7775 return -CEPHFS_EBADF;
7c673cae
FG
7776#endif
7777 return _setattrx(f->inode, stx, mask, perms);
7778}
7779
7780int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7781 frag_info_t *dirstat, int mask)
7782{
f67539c2
TL
7783 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7784 if (!mref_reader.is_state_satisfied())
7785 return -CEPHFS_ENOTCONN;
7786
11fdf7f2 7787 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7788 tout(cct) << "stat" << std::endl;
7789 tout(cct) << relpath << std::endl;
181888fb 7790
7c673cae
FG
7791 filepath path(relpath);
7792 InodeRef in;
f67539c2
TL
7793
7794 std::scoped_lock lock(client_lock);
7c673cae
FG
7795 int r = path_walk(path, &in, perms, true, mask);
7796 if (r < 0)
7797 return r;
7798 r = _getattr(in, mask, perms);
7799 if (r < 0) {
11fdf7f2 7800 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7801 return r;
7802 }
7803 fill_stat(in, stbuf, dirstat);
11fdf7f2 7804 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7805 return r;
7806}
7807
7808unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7809{
7810 unsigned mask = 0;
7811
7812 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7813 if (flags & AT_NO_ATTR_SYNC)
7814 goto out;
7815
7816 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7817 mask |= CEPH_CAP_PIN;
7818 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7819 mask |= CEPH_CAP_AUTH_SHARED;
7820 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7821 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 7822 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
7823 mask |= CEPH_CAP_FILE_SHARED;
7824 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7825 mask |= CEPH_CAP_XATTR_SHARED;
7826out:
7827 return mask;
7828}
7829
7830int Client::statx(const char *relpath, struct ceph_statx *stx,
7831 const UserPerm& perms,
7832 unsigned int want, unsigned int flags)
7833{
b3b6e05e 7834 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
7c673cae
FG
7835}
7836
7837int Client::lstat(const char *relpath, struct stat *stbuf,
7838 const UserPerm& perms, frag_info_t *dirstat, int mask)
7839{
f67539c2
TL
7840 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7841 if (!mref_reader.is_state_satisfied())
7842 return -CEPHFS_ENOTCONN;
7843
11fdf7f2 7844 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
11fdf7f2 7845 tout(cct) << __func__ << std::endl;
7c673cae 7846 tout(cct) << relpath << std::endl;
181888fb 7847
7c673cae
FG
7848 filepath path(relpath);
7849 InodeRef in;
f67539c2
TL
7850
7851 std::scoped_lock lock(client_lock);
7c673cae
FG
7852 // don't follow symlinks
7853 int r = path_walk(path, &in, perms, false, mask);
7854 if (r < 0)
7855 return r;
7856 r = _getattr(in, mask, perms);
7857 if (r < 0) {
11fdf7f2 7858 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7859 return r;
7860 }
7861 fill_stat(in, stbuf, dirstat);
11fdf7f2 7862 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7863 return r;
7864}
7865
7866int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7867{
11fdf7f2 7868 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7869 << " mode 0" << oct << in->mode << dec
7870 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7871 memset(st, 0, sizeof(struct stat));
7872 if (use_faked_inos())
7873 st->st_ino = in->faked_ino;
7874 else
7875 st->st_ino = in->ino;
7876 st->st_dev = in->snapid;
7877 st->st_mode = in->mode;
7878 st->st_rdev = in->rdev;
28e407b8
AA
7879 if (in->is_dir()) {
7880 switch (in->nlink) {
7881 case 0:
7882 st->st_nlink = 0; /* dir is unlinked */
7883 break;
7884 case 1:
7885 st->st_nlink = 1 /* parent dentry */
7886 + 1 /* <dir>/. */
7887 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7888 break;
7889 default:
7890 ceph_abort();
7891 }
7892 } else {
7893 st->st_nlink = in->nlink;
7894 }
7c673cae
FG
7895 st->st_uid = in->uid;
7896 st->st_gid = in->gid;
7897 if (in->ctime > in->mtime) {
7898 stat_set_ctime_sec(st, in->ctime.sec());
7899 stat_set_ctime_nsec(st, in->ctime.nsec());
7900 } else {
7901 stat_set_ctime_sec(st, in->mtime.sec());
7902 stat_set_ctime_nsec(st, in->mtime.nsec());
7903 }
7904 stat_set_atime_sec(st, in->atime.sec());
7905 stat_set_atime_nsec(st, in->atime.nsec());
7906 stat_set_mtime_sec(st, in->mtime.sec());
7907 stat_set_mtime_nsec(st, in->mtime.nsec());
7908 if (in->is_dir()) {
7909 if (cct->_conf->client_dirsize_rbytes)
7910 st->st_size = in->rstat.rbytes;
7911 else
7912 st->st_size = in->dirstat.size();
f67539c2
TL
7913// The Windows "stat" structure provides just a subset of the fields that are
7914// available on Linux.
7915#ifndef _WIN32
7c673cae 7916 st->st_blocks = 1;
f67539c2 7917#endif
7c673cae
FG
7918 } else {
7919 st->st_size = in->size;
f67539c2 7920#ifndef _WIN32
7c673cae 7921 st->st_blocks = (in->size + 511) >> 9;
f67539c2 7922#endif
7c673cae 7923 }
f67539c2 7924#ifndef _WIN32
11fdf7f2 7925 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
f67539c2 7926#endif
7c673cae
FG
7927
7928 if (dirstat)
7929 *dirstat = in->dirstat;
7930 if (rstat)
7931 *rstat = in->rstat;
7932
7933 return in->caps_issued();
7934}
7935
7936void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7937{
11fdf7f2 7938 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7939 << " mode 0" << oct << in->mode << dec
7940 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7941 memset(stx, 0, sizeof(struct ceph_statx));
7942
7943 /*
7944 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7945 * so that all bits are set.
7946 */
7947 if (!mask)
7948 mask = ~0;
7949
7950 /* These are always considered to be available */
7951 stx->stx_dev = in->snapid;
11fdf7f2 7952 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7c673cae
FG
7953
7954 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7955 stx->stx_mode = S_IFMT & in->mode;
7956 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7957 stx->stx_rdev = in->rdev;
7958 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7959
7960 if (mask & CEPH_CAP_AUTH_SHARED) {
7961 stx->stx_uid = in->uid;
7962 stx->stx_gid = in->gid;
7963 stx->stx_mode = in->mode;
7964 in->btime.to_timespec(&stx->stx_btime);
7965 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7966 }
7967
7968 if (mask & CEPH_CAP_LINK_SHARED) {
28e407b8
AA
7969 if (in->is_dir()) {
7970 switch (in->nlink) {
7971 case 0:
7972 stx->stx_nlink = 0; /* dir is unlinked */
7973 break;
7974 case 1:
7975 stx->stx_nlink = 1 /* parent dentry */
7976 + 1 /* <dir>/. */
7977 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7978 break;
7979 default:
7980 ceph_abort();
7981 }
7982 } else {
7983 stx->stx_nlink = in->nlink;
7984 }
7c673cae
FG
7985 stx->stx_mask |= CEPH_STATX_NLINK;
7986 }
7987
7988 if (mask & CEPH_CAP_FILE_SHARED) {
7989
7990 in->atime.to_timespec(&stx->stx_atime);
7991 in->mtime.to_timespec(&stx->stx_mtime);
7992
7993 if (in->is_dir()) {
7994 if (cct->_conf->client_dirsize_rbytes)
7995 stx->stx_size = in->rstat.rbytes;
7996 else
7997 stx->stx_size = in->dirstat.size();
7998 stx->stx_blocks = 1;
7999 } else {
8000 stx->stx_size = in->size;
8001 stx->stx_blocks = (in->size + 511) >> 9;
8002 }
8003 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
8004 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
8005 }
8006
8007 /* Change time and change_attr both require all shared caps to view */
8008 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
8009 stx->stx_version = in->change_attr;
8010 if (in->ctime > in->mtime)
8011 in->ctime.to_timespec(&stx->stx_ctime);
8012 else
8013 in->mtime.to_timespec(&stx->stx_ctime);
8014 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
8015 }
8016
8017}
8018
8019void Client::touch_dn(Dentry *dn)
8020{
8021 lru.lru_touch(dn);
8022}
8023
8024int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8025{
b3b6e05e 8026 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
7c673cae
FG
8027}
8028
8029int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8030{
f67539c2
TL
8031 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8032 if (!mref_reader.is_state_satisfied())
8033 return -CEPHFS_ENOTCONN;
8034
11fdf7f2 8035 tout(cct) << __func__ << std::endl;
7c673cae
FG
8036 tout(cct) << fd << std::endl;
8037 tout(cct) << mode << std::endl;
181888fb 8038
f67539c2 8039 std::scoped_lock lock(client_lock);
7c673cae
FG
8040 Fh *f = get_filehandle(fd);
8041 if (!f)
f67539c2 8042 return -CEPHFS_EBADF;
7c673cae
FG
8043#if defined(__linux__) && defined(O_PATH)
8044 if (f->flags & O_PATH)
f67539c2 8045 return -CEPHFS_EBADF;
7c673cae
FG
8046#endif
8047 struct stat attr;
8048 attr.st_mode = mode;
8049 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8050}
8051
b3b6e05e
TL
8052int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8053 const UserPerm& perms) {
f67539c2 8054 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 8055 if (!mref_reader.is_state_satisfied()) {
f67539c2 8056 return -CEPHFS_ENOTCONN;
b3b6e05e 8057 }
f67539c2 8058
11fdf7f2 8059 tout(cct) << __func__ << std::endl;
b3b6e05e 8060 tout(cct) << dirfd << std::endl;
7c673cae
FG
8061 tout(cct) << relpath << std::endl;
8062 tout(cct) << mode << std::endl;
b3b6e05e 8063 tout(cct) << flags << std::endl;
181888fb 8064
7c673cae
FG
8065 filepath path(relpath);
8066 InodeRef in;
b3b6e05e 8067 InodeRef dirinode;
f67539c2
TL
8068
8069 std::scoped_lock lock(client_lock);
b3b6e05e
TL
8070 int r = get_fd_inode(dirfd, &dirinode);
8071 if (r < 0) {
8072 return r;
8073 }
8074
8075 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8076 if (r < 0) {
7c673cae 8077 return r;
b3b6e05e 8078 }
7c673cae
FG
8079 struct stat attr;
8080 attr.st_mode = mode;
8081 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8082}
8083
b3b6e05e
TL
8084int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8085{
8086 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8087}
8088
7c673cae
FG
8089int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8090 const UserPerm& perms)
8091{
b3b6e05e 8092 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
7c673cae
FG
8093}
8094
8095int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8096{
f67539c2
TL
8097 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8098 if (!mref_reader.is_state_satisfied())
8099 return -CEPHFS_ENOTCONN;
8100
11fdf7f2 8101 tout(cct) << __func__ << std::endl;
7c673cae
FG
8102 tout(cct) << fd << std::endl;
8103 tout(cct) << new_uid << std::endl;
8104 tout(cct) << new_gid << std::endl;
181888fb 8105
f67539c2 8106 std::scoped_lock lock(client_lock);
7c673cae
FG
8107 Fh *f = get_filehandle(fd);
8108 if (!f)
f67539c2 8109 return -CEPHFS_EBADF;
7c673cae
FG
8110#if defined(__linux__) && defined(O_PATH)
8111 if (f->flags & O_PATH)
f67539c2 8112 return -CEPHFS_EBADF;
7c673cae
FG
8113#endif
8114 struct stat attr;
8115 attr.st_uid = new_uid;
8116 attr.st_gid = new_gid;
8117 int mask = 0;
8118 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8119 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8120 return _setattr(f->inode, &attr, mask, perms);
8121}
8122
8123int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8124 const UserPerm& perms)
8125{
b3b6e05e
TL
8126 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8127}
8128
8129int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8130 int flags, const UserPerm& perms) {
f67539c2 8131 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 8132 if (!mref_reader.is_state_satisfied()) {
f67539c2 8133 return -CEPHFS_ENOTCONN;
b3b6e05e 8134 }
f67539c2 8135
11fdf7f2 8136 tout(cct) << __func__ << std::endl;
b3b6e05e 8137 tout(cct) << dirfd << std::endl;
7c673cae
FG
8138 tout(cct) << relpath << std::endl;
8139 tout(cct) << new_uid << std::endl;
8140 tout(cct) << new_gid << std::endl;
b3b6e05e 8141 tout(cct) << flags << std::endl;
181888fb 8142
7c673cae
FG
8143 filepath path(relpath);
8144 InodeRef in;
b3b6e05e 8145 InodeRef dirinode;
f67539c2
TL
8146
8147 std::scoped_lock lock(client_lock);
b3b6e05e
TL
8148 int r = get_fd_inode(dirfd, &dirinode);
8149 if (r < 0) {
7c673cae 8150 return r;
b3b6e05e
TL
8151 }
8152
8153 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8154 if (r < 0) {
8155 return r;
8156 }
7c673cae
FG
8157 struct stat attr;
8158 attr.st_uid = new_uid;
8159 attr.st_gid = new_gid;
b3b6e05e 8160 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
8161}
8162
11fdf7f2
TL
8163static void attr_set_atime_and_mtime(struct stat *attr,
8164 const utime_t &atime,
8165 const utime_t &mtime)
8166{
8167 stat_set_atime_sec(attr, atime.tv.tv_sec);
8168 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8169 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8170 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8171}
8172
8173// for [l]utime() invoke the timeval variant as the timespec
8174// variant are not yet implemented. for futime[s](), invoke
8175// the timespec variant.
7c673cae
FG
8176int Client::utime(const char *relpath, struct utimbuf *buf,
8177 const UserPerm& perms)
8178{
11fdf7f2
TL
8179 struct timeval tv[2];
8180 tv[0].tv_sec = buf->actime;
8181 tv[0].tv_usec = 0;
8182 tv[1].tv_sec = buf->modtime;
8183 tv[1].tv_usec = 0;
8184
8185 return utimes(relpath, tv, perms);
8186}
8187
8188int Client::lutime(const char *relpath, struct utimbuf *buf,
8189 const UserPerm& perms)
8190{
8191 struct timeval tv[2];
8192 tv[0].tv_sec = buf->actime;
8193 tv[0].tv_usec = 0;
8194 tv[1].tv_sec = buf->modtime;
8195 tv[1].tv_usec = 0;
8196
8197 return lutimes(relpath, tv, perms);
8198}
8199
8200int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8201{
8202 struct timespec ts[2];
8203 ts[0].tv_sec = buf->actime;
8204 ts[0].tv_nsec = 0;
8205 ts[1].tv_sec = buf->modtime;
8206 ts[1].tv_nsec = 0;
8207
8208 return futimens(fd, ts, perms);
8209}
8210
8211int Client::utimes(const char *relpath, struct timeval times[2],
8212 const UserPerm& perms)
8213{
f67539c2
TL
8214 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8215 if (!mref_reader.is_state_satisfied())
8216 return -CEPHFS_ENOTCONN;
8217
11fdf7f2 8218 tout(cct) << __func__ << std::endl;
7c673cae 8219 tout(cct) << relpath << std::endl;
11fdf7f2
TL
8220 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8221 << std::endl;
8222 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8223 << std::endl;
181888fb 8224
7c673cae
FG
8225 filepath path(relpath);
8226 InodeRef in;
f67539c2
TL
8227
8228 std::scoped_lock lock(client_lock);
7c673cae
FG
8229 int r = path_walk(path, &in, perms);
8230 if (r < 0)
8231 return r;
8232 struct stat attr;
11fdf7f2
TL
8233 utime_t atime(times[0]);
8234 utime_t mtime(times[1]);
8235
8236 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
8237 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8238}
8239
11fdf7f2
TL
8240int Client::lutimes(const char *relpath, struct timeval times[2],
8241 const UserPerm& perms)
7c673cae 8242{
f67539c2
TL
8243 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8244 if (!mref_reader.is_state_satisfied())
8245 return -CEPHFS_ENOTCONN;
8246
11fdf7f2 8247 tout(cct) << __func__ << std::endl;
7c673cae 8248 tout(cct) << relpath << std::endl;
11fdf7f2
TL
8249 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8250 << std::endl;
8251 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8252 << std::endl;
181888fb 8253
7c673cae
FG
8254 filepath path(relpath);
8255 InodeRef in;
f67539c2
TL
8256
8257 std::scoped_lock lock(client_lock);
7c673cae
FG
8258 int r = path_walk(path, &in, perms, false);
8259 if (r < 0)
8260 return r;
8261 struct stat attr;
11fdf7f2
TL
8262 utime_t atime(times[0]);
8263 utime_t mtime(times[1]);
8264
8265 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
8266 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8267}
8268
11fdf7f2
TL
8269int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8270{
8271 struct timespec ts[2];
8272 ts[0].tv_sec = times[0].tv_sec;
8273 ts[0].tv_nsec = times[0].tv_usec * 1000;
8274 ts[1].tv_sec = times[1].tv_sec;
8275 ts[1].tv_nsec = times[1].tv_usec * 1000;
8276
8277 return futimens(fd, ts, perms);
8278}
8279
8280int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8281{
f67539c2
TL
8282 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8283 if (!mref_reader.is_state_satisfied())
8284 return -CEPHFS_ENOTCONN;
8285
11fdf7f2
TL
8286 tout(cct) << __func__ << std::endl;
8287 tout(cct) << fd << std::endl;
8288 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8289 << std::endl;
8290 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8291 << std::endl;
8292
f67539c2 8293 std::scoped_lock lock(client_lock);
11fdf7f2
TL
8294 Fh *f = get_filehandle(fd);
8295 if (!f)
f67539c2 8296 return -CEPHFS_EBADF;
11fdf7f2
TL
8297#if defined(__linux__) && defined(O_PATH)
8298 if (f->flags & O_PATH)
f67539c2 8299 return -CEPHFS_EBADF;
11fdf7f2
TL
8300#endif
8301 struct stat attr;
8302 utime_t atime(times[0]);
8303 utime_t mtime(times[1]);
8304
8305 attr_set_atime_and_mtime(&attr, atime, mtime);
8306 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8307}
8308
b3b6e05e
TL
8309int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
8310 const UserPerm& perms) {
8311 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8312 if (!mref_reader.is_state_satisfied()) {
8313 return -CEPHFS_ENOTCONN;
8314 }
8315
8316 tout(cct) << __func__ << std::endl;
8317 tout(cct) << dirfd << std::endl;
8318 tout(cct) << relpath << std::endl;
8319 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8320 << std::endl;
8321 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8322 << std::endl;
8323 tout(cct) << flags << std::endl;
8324
8325 filepath path(relpath);
8326 InodeRef in;
8327 InodeRef dirinode;
8328
8329 std::scoped_lock lock(client_lock);
8330 int r = get_fd_inode(dirfd, &dirinode);
8331 if (r < 0) {
8332 return r;
8333 }
8334
8335#if defined(__linux__) && defined(O_PATH)
8336 if (flags & O_PATH) {
8337 return -CEPHFS_EBADF;
8338 }
8339#endif
8340
8341 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8342 if (r < 0) {
8343 return r;
8344 }
8345 struct stat attr;
8346 utime_t atime(times[0]);
8347 utime_t mtime(times[1]);
8348
8349 attr_set_atime_and_mtime(&attr, atime, mtime);
8350 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8351}
8352
7c673cae
FG
8353int Client::flock(int fd, int operation, uint64_t owner)
8354{
f67539c2
TL
8355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8356 if (!mref_reader.is_state_satisfied())
8357 return -CEPHFS_ENOTCONN;
8358
11fdf7f2 8359 tout(cct) << __func__ << std::endl;
7c673cae
FG
8360 tout(cct) << fd << std::endl;
8361 tout(cct) << operation << std::endl;
8362 tout(cct) << owner << std::endl;
181888fb 8363
f67539c2 8364 std::scoped_lock lock(client_lock);
7c673cae
FG
8365 Fh *f = get_filehandle(fd);
8366 if (!f)
f67539c2 8367 return -CEPHFS_EBADF;
7c673cae
FG
8368
8369 return _flock(f, operation, owner);
8370}
8371
8372int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8373{
f67539c2
TL
8374 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8375 if (!mref_reader.is_state_satisfied())
8376 return -CEPHFS_ENOTCONN;
8377
11fdf7f2 8378 tout(cct) << __func__ << std::endl;
7c673cae 8379 tout(cct) << relpath << std::endl;
181888fb 8380
7c673cae
FG
8381 filepath path(relpath);
8382 InodeRef in;
f67539c2
TL
8383
8384 std::scoped_lock lock(client_lock);
7c673cae
FG
8385 int r = path_walk(path, &in, perms, true);
8386 if (r < 0)
8387 return r;
8388 if (cct->_conf->client_permissions) {
8389 int r = may_open(in.get(), O_RDONLY, perms);
8390 if (r < 0)
8391 return r;
8392 }
8393 r = _opendir(in.get(), dirpp, perms);
8394 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
f67539c2
TL
8395 if (r != -CEPHFS_ENOTDIR)
8396 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
8397 return r;
8398}
8399
b3b6e05e
TL
8400int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8401 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8402 if (!mref_reader.is_state_satisfied()) {
8403 return -CEPHFS_ENOTCONN;
8404 }
8405
8406 tout(cct) << __func__ << std::endl;
8407 tout(cct) << dirfd << std::endl;
8408
8409 InodeRef dirinode;
8410 std::scoped_lock locker(client_lock);
8411 int r = get_fd_inode(dirfd, &dirinode);
8412 if (r < 0) {
8413 return r;
8414 }
8415
8416 if (cct->_conf->client_permissions) {
8417 r = may_open(dirinode.get(), O_RDONLY, perms);
8418 if (r < 0) {
8419 return r;
8420 }
8421 }
8422 r = _opendir(dirinode.get(), dirpp, perms);
8423 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8424 if (r != -CEPHFS_ENOTDIR) {
8425 tout(cct) << (uintptr_t)*dirpp << std::endl;
8426 }
8427 return r;
8428}
8429
7c673cae
FG
8430int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8431{
8432 if (!in->is_dir())
f67539c2 8433 return -CEPHFS_ENOTDIR;
7c673cae
FG
8434 *dirpp = new dir_result_t(in, perms);
8435 opened_dirs.insert(*dirpp);
11fdf7f2 8436 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
8437 return 0;
8438}
8439
8440
8441int Client::closedir(dir_result_t *dir)
8442{
11fdf7f2 8443 tout(cct) << __func__ << std::endl;
f67539c2 8444 tout(cct) << (uintptr_t)dir << std::endl;
7c673cae 8445
11fdf7f2 8446 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
f67539c2 8447 std::scoped_lock lock(client_lock);
7c673cae
FG
8448 _closedir(dir);
8449 return 0;
8450}
8451
8452void Client::_closedir(dir_result_t *dirp)
8453{
11fdf7f2 8454 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
f67539c2 8455
7c673cae 8456 if (dirp->inode) {
11fdf7f2 8457 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7c673cae
FG
8458 dirp->inode.reset();
8459 }
8460 _readdir_drop_dirp_buffer(dirp);
8461 opened_dirs.erase(dirp);
8462 delete dirp;
8463}
8464
8465void Client::rewinddir(dir_result_t *dirp)
8466{
11fdf7f2 8467 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb 8468
f67539c2
TL
8469 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8470 if (!mref_reader.is_state_satisfied())
181888fb
FG
8471 return;
8472
f67539c2 8473 std::scoped_lock lock(client_lock);
7c673cae
FG
8474 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8475 _readdir_drop_dirp_buffer(d);
8476 d->reset();
8477}
8478
8479loff_t Client::telldir(dir_result_t *dirp)
8480{
8481 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 8482 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
8483 return d->offset;
8484}
8485
8486void Client::seekdir(dir_result_t *dirp, loff_t offset)
8487{
11fdf7f2 8488 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7c673cae 8489
f67539c2
TL
8490 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8491 if (!mref_reader.is_state_satisfied())
181888fb
FG
8492 return;
8493
f67539c2
TL
8494 std::scoped_lock lock(client_lock);
8495
7c673cae
FG
8496 if (offset == dirp->offset)
8497 return;
8498
8499 if (offset > dirp->offset)
8500 dirp->release_count = 0; // bump if we do a forward seek
8501 else
8502 dirp->ordered_count = 0; // disable filling readdir cache
8503
8504 if (dirp->hash_order()) {
8505 if (dirp->offset > offset) {
8506 _readdir_drop_dirp_buffer(dirp);
8507 dirp->reset();
8508 }
8509 } else {
8510 if (offset == 0 ||
8511 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8512 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8513 _readdir_drop_dirp_buffer(dirp);
8514 dirp->reset();
8515 }
8516 }
8517
8518 dirp->offset = offset;
8519}
8520
8521
8522//struct dirent {
8523// ino_t d_ino; /* inode number */
8524// off_t d_off; /* offset to the next dirent */
8525// unsigned short d_reclen; /* length of this record */
8526// unsigned char d_type; /* type of file */
8527// char d_name[256]; /* filename */
8528//};
8529void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8530{
8531 strncpy(de->d_name, name, 255);
8532 de->d_name[255] = '\0';
f67539c2 8533#if !defined(__CYGWIN__) && !(defined(_WIN32))
7c673cae 8534 de->d_ino = ino;
11fdf7f2 8535#if !defined(__APPLE__) && !defined(__FreeBSD__)
7c673cae
FG
8536 de->d_off = next_off;
8537#endif
8538 de->d_reclen = 1;
8539 de->d_type = IFTODT(type);
11fdf7f2 8540 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7c673cae
FG
8541 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8542#endif
8543}
8544
8545void Client::_readdir_next_frag(dir_result_t *dirp)
8546{
8547 frag_t fg = dirp->buffer_frag;
8548
8549 if (fg.is_rightmost()) {
11fdf7f2 8550 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7c673cae
FG
8551 dirp->set_end();
8552 return;
8553 }
8554
8555 // advance
8556 fg = fg.next();
11fdf7f2 8557 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7c673cae
FG
8558
8559 if (dirp->hash_order()) {
8560 // keep last_name
8561 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8562 if (dirp->offset < new_offset) // don't decrease offset
8563 dirp->offset = new_offset;
8564 } else {
8565 dirp->last_name.clear();
8566 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8567 _readdir_rechoose_frag(dirp);
8568 }
8569}
8570
8571void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8572{
11fdf7f2 8573 ceph_assert(dirp->inode);
7c673cae
FG
8574
8575 if (dirp->hash_order())
8576 return;
8577
8578 frag_t cur = frag_t(dirp->offset_high());
8579 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8580 if (fg != cur) {
11fdf7f2 8581 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7c673cae
FG
8582 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8583 dirp->last_name.clear();
8584 dirp->next_offset = 2;
8585 }
8586}
8587
8588void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8589{
11fdf7f2 8590 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7c673cae
FG
8591 dirp->buffer.clear();
8592}
8593
8594int Client::_readdir_get_frag(dir_result_t *dirp)
8595{
11fdf7f2
TL
8596 ceph_assert(dirp);
8597 ceph_assert(dirp->inode);
7c673cae
FG
8598
8599 // get the current frag.
8600 frag_t fg;
8601 if (dirp->hash_order())
8602 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8603 else
8604 fg = frag_t(dirp->offset_high());
8605
11fdf7f2 8606 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7c673cae
FG
8607 << " offset " << hex << dirp->offset << dec << dendl;
8608
8609 int op = CEPH_MDS_OP_READDIR;
8610 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8611 op = CEPH_MDS_OP_LSSNAP;
8612
8613 InodeRef& diri = dirp->inode;
8614
8615 MetaRequest *req = new MetaRequest(op);
8616 filepath path;
8617 diri->make_nosnap_relative_path(path);
8618 req->set_filepath(path);
8619 req->set_inode(diri.get());
8620 req->head.args.readdir.frag = fg;
8621 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8622 if (dirp->last_name.length()) {
94b18763 8623 req->path2.set_path(dirp->last_name);
7c673cae
FG
8624 } else if (dirp->hash_order()) {
8625 req->head.args.readdir.offset_hash = dirp->offset_high();
8626 }
8627 req->dirp = dirp;
8628
8629 bufferlist dirbl;
8630 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8631
f67539c2 8632 if (res == -CEPHFS_EAGAIN) {
11fdf7f2 8633 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7c673cae
FG
8634 _readdir_rechoose_frag(dirp);
8635 return _readdir_get_frag(dirp);
8636 }
8637
8638 if (res == 0) {
11fdf7f2 8639 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7c673cae
FG
8640 << " size " << dirp->buffer.size() << dendl;
8641 } else {
11fdf7f2 8642 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7c673cae
FG
8643 dirp->set_end();
8644 }
8645
8646 return res;
8647}
8648
8649struct dentry_off_lt {
8650 bool operator()(const Dentry* dn, int64_t off) const {
8651 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8652 }
8653};
8654
8655int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8656 int caps, bool getref)
8657{
f67539c2 8658 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11fdf7f2 8659 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7c673cae
FG
8660 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8661 << dendl;
8662 Dir *dir = dirp->inode->dir;
8663
8664 if (!dir) {
8665 ldout(cct, 10) << " dir is empty" << dendl;
8666 dirp->set_end();
8667 return 0;
8668 }
8669
8670 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8671 dir->readdir_cache.end(),
8672 dirp->offset, dentry_off_lt());
8673
8674 string dn_name;
8675 while (true) {
adb31ebb 8676 int mask = caps;
7c673cae 8677 if (!dirp->inode->is_complete_and_ordered())
f67539c2 8678 return -CEPHFS_EAGAIN;
7c673cae
FG
8679 if (pd == dir->readdir_cache.end())
8680 break;
8681 Dentry *dn = *pd;
8682 if (dn->inode == NULL) {
8683 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8684 ++pd;
8685 continue;
8686 }
8687 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8688 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8689 ++pd;
8690 continue;
8691 }
8692
92f5a8d4 8693 int idx = pd - dir->readdir_cache.begin();
adb31ebb
TL
8694 if (dn->inode->is_dir()) {
8695 mask |= CEPH_STAT_RSTAT;
8696 }
8697 int r = _getattr(dn->inode, mask, dirp->perms);
7c673cae
FG
8698 if (r < 0)
8699 return r;
92f5a8d4
TL
8700
8701 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8702 pd = dir->readdir_cache.begin() + idx;
8703 if (pd >= dir->readdir_cache.end() || *pd != dn)
f67539c2 8704 return -CEPHFS_EAGAIN;
7c673cae
FG
8705
8706 struct ceph_statx stx;
8707 struct dirent de;
8708 fill_statx(dn->inode, caps, &stx);
8709
8710 uint64_t next_off = dn->offset + 1;
eafe8130 8711 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7c673cae
FG
8712 ++pd;
8713 if (pd == dir->readdir_cache.end())
8714 next_off = dir_result_t::END;
8715
8716 Inode *in = NULL;
7c673cae
FG
8717 if (getref) {
8718 in = dn->inode.get();
8719 _ll_get(in);
8720 }
8721
8722 dn_name = dn->name; // fill in name while we have lock
8723
9f95a23c 8724 client_lock.unlock();
7c673cae 8725 r = cb(p, &de, &stx, next_off, in); // _next_ offset
9f95a23c 8726 client_lock.lock();
7c673cae
FG
8727 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8728 << " = " << r << dendl;
8729 if (r < 0) {
8730 return r;
8731 }
8732
8733 dirp->offset = next_off;
8734 if (dirp->at_end())
8735 dirp->next_offset = 2;
8736 else
8737 dirp->next_offset = dirp->offset_low();
8738 dirp->last_name = dn_name; // we successfully returned this one; update!
28e407b8 8739 dirp->release_count = 0; // last_name no longer match cache index
7c673cae
FG
8740 if (r > 0)
8741 return r;
8742 }
8743
11fdf7f2 8744 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7c673cae
FG
8745 dirp->set_end();
8746 return 0;
8747}
8748
8749int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8750 unsigned want, unsigned flags, bool getref)
8751{
8752 int caps = statx_to_mask(flags, want);
8753
f67539c2
TL
8754 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8755 if (!mref_reader.is_state_satisfied())
8756 return -CEPHFS_ENOTCONN;
7c673cae 8757
f67539c2 8758 std::unique_lock cl(client_lock);
181888fb 8759
7c673cae
FG
8760 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8761
11fdf7f2 8762 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
7c673cae
FG
8763 << dec << " at_end=" << dirp->at_end()
8764 << " hash_order=" << dirp->hash_order() << dendl;
8765
8766 struct dirent de;
8767 struct ceph_statx stx;
8768 memset(&de, 0, sizeof(de));
8769 memset(&stx, 0, sizeof(stx));
8770
8771 InodeRef& diri = dirp->inode;
8772
8773 if (dirp->at_end())
8774 return 0;
8775
8776 if (dirp->offset == 0) {
8777 ldout(cct, 15) << " including ." << dendl;
11fdf7f2 8778 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
7c673cae
FG
8779 uint64_t next_off = 1;
8780
8781 int r;
adb31ebb 8782 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
7c673cae
FG
8783 if (r < 0)
8784 return r;
8785
8786 fill_statx(diri, caps, &stx);
8787 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8788
8789 Inode *inode = NULL;
8790 if (getref) {
8791 inode = diri.get();
8792 _ll_get(inode);
8793 }
8794
f67539c2 8795 cl.unlock();
7c673cae 8796 r = cb(p, &de, &stx, next_off, inode);
f67539c2 8797 cl.lock();
7c673cae
FG
8798 if (r < 0)
8799 return r;
8800
8801 dirp->offset = next_off;
8802 if (r > 0)
8803 return r;
8804 }
8805 if (dirp->offset == 1) {
8806 ldout(cct, 15) << " including .." << dendl;
8807 uint64_t next_off = 2;
8808 InodeRef in;
11fdf7f2 8809 if (diri->dentries.empty())
7c673cae
FG
8810 in = diri;
8811 else
94b18763 8812 in = diri->get_first_parent()->dir->parent_inode;
7c673cae
FG
8813
8814 int r;
adb31ebb 8815 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
7c673cae
FG
8816 if (r < 0)
8817 return r;
8818
8819 fill_statx(in, caps, &stx);
8820 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8821
8822 Inode *inode = NULL;
8823 if (getref) {
8824 inode = in.get();
8825 _ll_get(inode);
8826 }
8827
f67539c2 8828 cl.unlock();
7c673cae 8829 r = cb(p, &de, &stx, next_off, inode);
f67539c2 8830 cl.lock();
7c673cae
FG
8831 if (r < 0)
8832 return r;
8833
8834 dirp->offset = next_off;
8835 if (r > 0)
8836 return r;
8837 }
8838
8839 // can we read from our cache?
8840 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8841 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8842 << dirp->inode->is_complete_and_ordered()
8843 << " issued " << ccap_string(dirp->inode->caps_issued())
8844 << dendl;
8845 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8846 dirp->inode->is_complete_and_ordered() &&
94b18763 8847 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7c673cae 8848 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
f67539c2 8849 if (err != -CEPHFS_EAGAIN)
7c673cae
FG
8850 return err;
8851 }
8852
8853 while (1) {
8854 if (dirp->at_end())
8855 return 0;
8856
8857 bool check_caps = true;
8858 if (!dirp->is_cached()) {
8859 int r = _readdir_get_frag(dirp);
8860 if (r)
8861 return r;
8862 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8863 // different than the requested one. (our dirfragtree was outdated)
8864 check_caps = false;
8865 }
8866 frag_t fg = dirp->buffer_frag;
8867
8868 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8869 << " offset " << hex << dirp->offset << dendl;
8870
8871 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8872 dirp->offset, dir_result_t::dentry_off_lt());
8873 it != dirp->buffer.end();
8874 ++it) {
8875 dir_result_t::dentry &entry = *it;
8876
8877 uint64_t next_off = entry.offset + 1;
8878
8879 int r;
8880 if (check_caps) {
adb31ebb
TL
8881 int mask = caps;
8882 if(entry.inode->is_dir()){
8883 mask |= CEPH_STAT_RSTAT;
8884 }
8885 r = _getattr(entry.inode, mask, dirp->perms);
7c673cae
FG
8886 if (r < 0)
8887 return r;
8888 }
8889
8890 fill_statx(entry.inode, caps, &stx);
8891 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8892
8893 Inode *inode = NULL;
8894 if (getref) {
8895 inode = entry.inode.get();
8896 _ll_get(inode);
8897 }
8898
f67539c2 8899 cl.unlock();
7c673cae 8900 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
f67539c2 8901 cl.lock();
7c673cae
FG
8902
8903 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8904 << " = " << r << dendl;
8905 if (r < 0)
8906 return r;
8907
8908 dirp->offset = next_off;
8909 if (r > 0)
8910 return r;
8911 }
8912
8913 if (dirp->next_offset > 2) {
8914 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8915 _readdir_drop_dirp_buffer(dirp);
8916 continue; // more!
8917 }
8918
8919 if (!fg.is_rightmost()) {
8920 // next frag!
8921 _readdir_next_frag(dirp);
8922 continue;
8923 }
8924
8925 if (diri->shared_gen == dirp->start_shared_gen &&
8926 diri->dir_release_count == dirp->release_count) {
8927 if (diri->dir_ordered_count == dirp->ordered_count) {
8928 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8929 if (diri->dir) {
11fdf7f2 8930 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
7c673cae
FG
8931 diri->dir->readdir_cache.resize(dirp->cache_index);
8932 }
8933 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8934 } else {
8935 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8936 diri->flags |= I_COMPLETE;
8937 }
8938 }
8939
8940 dirp->set_end();
8941 return 0;
8942 }
8943 ceph_abort();
8944 return 0;
8945}
8946
8947
8948int Client::readdir_r(dir_result_t *d, struct dirent *de)
8949{
8950 return readdirplus_r(d, de, 0, 0, 0, NULL);
8951}
8952
8953/*
8954 * readdirplus_r
8955 *
8956 * returns
8957 * 1 if we got a dirent
8958 * 0 for end of directory
8959 * <0 on error
8960 */
8961
8962struct single_readdir {
8963 struct dirent *de;
8964 struct ceph_statx *stx;
8965 Inode *inode;
8966 bool full;
8967};
8968
8969static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8970 struct ceph_statx *stx, off_t off,
8971 Inode *in)
8972{
8973 single_readdir *c = static_cast<single_readdir *>(p);
8974
8975 if (c->full)
8976 return -1; // already filled this dirent
8977
8978 *c->de = *de;
8979 if (c->stx)
8980 *c->stx = *stx;
8981 c->inode = in;
8982 c->full = true;
8983 return 1;
8984}
8985
8986struct dirent *Client::readdir(dir_result_t *d)
8987{
8988 int ret;
f91f0fd5 8989 auto& de = d->de;
7c673cae
FG
8990 single_readdir sr;
8991 sr.de = &de;
8992 sr.stx = NULL;
8993 sr.inode = NULL;
8994 sr.full = false;
8995
8996 // our callback fills the dirent and sets sr.full=true on first
8997 // call, and returns -1 the second time around.
8998 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8999 if (ret < -1) {
9000 errno = -ret; // this sucks.
9001 return (dirent *) NULL;
9002 }
9003 if (sr.full) {
9004 return &de;
9005 }
9006 return (dirent *) NULL;
9007}
9008
9009int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9010 struct ceph_statx *stx, unsigned want,
9011 unsigned flags, Inode **out)
9012{
9013 single_readdir sr;
9014 sr.de = de;
9015 sr.stx = stx;
9016 sr.inode = NULL;
9017 sr.full = false;
9018
9019 // our callback fills the dirent and sets sr.full=true on first
9020 // call, and returns -1 the second time around.
9021 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9022 if (r < -1)
9023 return r;
9024 if (out)
9025 *out = sr.inode;
9026 if (sr.full)
9027 return 1;
9028 return 0;
9029}
9030
9031
9032/* getdents */
9033struct getdents_result {
9034 char *buf;
9035 int buflen;
9036 int pos;
9037 bool fullent;
9038};
9039
9040static int _readdir_getdent_cb(void *p, struct dirent *de,
9041 struct ceph_statx *stx, off_t off, Inode *in)
9042{
9043 struct getdents_result *c = static_cast<getdents_result *>(p);
9044
9045 int dlen;
9046 if (c->fullent)
9047 dlen = sizeof(*de);
9048 else
9049 dlen = strlen(de->d_name) + 1;
9050
9051 if (c->pos + dlen > c->buflen)
9052 return -1; // doesn't fit
9053
9054 if (c->fullent) {
9055 memcpy(c->buf + c->pos, de, sizeof(*de));
9056 } else {
9057 memcpy(c->buf + c->pos, de->d_name, dlen);
9058 }
9059 c->pos += dlen;
9060 return 0;
9061}
9062
9063int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9064{
9065 getdents_result gr;
9066 gr.buf = buf;
9067 gr.buflen = buflen;
9068 gr.fullent = fullent;
9069 gr.pos = 0;
9070
9071 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9072
9073 if (r < 0) { // some error
9074 if (r == -1) { // buffer ran out of space
9075 if (gr.pos) { // but we got some entries already!
9076 return gr.pos;
9077 } // or we need a larger buffer
f67539c2 9078 return -CEPHFS_ERANGE;
7c673cae
FG
9079 } else { // actual error, return it
9080 return r;
9081 }
9082 }
9083 return gr.pos;
9084}
9085
9086
9087/* getdir */
9088struct getdir_result {
9089 list<string> *contents;
9090 int num;
9091};
9092
9093static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9094{
9095 getdir_result *r = static_cast<getdir_result *>(p);
9096
9097 r->contents->push_back(de->d_name);
9098 r->num++;
9099 return 0;
9100}
9101
9102int Client::getdir(const char *relpath, list<string>& contents,
9103 const UserPerm& perms)
9104{
9105 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
f67539c2
TL
9106 tout(cct) << "getdir" << std::endl;
9107 tout(cct) << relpath << std::endl;
7c673cae
FG
9108
9109 dir_result_t *d;
9110 int r = opendir(relpath, &d, perms);
9111 if (r < 0)
9112 return r;
9113
9114 getdir_result gr;
9115 gr.contents = &contents;
9116 gr.num = 0;
9117 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9118
9119 closedir(d);
9120
9121 if (r < 0)
9122 return r;
9123 return gr.num;
9124}
9125
9126
9127/****** file i/o **********/
f67539c2 9128
b3b6e05e
TL
9129// common parts for open and openat. call with client_lock locked.
9130int Client::create_and_open(std::optional<int> dirfd, const char *relpath, int flags,
9131 const UserPerm& perms, mode_t mode, int stripe_unit,
9132 int stripe_count, int object_size, const char *data_pool,
9133 std::string alternate_name) {
9134 ceph_assert(ceph_mutex_is_locked(client_lock));
f91f0fd5 9135 int cflags = ceph_flags_sys2wire(flags);
f91f0fd5 9136 tout(cct) << cflags << std::endl;
7c673cae
FG
9137
9138 Fh *fh = NULL;
9139
9140#if defined(__linux__) && defined(O_PATH)
9141 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9142 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9143 * in kernel (fs/open.c). */
9144 if (flags & O_PATH)
9145 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9146#endif
9147
9148 filepath path(relpath);
9149 InodeRef in;
9150 bool created = false;
9151 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9152 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
f91f0fd5
TL
9153 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9154
b3b6e05e
TL
9155 InodeRef dirinode = nullptr;
9156 if (dirfd) {
9157 int r = get_fd_inode(*dirfd, &dirinode);
9158 if (r < 0) {
9159 return r;
9160 }
9161 }
7c673cae 9162
b3b6e05e 9163 int r = path_walk(path, &in, perms, followsym, mask, dirinode);
7c673cae 9164 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 9165 return -CEPHFS_EEXIST;
7c673cae
FG
9166
9167#if defined(__linux__) && defined(O_PATH)
9168 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9169#else
b3b6e05e 9170 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
7c673cae 9171#endif
f67539c2 9172 return -CEPHFS_ELOOP;
7c673cae 9173
f67539c2 9174 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
7c673cae
FG
9175 filepath dirpath = path;
9176 string dname = dirpath.last_dentry();
9177 dirpath.pop_dentry();
9178 InodeRef dir;
9179 r = path_walk(dirpath, &dir, perms, true,
b3b6e05e
TL
9180 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9181 if (r < 0) {
7c673cae 9182 goto out;
b3b6e05e 9183 }
7c673cae
FG
9184 if (cct->_conf->client_permissions) {
9185 r = may_create(dir.get(), perms);
9186 if (r < 0)
b3b6e05e 9187 goto out;
7c673cae
FG
9188 }
9189 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
f67539c2
TL
9190 stripe_count, object_size, data_pool, &created, perms,
9191 std::move(alternate_name));
7c673cae
FG
9192 }
9193 if (r < 0)
9194 goto out;
9195
9196 if (!created) {
9197 // posix says we can only check permissions of existing files
9198 if (cct->_conf->client_permissions) {
9199 r = may_open(in.get(), flags, perms);
9200 if (r < 0)
b3b6e05e 9201 goto out;
7c673cae
FG
9202 }
9203 }
9204
9205 if (!fh)
9206 r = _open(in.get(), flags, mode, &fh, perms);
9207 if (r >= 0) {
9208 // allocate a integer file descriptor
11fdf7f2 9209 ceph_assert(fh);
7c673cae 9210 r = get_fd();
11fdf7f2 9211 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
9212 fd_map[r] = fh;
9213 }
9214
9215 out:
b3b6e05e
TL
9216 return r;
9217}
9218
9219int Client::open(const char *relpath, int flags, const UserPerm& perms,
9220 mode_t mode, int stripe_unit, int stripe_count,
9221 int object_size, const char *data_pool, std::string alternate_name)
9222{
9223 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9224 stripe_count, object_size, data_pool, alternate_name);
9225}
9226
9227int Client::_openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9228 mode_t mode, std::string alternate_name) {
9229 return create_and_open(dirfd, relpath, flags, perms, mode, 0, 0, 0, NULL, alternate_name);
9230}
9231
9232int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9233 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9234 const char *data_pool, std::string alternate_name) {
9235 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9236 if (!mref_reader.is_state_satisfied()) {
9237 return -CEPHFS_ENOTCONN;
9238 }
9239
9240 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9241 tout(cct) << dirfd << std::endl;
9242 tout(cct) << relpath << std::endl;
9243 tout(cct) << flags << std::endl;
9244 tout(cct) << mode << std::endl;
9245
9246 std::scoped_lock locker(client_lock);
9247 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9248 object_size, data_pool, alternate_name);
9249
7c673cae 9250 tout(cct) << r << std::endl;
b3b6e05e 9251 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
7c673cae
FG
9252 return r;
9253}
9254
7c673cae
FG
9255int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9256 const UserPerm& perms)
9257{
11fdf7f2 9258 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 9259
f67539c2
TL
9260 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9261 if (!mref_reader.is_state_satisfied())
9262 return -CEPHFS_ENOTCONN;
181888fb 9263
f67539c2 9264 std::scoped_lock lock(client_lock);
7c673cae
FG
9265 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9266 filepath path(ino);
9267 req->set_filepath(path);
9268
9269 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9270 char f[30];
9271 sprintf(f, "%u", h);
9272 filepath path2(dirino);
9273 path2.push_dentry(string(f));
9274 req->set_filepath2(path2);
9275
9276 int r = make_request(req, perms, NULL, NULL,
9277 rand() % mdsmap->get_num_in_mds());
11fdf7f2 9278 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
9279 return r;
9280}
9281
9282
9283/**
9284 * Load inode into local cache.
9285 *
9286 * If inode pointer is non-NULL, and take a reference on
9287 * the resulting Inode object in one operation, so that caller
9288 * can safely assume inode will still be there after return.
9289 */
f67539c2 9290int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
7c673cae 9291{
f67539c2 9292 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
7c673cae 9293
f67539c2
TL
9294 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9295 if (!mref_reader.is_state_satisfied())
9296 return -CEPHFS_ENOTCONN;
181888fb 9297
b3b6e05e
TL
9298 if (is_reserved_vino(vino))
9299 return -CEPHFS_ESTALE;
9300
7c673cae 9301 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
f67539c2 9302 filepath path(vino.ino);
7c673cae
FG
9303 req->set_filepath(path);
9304
f67539c2
TL
9305 /*
9306 * The MDS expects either a "real" snapid here or 0. The special value
9307 * carveouts for the snapid are all at the end of the range so we can
9308 * just look for any snapid below this value.
9309 */
9310 if (vino.snapid < CEPH_NOSNAP)
9311 req->head.args.lookupino.snapid = vino.snapid;
9312
7c673cae
FG
9313 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9314 if (r == 0 && inode != NULL) {
7c673cae 9315 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 9316 ceph_assert(p != inode_map.end());
7c673cae
FG
9317 *inode = p->second;
9318 _ll_get(*inode);
9319 }
f67539c2 9320 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
7c673cae
FG
9321 return r;
9322}
9323
1adf2230
AA
9324int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9325{
f67539c2
TL
9326 vinodeno_t vino(ino, CEPH_NOSNAP);
9327 std::scoped_lock lock(client_lock);
9328 return _lookup_vino(vino, perms, inode);
1adf2230 9329}
7c673cae
FG
9330
9331/**
9332 * Find the parent inode of `ino` and insert it into
9333 * our cache. Conditionally also set `parent` to a referenced
9334 * Inode* if caller provides non-NULL value.
9335 */
1adf2230 9336int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 9337{
11fdf7f2 9338 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9339
7c673cae
FG
9340 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9341 filepath path(ino->ino);
9342 req->set_filepath(path);
9343
9344 InodeRef target;
9345 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9346 // Give caller a reference to the parent ino if they provided a pointer.
9347 if (parent != NULL) {
9348 if (r == 0) {
9349 *parent = target.get();
9350 _ll_get(*parent);
11fdf7f2 9351 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
9352 } else {
9353 *parent = NULL;
9354 }
9355 }
11fdf7f2 9356 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9357 return r;
9358}
9359
7c673cae
FG
9360/**
9361 * Populate the parent dentry for `ino`, provided it is
9362 * a child of `parent`.
9363 */
1adf2230 9364int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 9365{
11fdf7f2
TL
9366 ceph_assert(parent->is_dir());
9367 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9368
f67539c2
TL
9369 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9370 if (!mref_reader.is_state_satisfied())
9371 return -CEPHFS_ENOTCONN;
181888fb 9372
7c673cae
FG
9373 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9374 req->set_filepath2(filepath(parent->ino));
9375 req->set_filepath(filepath(ino->ino));
9376 req->set_inode(ino);
9377
9378 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 9379 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9380 return r;
9381}
9382
1adf2230
AA
9383int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9384{
f67539c2 9385 std::scoped_lock lock(client_lock);
1adf2230
AA
9386 return _lookup_name(ino, parent, perms);
9387}
7c673cae 9388
11fdf7f2 9389Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 9390{
11fdf7f2 9391 ceph_assert(in);
f6b5b4d7 9392 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
7c673cae 9393
11fdf7f2 9394 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
9395
9396 if (in->snapid != CEPH_NOSNAP) {
9397 in->snap_cap_refs++;
9398 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9399 << ccap_string(in->caps_issued()) << dendl;
9400 }
9401
11fdf7f2 9402 const auto& conf = cct->_conf;
7c673cae
FG
9403 f->readahead.set_trigger_requests(1);
9404 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9405 uint64_t max_readahead = Readahead::NO_LIMIT;
9406 if (conf->client_readahead_max_bytes) {
11fdf7f2 9407 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
9408 }
9409 if (conf->client_readahead_max_periods) {
11fdf7f2 9410 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
9411 }
9412 f->readahead.set_max_readahead_size(max_readahead);
9413 vector<uint64_t> alignments;
9414 alignments.push_back(in->layout.get_period());
9415 alignments.push_back(in->layout.stripe_unit);
9416 f->readahead.set_alignments(alignments);
9417
9418 return f;
9419}
9420
9421int Client::_release_fh(Fh *f)
9422{
9423 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9424 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9425 Inode *in = f->inode.get();
11fdf7f2 9426 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
7c673cae 9427
b32b8144
FG
9428 in->unset_deleg(f);
9429
7c673cae
FG
9430 if (in->snapid == CEPH_NOSNAP) {
9431 if (in->put_open_ref(f->mode)) {
9432 _flush(in, new C_Client_FlushComplete(this, in));
9433 check_caps(in, 0);
9434 }
9435 } else {
11fdf7f2 9436 ceph_assert(in->snap_cap_refs > 0);
7c673cae
FG
9437 in->snap_cap_refs--;
9438 }
9439
9440 _release_filelocks(f);
9441
9442 // Finally, read any async err (i.e. from flushes)
9443 int err = f->take_async_err();
9444 if (err != 0) {
11fdf7f2 9445 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
7c673cae
FG
9446 << cpp_strerror(err) << dendl;
9447 } else {
11fdf7f2 9448 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
7c673cae
FG
9449 }
9450
9451 _put_fh(f);
9452
9453 return err;
9454}
9455
9456void Client::_put_fh(Fh *f)
9457{
9458 int left = f->put();
9459 if (!left) {
9460 delete f;
9461 }
9462}
9463
9464int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9465 const UserPerm& perms)
9466{
9467 if (in->snapid != CEPH_NOSNAP &&
9468 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
f67539c2 9469 return -CEPHFS_EROFS;
7c673cae
FG
9470 }
9471
9472 // use normalized flags to generate cmode
11fdf7f2
TL
9473 int cflags = ceph_flags_sys2wire(flags);
9474 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9475 cflags |= CEPH_O_LAZY;
9476
9477 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
9478 int want = ceph_caps_for_mode(cmode);
9479 int result = 0;
9480
9481 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
9482
b32b8144 9483 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
9484 // update wanted?
9485 check_caps(in, CHECK_CAPS_NODELAY);
9486 } else {
b32b8144 9487
7c673cae
FG
9488 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9489 filepath path;
9490 in->make_nosnap_relative_path(path);
9491 req->set_filepath(path);
11fdf7f2 9492 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
9493 req->head.args.open.mode = mode;
9494 req->head.args.open.pool = -1;
9495 if (cct->_conf->client_debug_getattr_caps)
9496 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9497 else
9498 req->head.args.open.mask = 0;
9499 req->head.args.open.old_size = in->size; // for O_TRUNC
9500 req->set_inode(in);
9501 result = make_request(req, perms);
b32b8144
FG
9502
9503 /*
9504 * NFS expects that delegations will be broken on a conflicting open,
9505 * not just when there is actual conflicting access to the file. SMB leases
9506 * and oplocks also have similar semantics.
9507 *
9508 * Ensure that clients that have delegations enabled will wait on minimal
9509 * caps during open, just to ensure that other clients holding delegations
9510 * return theirs first.
9511 */
9512 if (deleg_timeout && result == 0) {
9513 int need = 0, have;
9514
9515 if (cmode & CEPH_FILE_MODE_WR)
9516 need |= CEPH_CAP_FILE_WR;
9517 if (cmode & CEPH_FILE_MODE_RD)
9518 need |= CEPH_CAP_FILE_RD;
9519
f6b5b4d7
TL
9520 Fh fh(in, flags, cmode, fd_gen, perms);
9521 result = get_caps(&fh, need, want, &have, -1);
b32b8144 9522 if (result < 0) {
1adf2230 9523 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
9524 " . Denying open: " <<
9525 cpp_strerror(result) << dendl;
b32b8144
FG
9526 } else {
9527 put_cap_ref(in, need);
9528 }
9529 }
7c673cae
FG
9530 }
9531
9532 // success?
9533 if (result >= 0) {
9534 if (fhp)
9535 *fhp = _create_fh(in, flags, cmode, perms);
9536 } else {
9537 in->put_open_ref(cmode);
9538 }
9539
9540 trim_cache();
9541
9542 return result;
9543}
9544
// Renew file capabilities for an inode whose caps may be going stale.
//
// If we already hold some caps and either want no write caps or still have
// an auth cap, a plain cap check/flush is enough. Otherwise re-issue an
// OPEN request to the MDS (with flags derived from the currently wanted
// caps) so the MDS re-grants what we need.
//
// Returns 0 on the fast path, otherwise the result of make_request().
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  // Fast path: caps exist and either nothing writable is wanted or we still
  // have the auth cap — just nudge the cap state machine.
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // Translate wanted caps into open(2)-style flags for the MDS OPEN op.
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
9582
b3b6e05e 9583int Client::_close(int fd)
7c673cae
FG
9584{
9585 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
7c673cae
FG
9586 tout(cct) << "close" << std::endl;
9587 tout(cct) << fd << std::endl;
9588
9589 Fh *fh = get_filehandle(fd);
9590 if (!fh)
f67539c2 9591 return -CEPHFS_EBADF;
7c673cae
FG
9592 int err = _release_fh(fh);
9593 fd_map.erase(fd);
9594 put_fd(fd);
9595 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9596 return err;
9597}
9598
b3b6e05e
TL
9599int Client::close(int fd) {
9600 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9601 if (!mref_reader.is_state_satisfied())
9602 return -CEPHFS_ENOTCONN;
9603
9604 std::scoped_lock lock(client_lock);
9605 return _close(fd);
9606}
7c673cae
FG
9607
9608// ------------
9609// read, write
9610
9611loff_t Client::lseek(int fd, loff_t offset, int whence)
9612{
f67539c2
TL
9613 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9614 if (!mref_reader.is_state_satisfied())
9615 return -CEPHFS_ENOTCONN;
9616
7c673cae
FG
9617 tout(cct) << "lseek" << std::endl;
9618 tout(cct) << fd << std::endl;
9619 tout(cct) << offset << std::endl;
9620 tout(cct) << whence << std::endl;
9621
f67539c2 9622 std::scoped_lock lock(client_lock);
7c673cae
FG
9623 Fh *f = get_filehandle(fd);
9624 if (!f)
f67539c2 9625 return -CEPHFS_EBADF;
7c673cae
FG
9626#if defined(__linux__) && defined(O_PATH)
9627 if (f->flags & O_PATH)
f67539c2 9628 return -CEPHFS_EBADF;
7c673cae
FG
9629#endif
9630 return _lseek(f, offset, whence);
9631}
9632
// Core seek implementation. For whence values whose result depends on the
// current file size (SEEK_END, and SEEK_DATA/SEEK_HOLE where available),
// the size is refreshed from the MDS first. Returns the new position or a
// negative CEPHFS_* error. Caller must hold client_lock.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // First pass over 'whence': decide whether we need an up-to-date size.
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // Refresh in->size so the computations below use the authoritative size.
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  // Second pass: compute the target position.
  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // No sparseness information here: every in-range offset is "data".
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    // Likewise, the only "hole" we can report is the implicit one at EOF.
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -CEPHFS_EINVAL;
  }

  // A computed negative position (e.g. SEEK_CUR past the start) is invalid.
  if (pos < 0) {
    return -CEPHFS_EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9706
9707
// Acquire the per-handle file-position lock. Waiters queue FIFO on
// f->pos_waiters; each waiter blocks until the lock is free AND it is at
// the front of the queue, which preserves fairness. Caller must hold
// client_lock (it is temporarily adopted/released around the wait).
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    // Enqueue ourselves; unlock_fh_pos() wakes only the front waiter.
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Adopt client_lock (already held) so the condvar can release it while
    // waiting; release() afterwards so ownership stays with the caller.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9728
9729void Client::unlock_fh_pos(Fh *f)
9730{
f67539c2
TL
9731 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9732
11fdf7f2 9733 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae 9734 f->pos_locked = false;
f67539c2
TL
9735 if (!f->pos_waiters.empty()) {
9736 // only wake up the oldest waiter
9737 auto cond = f->pos_waiters.front();
9738 cond->notify_one();
9739 }
7c673cae
FG
9740}
9741
// Migrate MDS-inlined file data out to the first RADOS object of the file.
// Issues two mutations: (1) create the object (non-exclusive), then
// (2) guarded by an inline_version cmpxattr, write the inline bytes and
// record the version as an xattr. 'onfinish' fires when the second mutation
// completes; if there is no inline data it completes immediately with 0.
// Always returns 0 (completion status is delivered via onfinish).
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // Object name of the file's first (index 0) object: "<ino-hex>.00000000".
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // Step 1: make sure the object exists (create(false) => not exclusive).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Step 2: write the data only if our inline_version is newer than the
  // one recorded on the object (cmpxattr GT guard prevents stale writes).
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
9786
9787//
9788
9789// blocking osd interface
9790
// Public read(2)/pread(2) entry point. Reads up to 'size' bytes at
// 'offset' (offset < 0 means "use and advance the handle's position",
// handled inside _read). Returns bytes read or a negative CEPHFS_* error.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::unique_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be read from.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // Copy into the user buffer outside client_lock; 'bl' is local so this
    // is safe once _read has returned.
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9822
9823int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9824{
9825 if (iovcnt < 0)
f67539c2 9826 return -CEPHFS_EINVAL;
7c673cae
FG
9827 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9828}
9829
// Core read path. Handles: implicit file position (offset < 0), MDS-inline
// data, cached (async) vs. sync reads depending on caps/config, and the
// short-read-at-stale-EOF retry loop. Returns bytes read or a negative
// CEPHFS_* error. Caller must hold client_lock (dropped/retaken around
// blocking waits).
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;            // true => we own the fh position lock
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // Negative offset => read at (and advance) the handle's position.
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means "unknown": fetch it before deciding the path.
  if (in->inline_version == 0) {
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  // O_DIRECT must bypass the cache regardless of granted caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Without the cache cap we can't serve inline data locally; kick off
      // uninlining and fall through to a normal object read.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read straight from the inline data, zero-filling the gap
      // between the inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        rc = endoff - offset;
      } else {
        rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // Cached path (object cacher). O_RSYNC forces dirty data out first.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // Short read: our cached size may be stale. Drop caps, re-fetch the
      // size, and retry if there is still data past what we read.
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait for the uninline op outside client_lock; ECANCELED means someone
    // else already uninlined the data, which is fine too.
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
9979
// Completion context for a speculative readahead issued by _read_async().
// Construction pins the Fh and bumps the pending-readahead counter; the
// destructor undoes both. finish() drops the cap references taken when the
// readahead was initiated.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();                      // keep the handle alive for the async op
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // Release the RD|CACHE refs taken when the readahead was started.
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9995
// Read through the object cacher, possibly blocking on a cache miss, and
// then opportunistically kick off readahead for the predicted next extent.
// Returns bytes read (possibly 0 at/past EOF) or a negative error.
// Caller must hold client_lock (dropped/retaken around the blocking wait).
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: hold a CACHE cap ref while blocked so it can't be revoked
    // out from under the in-flight read.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      // Fire-and-forget: C_Readahead drops the refs when the read completes.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        // Data already cached: the context will never fire, delete it.
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
10052
// Synchronous (cache-bypassing) read straight from the OSDs via the Filer.
// Loops until 'len' bytes are gathered, zero-filling short reads up to the
// locally-known EOF; *checkeof is set when the caller should re-verify the
// file size (our size info may be stale). Returns bytes read or a negative
// error. Caller must hold client_lock (dropped around each OSD wait).
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: runs while client_lock is NOT held; it only touches locals
  // captured by reference and the completion/bufferlist it is given.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
        if (left == 0)
          return 0;
      }

      // Size info may be stale: ask the caller to re-check EOF.
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    // Drop the lock only around the blocking wait/copy.
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10125
10126int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10127{
f67539c2
TL
10128 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10129 if (!mref_reader.is_state_satisfied())
10130 return -CEPHFS_ENOTCONN;
10131
7c673cae
FG
10132 tout(cct) << "write" << std::endl;
10133 tout(cct) << fd << std::endl;
10134 tout(cct) << size << std::endl;
10135 tout(cct) << offset << std::endl;
10136
f67539c2 10137 std::scoped_lock lock(client_lock);
7c673cae
FG
10138 Fh *fh = get_filehandle(fd);
10139 if (!fh)
f67539c2 10140 return -CEPHFS_EBADF;
7c673cae
FG
10141#if defined(__linux__) && defined(O_PATH)
10142 if (fh->flags & O_PATH)
f67539c2 10143 return -CEPHFS_EBADF;
7c673cae 10144#endif
11fdf7f2
TL
10145 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10146 size = std::min(size, (loff_t)INT_MAX);
10147 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
10148 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10149 return r;
10150}
10151
10152int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10153{
10154 if (iovcnt < 0)
f67539c2 10155 return -CEPHFS_EINVAL;
7c673cae
FG
10156 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10157}
10158
// Shared scatter/gather implementation for preadv/pwritev. Sums the iovec
// lengths, optionally clamps the total to INT_MAX (for 32-bit-returning
// callers), then either writes directly from the iovecs or reads into a
// bufferlist and scatters it back out. 'cl' is the held client_lock; it is
// released only while copying read data into user memory.
// Returns bytes transferred or a negative CEPHFS_* error.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int, std::unique_lock<ceph::mutex> &cl)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write consumes the iovecs directly; no intermediate copy here.
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Copy into user iovecs without holding client_lock; 'bl' is local.
    cl.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    cl.lock();
    return r;
  }
}
10207
11fdf7f2
TL
10208int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10209{
f67539c2
TL
10210 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10211 if (!mref_reader.is_state_satisfied())
10212 return -CEPHFS_ENOTCONN;
10213
11fdf7f2
TL
10214 tout(cct) << fd << std::endl;
10215 tout(cct) << offset << std::endl;
10216
f67539c2 10217 std::unique_lock cl(client_lock);
11fdf7f2
TL
10218 Fh *fh = get_filehandle(fd);
10219 if (!fh)
f67539c2
TL
10220 return -CEPHFS_EBADF;
10221 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true, cl);
11fdf7f2
TL
10222}
10223
// Core write path. Exactly one of 'buf' or 'iov' supplies the data.
// Handles: implicit position / O_APPEND (offset < 0), quota and pool-full
// checks, cap acquisition, setuid/setgid clearing, MDS-inline data
// (update in place or trigger uninlining), buffered (object cacher) vs.
// direct sync writes, and post-write size/mtime/cap bookkeeping.
// Returns bytes written or a negative CEPHFS_* error. Caller must hold
// client_lock (dropped/retaken around blocking waits).
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;   // new fh position to install on success (0 = none)

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -CEPHFS_EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means "unknown": fetch before choosing a path.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed briefly below to inspect in->mode for sgid/suid.
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT must not use the buffer/lazyio caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline (or we lack the cap): start uninlining
      // and continue with a normal object write below.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Update the inline data in place: keep the tail beyond the write,
      // pad any gap with zeros, then append the new bytes.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // Drop the lock while the OSD write is in flight.
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait for the uninline op outside the lock; ECANCELED means someone
    // else uninlined first, which still lets us clear the local copy.
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
10457
10458int Client::_flush(Fh *f)
10459{
10460 Inode *in = f->inode.get();
10461 int err = f->take_async_err();
10462 if (err != 0) {
10463 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10464 << cpp_strerror(err) << dendl;
10465 } else {
10466 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10467 }
10468
10469 return err;
10470}
10471
10472int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10473{
10474 struct ceph_statx stx;
10475 stx.stx_size = length;
10476 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10477}
10478
10479int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10480{
f67539c2
TL
10481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10482 if (!mref_reader.is_state_satisfied())
10483 return -CEPHFS_ENOTCONN;
10484
11fdf7f2 10485 tout(cct) << __func__ << std::endl;
7c673cae
FG
10486 tout(cct) << fd << std::endl;
10487 tout(cct) << length << std::endl;
10488
f67539c2 10489 std::scoped_lock lock(client_lock);
7c673cae
FG
10490 Fh *f = get_filehandle(fd);
10491 if (!f)
f67539c2 10492 return -CEPHFS_EBADF;
7c673cae
FG
10493#if defined(__linux__) && defined(O_PATH)
10494 if (f->flags & O_PATH)
f67539c2 10495 return -CEPHFS_EBADF;
7c673cae 10496#endif
adb31ebb 10497 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 10498 return -CEPHFS_EBADF;
7c673cae
FG
10499 struct stat attr;
10500 attr.st_size = length;
10501 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10502}
10503
// Public fsync(2)/fdatasync(2) entry point. syncdataonly=true skips the
// metadata (cap) flush. Any previously recorded async write error on the
// handle is surfaced (and cleared) here. Returns 0 or a negative error.
int Client::fsync(int fd, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
10540
// Flush an inode's dirty data (and, unless syncdataonly, its dirty caps
// and unsafe MDS requests) and wait for everything to commit.
// Returns 0 on success or a negative error from the data flush.
// Caller must hold client_lock (dropped around the data-flush wait).
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off an object-cacher flush now; we wait on it further below.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty metadata (caps) to the MDS synchronously; remember the
    // flush tid so we can wait for its ack below.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Ask the MDS to flush its log, then wait for our newest unsafe
    // request to become safe (which implies all earlier ones are too).
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10609
10610int Client::_fsync(Fh *f, bool syncdataonly)
10611{
1adf2230 10612 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
10613 return _fsync(f->inode.get(), syncdataonly);
10614}
10615
10616int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10617{
f67539c2
TL
10618 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10619 if (!mref_reader.is_state_satisfied())
10620 return -CEPHFS_ENOTCONN;
10621
7c673cae
FG
10622 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10623 tout(cct) << fd << std::endl;
10624
f67539c2 10625 std::scoped_lock lock(client_lock);
7c673cae
FG
10626 Fh *f = get_filehandle(fd);
10627 if (!f)
f67539c2 10628 return -CEPHFS_EBADF;
7c673cae
FG
10629 int r = _getattr(f->inode, mask, perms);
10630 if (r < 0)
10631 return r;
10632 fill_stat(f->inode, stbuf, NULL);
1adf2230 10633 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
10634 return r;
10635}
10636
10637int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10638 unsigned int want, unsigned int flags)
10639{
f67539c2
TL
10640 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10641 if (!mref_reader.is_state_satisfied())
10642 return -CEPHFS_ENOTCONN;
10643
7c673cae
FG
10644 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10645 tout(cct) << fd << std::endl;
10646
f67539c2 10647 std::scoped_lock lock(client_lock);
7c673cae
FG
10648 Fh *f = get_filehandle(fd);
10649 if (!f)
f67539c2 10650 return -CEPHFS_EBADF;
7c673cae
FG
10651
10652 unsigned mask = statx_to_mask(flags, want);
10653
10654 int r = 0;
b3b6e05e 10655 if (mask) {
7c673cae
FG
10656 r = _getattr(f->inode, mask, perms);
10657 if (r < 0) {
10658 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10659 return r;
10660 }
10661 }
10662
10663 fill_statx(f->inode, mask, stx);
10664 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10665 return r;
10666}
10667
b3b6e05e
TL
10668int Client::statxat(int dirfd, const char *relpath,
10669 struct ceph_statx *stx, const UserPerm& perms,
10670 unsigned int want, unsigned int flags) {
10671 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10672 if (!mref_reader.is_state_satisfied()) {
10673 return -CEPHFS_ENOTCONN;
10674 }
10675
10676 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10677 tout(cct) << dirfd << std::endl;
10678 tout(cct) << relpath << std::endl;
10679
10680 unsigned mask = statx_to_mask(flags, want);
10681
10682 InodeRef dirinode;
10683 std::scoped_lock lock(client_lock);
10684 int r = get_fd_inode(dirfd, &dirinode);
10685 if (r < 0) {
10686 return r;
10687 }
10688
10689 InodeRef in;
10690 filepath path(relpath);
10691 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10692 if (r < 0) {
10693 return r;
10694 }
10695 r = _getattr(in, mask, perms);
10696 if (r < 0) {
10697 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10698 return r;
10699 }
10700
10701 fill_statx(in, mask, stx);
10702 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
10703 return r;
10704}
10705
7c673cae
FG
10706// not written yet, but i want to link!
10707
10708int Client::chdir(const char *relpath, std::string &new_cwd,
10709 const UserPerm& perms)
10710{
f67539c2
TL
10711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10712 if (!mref_reader.is_state_satisfied())
10713 return -CEPHFS_ENOTCONN;
10714
7c673cae
FG
10715 tout(cct) << "chdir" << std::endl;
10716 tout(cct) << relpath << std::endl;
181888fb 10717
7c673cae
FG
10718 filepath path(relpath);
10719 InodeRef in;
f67539c2
TL
10720
10721 std::scoped_lock lock(client_lock);
7c673cae
FG
10722 int r = path_walk(path, &in, perms);
10723 if (r < 0)
10724 return r;
92f5a8d4
TL
10725
10726 if (!(in.get()->is_dir()))
f67539c2 10727 return -CEPHFS_ENOTDIR;
92f5a8d4 10728
7c673cae
FG
10729 if (cwd != in)
10730 cwd.swap(in);
10731 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10732
b5b8bbf5 10733 _getcwd(new_cwd, perms);
7c673cae
FG
10734 return 0;
10735}
10736
b5b8bbf5 10737void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
10738{
10739 filepath path;
11fdf7f2 10740 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
7c673cae
FG
10741
10742 Inode *in = cwd.get();
b3b6e05e 10743 while (in != root.get()) {
11fdf7f2 10744 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
7c673cae
FG
10745
10746 // A cwd or ancester is unlinked
11fdf7f2 10747 if (in->dentries.empty()) {
7c673cae
FG
10748 return;
10749 }
10750
10751 Dentry *dn = in->get_first_parent();
10752
10753
10754 if (!dn) {
10755 // look it up
11fdf7f2 10756 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
7c673cae
FG
10757 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10758 filepath path(in->ino);
10759 req->set_filepath(path);
10760 req->set_inode(in);
10761 int res = make_request(req, perms);
10762 if (res < 0)
10763 break;
10764
10765 // start over
10766 path = filepath();
10767 in = cwd.get();
10768 continue;
10769 }
10770 path.push_front_dentry(dn->name);
10771 in = dn->dir->parent_inode;
10772 }
10773 dir = "/";
10774 dir += path.get_path();
10775}
10776
b5b8bbf5
FG
10777void Client::getcwd(string& dir, const UserPerm& perms)
10778{
f67539c2
TL
10779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10780 if (!mref_reader.is_state_satisfied())
10781 return;
10782
10783 std::scoped_lock l(client_lock);
10784
10785 _getcwd(dir, perms);
b5b8bbf5
FG
10786}
10787
7c673cae
FG
10788int Client::statfs(const char *path, struct statvfs *stbuf,
10789 const UserPerm& perms)
10790{
f67539c2
TL
10791 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10792 if (!mref_reader.is_state_satisfied())
10793 return -CEPHFS_ENOTCONN;
10794
11fdf7f2 10795 tout(cct) << __func__ << std::endl;
91327a77 10796 unsigned long int total_files_on_fs;
7c673cae
FG
10797
10798 ceph_statfs stats;
10799 C_SaferCond cond;
d2e6a577 10800
f67539c2 10801 std::unique_lock lock(client_lock);
d2e6a577
FG
10802 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10803 if (data_pools.size() == 1) {
10804 objecter->get_fs_stats(stats, data_pools[0], &cond);
10805 } else {
10806 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10807 }
7c673cae 10808
f67539c2 10809 lock.unlock();
7c673cae 10810 int rval = cond.wait();
f67539c2
TL
10811 lock.lock();
10812
91327a77
AA
10813 assert(root);
10814 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
10815
10816 if (rval < 0) {
10817 ldout(cct, 1) << "underlying call to statfs returned error: "
10818 << cpp_strerror(rval)
10819 << dendl;
10820 return rval;
10821 }
10822
10823 memset(stbuf, 0, sizeof(*stbuf));
10824
10825 /*
10826 * we're going to set a block size of 4MB so we can represent larger
10827 * FSes without overflowing. Additionally convert the space
10828 * measurements from KB to bytes while making them in terms of
10829 * blocks. We use 4MB only because it is big enough, and because it
10830 * actually *is* the (ceph) default block size.
10831 */
10832 const int CEPH_BLOCK_SHIFT = 22;
10833 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10834 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77 10835 stbuf->f_files = total_files_on_fs;
f67539c2 10836 stbuf->f_ffree = -1;
7c673cae
FG
10837 stbuf->f_favail = -1;
10838 stbuf->f_fsid = -1; // ??
10839 stbuf->f_flag = 0; // ??
10840 stbuf->f_namemax = NAME_MAX;
10841
10842 // Usually quota_root will == root_ancestor, but if the mount root has no
10843 // quota but we can see a parent of it that does have a quota, we'll
10844 // respect that one instead.
11fdf7f2 10845 ceph_assert(root != nullptr);
b3b6e05e 10846 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
7c673cae
FG
10847
10848 // get_quota_root should always give us something
10849 // because client quotas are always enabled
11fdf7f2 10850 ceph_assert(quota_root != nullptr);
7c673cae
FG
10851
10852 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10853
10854 // Skip the getattr if any sessions are stale, as we don't want to
10855 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10856 // is unhealthy.
10857 if (!_any_stale_sessions()) {
10858 int r = _getattr(quota_root, 0, perms, true);
10859 if (r != 0) {
10860 // Ignore return value: error getting latest inode metadata is not a good
10861 // reason to break "df".
10862 lderr(cct) << "Error in getattr on quota root 0x"
10863 << std::hex << quota_root->ino << std::dec
10864 << " statfs result may be outdated" << dendl;
10865 }
10866 }
10867
10868 // Special case: if there is a size quota set on the Inode acting
10869 // as the root for this client mount, then report the quota status
10870 // as the filesystem statistics.
10871 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10872 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10873 // It is possible for a quota to be exceeded: arithmetic here must
10874 // handle case where used > total.
10875 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10876
10877 stbuf->f_blocks = total;
10878 stbuf->f_bfree = free;
10879 stbuf->f_bavail = free;
10880 } else {
d2e6a577 10881 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10882 // multiple pools may be used without one filesystem namespace via
10883 // layouts, this is the most correct thing we can do.
10884 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10885 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10886 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10887 }
10888
10889 return rval;
10890}
10891
10892int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10893 struct flock *fl, uint64_t owner, bool removing)
10894{
11fdf7f2 10895 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10896 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10897 << " type " << fl->l_type << " owner " << owner
10898 << " " << fl->l_start << "~" << fl->l_len << dendl;
10899
f6b5b4d7 10900 if (in->flags & I_ERROR_FILELOCK)
f67539c2 10901 return -CEPHFS_EIO;
f6b5b4d7 10902
7c673cae
FG
10903 int lock_cmd;
10904 if (F_RDLCK == fl->l_type)
10905 lock_cmd = CEPH_LOCK_SHARED;
10906 else if (F_WRLCK == fl->l_type)
10907 lock_cmd = CEPH_LOCK_EXCL;
10908 else if (F_UNLCK == fl->l_type)
10909 lock_cmd = CEPH_LOCK_UNLOCK;
10910 else
f67539c2 10911 return -CEPHFS_EIO;
7c673cae
FG
10912
10913 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10914 sleep = 0;
10915
10916 /*
10917 * Set the most significant bit, so that MDS knows the 'owner'
10918 * is sufficient to identify the owner of lock. (old code uses
10919 * both 'owner' and 'pid')
10920 */
10921 owner |= (1ULL << 63);
10922
10923 MetaRequest *req = new MetaRequest(op);
10924 filepath path;
10925 in->make_nosnap_relative_path(path);
10926 req->set_filepath(path);
10927 req->set_inode(in);
10928
10929 req->head.args.filelock_change.rule = lock_type;
10930 req->head.args.filelock_change.type = lock_cmd;
10931 req->head.args.filelock_change.owner = owner;
10932 req->head.args.filelock_change.pid = fl->l_pid;
10933 req->head.args.filelock_change.start = fl->l_start;
10934 req->head.args.filelock_change.length = fl->l_len;
10935 req->head.args.filelock_change.wait = sleep;
10936
10937 int ret;
10938 bufferlist bl;
10939
10940 if (sleep && switch_interrupt_cb) {
10941 // enable interrupt
10942 switch_interrupt_cb(callback_handle, req->get());
10943 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10944 // disable interrupt
10945 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10946 if (ret == 0 && req->aborted()) {
10947 // effect of this lock request has been revoked by the 'lock intr' request
10948 ret = req->get_abort_code();
10949 }
7c673cae
FG
10950 put_request(req);
10951 } else {
10952 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10953 }
10954
10955 if (ret == 0) {
10956 if (op == CEPH_MDS_OP_GETFILELOCK) {
10957 ceph_filelock filelock;
11fdf7f2
TL
10958 auto p = bl.cbegin();
10959 decode(filelock, p);
7c673cae
FG
10960
10961 if (CEPH_LOCK_SHARED == filelock.type)
10962 fl->l_type = F_RDLCK;
10963 else if (CEPH_LOCK_EXCL == filelock.type)
10964 fl->l_type = F_WRLCK;
10965 else
10966 fl->l_type = F_UNLCK;
10967
10968 fl->l_whence = SEEK_SET;
10969 fl->l_start = filelock.start;
10970 fl->l_len = filelock.length;
10971 fl->l_pid = filelock.pid;
10972 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10973 ceph_lock_state_t *lock_state;
10974 if (lock_type == CEPH_LOCK_FCNTL) {
10975 if (!in->fcntl_locks)
11fdf7f2
TL
10976 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10977 lock_state = in->fcntl_locks.get();
7c673cae
FG
10978 } else if (lock_type == CEPH_LOCK_FLOCK) {
10979 if (!in->flock_locks)
11fdf7f2
TL
10980 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10981 lock_state = in->flock_locks.get();
7c673cae
FG
10982 } else {
10983 ceph_abort();
f67539c2 10984 return -CEPHFS_EINVAL;
7c673cae
FG
10985 }
10986 _update_lock_state(fl, owner, lock_state);
10987
10988 if (!removing) {
10989 if (lock_type == CEPH_LOCK_FCNTL) {
10990 if (!fh->fcntl_locks)
11fdf7f2
TL
10991 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10992 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10993 } else {
10994 if (!fh->flock_locks)
11fdf7f2
TL
10995 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10996 lock_state = fh->flock_locks.get();
7c673cae
FG
10997 }
10998 _update_lock_state(fl, owner, lock_state);
10999 }
11000 } else
11001 ceph_abort();
11002 }
11003 return ret;
11004}
11005
11006int Client::_interrupt_filelock(MetaRequest *req)
11007{
31f18b77
FG
11008 // Set abort code, but do not kick. The abort code prevents the request
11009 // from being re-sent.
f67539c2 11010 req->abort(-CEPHFS_EINTR);
31f18b77
FG
11011 if (req->mds < 0)
11012 return 0; // haven't sent the request
11013
7c673cae
FG
11014 Inode *in = req->inode();
11015
11016 int lock_type;
11017 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11018 lock_type = CEPH_LOCK_FLOCK_INTR;
11019 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11020 lock_type = CEPH_LOCK_FCNTL_INTR;
11021 else {
11022 ceph_abort();
f67539c2 11023 return -CEPHFS_EINVAL;
7c673cae
FG
11024 }
11025
11026 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11027 filepath path;
11028 in->make_nosnap_relative_path(path);
11029 intr_req->set_filepath(path);
11030 intr_req->set_inode(in);
11031 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11032 intr_req->head.args.filelock_change.rule = lock_type;
11033 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11034
11035 UserPerm perms(req->get_uid(), req->get_gid());
11036 return make_request(intr_req, perms, NULL, NULL, -1);
11037}
11038
11039void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11040{
11041 if (!in->fcntl_locks && !in->flock_locks)
11042 return;
11043
11044 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 11045 encode(nr_fcntl_locks, bl);
7c673cae 11046 if (nr_fcntl_locks) {
11fdf7f2 11047 auto &lock_state = in->fcntl_locks;
7c673cae
FG
11048 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
11049 p != lock_state->held_locks.end();
11050 ++p)
11fdf7f2 11051 encode(p->second, bl);
7c673cae
FG
11052 }
11053
11054 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 11055 encode(nr_flock_locks, bl);
7c673cae 11056 if (nr_flock_locks) {
11fdf7f2 11057 auto &lock_state = in->flock_locks;
7c673cae
FG
11058 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
11059 p != lock_state->held_locks.end();
11060 ++p)
11fdf7f2 11061 encode(p->second, bl);
7c673cae
FG
11062 }
11063
11fdf7f2 11064 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
11065 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11066}
11067
11068void Client::_release_filelocks(Fh *fh)
11069{
11070 if (!fh->fcntl_locks && !fh->flock_locks)
11071 return;
11072
11073 Inode *in = fh->inode.get();
11fdf7f2 11074 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae 11075
f6b5b4d7
TL
11076 list<ceph_filelock> activated_locks;
11077
7c673cae
FG
11078 list<pair<int, ceph_filelock> > to_release;
11079
11080 if (fh->fcntl_locks) {
11fdf7f2 11081 auto &lock_state = fh->fcntl_locks;
f6b5b4d7
TL
11082 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11083 auto q = p++;
11084 if (in->flags & I_ERROR_FILELOCK) {
11085 lock_state->remove_lock(q->second, activated_locks);
11086 } else {
11087 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11088 }
11089 }
11fdf7f2 11090 lock_state.reset();
7c673cae
FG
11091 }
11092 if (fh->flock_locks) {
11fdf7f2 11093 auto &lock_state = fh->flock_locks;
f6b5b4d7
TL
11094 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11095 auto q = p++;
11096 if (in->flags & I_ERROR_FILELOCK) {
11097 lock_state->remove_lock(q->second, activated_locks);
11098 } else {
11099 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11100 }
11101 }
11fdf7f2 11102 lock_state.reset();
7c673cae
FG
11103 }
11104
f6b5b4d7
TL
11105 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11106 in->flags &= ~I_ERROR_FILELOCK;
7c673cae 11107
f6b5b4d7 11108 if (to_release.empty())
11fdf7f2
TL
11109 return;
11110
7c673cae
FG
11111 struct flock fl;
11112 memset(&fl, 0, sizeof(fl));
11113 fl.l_whence = SEEK_SET;
11114 fl.l_type = F_UNLCK;
11115
11116 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11117 p != to_release.end();
11118 ++p) {
11119 fl.l_start = p->second.start;
11120 fl.l_len = p->second.length;
11121 fl.l_pid = p->second.pid;
11122 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11123 p->second.owner, true);
11124 }
11125}
11126
11127void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11128 ceph_lock_state_t *lock_state)
11129{
11130 int lock_cmd;
11131 if (F_RDLCK == fl->l_type)
11132 lock_cmd = CEPH_LOCK_SHARED;
11133 else if (F_WRLCK == fl->l_type)
11134 lock_cmd = CEPH_LOCK_EXCL;
11135 else
11136 lock_cmd = CEPH_LOCK_UNLOCK;;
11137
11138 ceph_filelock filelock;
11139 filelock.start = fl->l_start;
11140 filelock.length = fl->l_len;
11141 filelock.client = 0;
11142 // see comment in _do_filelock()
11143 filelock.owner = owner | (1ULL << 63);
11144 filelock.pid = fl->l_pid;
11145 filelock.type = lock_cmd;
11146
11147 if (filelock.type == CEPH_LOCK_UNLOCK) {
11148 list<ceph_filelock> activated_locks;
11149 lock_state->remove_lock(filelock, activated_locks);
11150 } else {
11151 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 11152 ceph_assert(r);
7c673cae
FG
11153 }
11154}
11155
11156int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11157{
11158 Inode *in = fh->inode.get();
11159 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11160 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11161 return ret;
11162}
11163
11164int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11165{
11166 Inode *in = fh->inode.get();
11167 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11168 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11169 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11170 return ret;
11171}
11172
11173int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11174{
11175 Inode *in = fh->inode.get();
11176 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11177
11178 int sleep = !(cmd & LOCK_NB);
11179 cmd &= ~LOCK_NB;
11180
11181 int type;
11182 switch (cmd) {
11183 case LOCK_SH:
11184 type = F_RDLCK;
11185 break;
11186 case LOCK_EX:
11187 type = F_WRLCK;
11188 break;
11189 case LOCK_UN:
11190 type = F_UNLCK;
11191 break;
11192 default:
f67539c2 11193 return -CEPHFS_EINVAL;
7c673cae
FG
11194 }
11195
11196 struct flock fl;
11197 memset(&fl, 0, sizeof(fl));
11198 fl.l_type = type;
11199 fl.l_whence = SEEK_SET;
11200
11201 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11202 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11203 return ret;
11204}
11205
f67539c2
TL
11206int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11207 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11208 if (!mref_reader.is_state_satisfied()) {
11209 return -CEPHFS_ENOTCONN;
11210 }
11211
11212 std::unique_lock locker(client_lock);
11213 InodeRef in;
11214 int r = Client::path_walk(path, &in, perms, true);
11215 if (r < 0) {
11216 return r;
11217 }
11218
11219 if (in->snapid == CEPH_NOSNAP) {
11220 return -CEPHFS_EINVAL;
11221 }
11222
11223 snap_info->id = in->snapid;
11224 snap_info->metadata = in->snap_metadata;
11225 return 0;
11226}
11227
11228int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11229{
11230 /* Since the only thing this does is wrap a call to statfs, and
11231 statfs takes a lock, it doesn't seem we have a need to split it
11232 out. */
7c673cae
FG
11233 return statfs(0, stbuf, perms);
11234}
11235
e306af50 11236void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
7c673cae
FG
11237{
11238 if (!args)
11239 return;
f67539c2 11240 std::scoped_lock l(client_lock);
11fdf7f2 11241 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
11242 << " invalidate_ino_cb " << args->ino_cb
11243 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
11244 << " switch_interrupt_cb " << args->switch_intr_cb
11245 << " remount_cb " << args->remount_cb
11246 << dendl;
11247 callback_handle = args->handle;
11248 if (args->ino_cb) {
11249 ino_invalidate_cb = args->ino_cb;
11250 async_ino_invalidator.start();
11251 }
11252 if (args->dentry_cb) {
11253 dentry_invalidate_cb = args->dentry_cb;
11254 async_dentry_invalidator.start();
11255 }
11256 if (args->switch_intr_cb) {
11257 switch_interrupt_cb = args->switch_intr_cb;
11258 interrupt_finisher.start();
11259 }
11260 if (args->remount_cb) {
11261 remount_cb = args->remount_cb;
11262 remount_finisher.start();
11263 }
e306af50
TL
11264 if (args->ino_release_cb) {
11265 ino_release_cb = args->ino_release_cb;
11266 async_ino_releasor.start();
11267 }
11268 if (args->umask_cb)
11269 umask_cb = args->umask_cb;
7c673cae
FG
11270}
11271
11272int Client::test_dentry_handling(bool can_invalidate)
11273{
11274 int r = 0;
11275
f67539c2
TL
11276 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11277 if (!iref_reader.is_state_satisfied())
11278 return -CEPHFS_ENOTCONN;
11279
7c673cae
FG
11280 can_invalidate_dentries = can_invalidate;
11281
11282 if (can_invalidate_dentries) {
11fdf7f2 11283 ceph_assert(dentry_invalidate_cb);
7c673cae 11284 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 11285 r = 0;
11fdf7f2
TL
11286 } else {
11287 ceph_assert(remount_cb);
7c673cae 11288 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 11289 r = _do_remount(false);
b32b8144 11290 }
11fdf7f2 11291
7c673cae
FG
11292 return r;
11293}
11294
11295int Client::_sync_fs()
11296{
f67539c2
TL
11297 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11298
11fdf7f2 11299 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
11300
11301 // flush file data
11fdf7f2
TL
11302 std::unique_ptr<C_SaferCond> cond = nullptr;
11303 if (cct->_conf->client_oc) {
11304 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11305 objectcacher->flush_all(cond.get());
11306 }
7c673cae
FG
11307
11308 // flush caps
11309 flush_caps_sync();
11310 ceph_tid_t flush_tid = last_flush_tid;
11311
11312 // wait for unsafe mds requests
11313 wait_unsafe_requests();
11314
11315 wait_sync_caps(flush_tid);
11316
11fdf7f2 11317 if (nullptr != cond) {
9f95a23c 11318 client_lock.unlock();
11fdf7f2
TL
11319 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11320 cond->wait();
11321 ldout(cct, 15) << __func__ << " flush finished" << dendl;
9f95a23c 11322 client_lock.lock();
7c673cae
FG
11323 }
11324
11325 return 0;
11326}
11327
11328int Client::sync_fs()
11329{
f67539c2
TL
11330 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11331 if (!mref_reader.is_state_satisfied())
11332 return -CEPHFS_ENOTCONN;
181888fb 11333
f67539c2 11334 std::scoped_lock l(client_lock);
181888fb 11335
7c673cae
FG
11336 return _sync_fs();
11337}
11338
11339int64_t Client::drop_caches()
11340{
f67539c2 11341 std::scoped_lock l(client_lock);
7c673cae
FG
11342 return objectcacher->release_all();
11343}
11344
11fdf7f2
TL
11345int Client::_lazyio(Fh *fh, int enable)
11346{
11347 Inode *in = fh->inode.get();
11348 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11349
11350 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11351 return 0;
11352
11353 int orig_mode = fh->mode;
11354 if (enable) {
11355 fh->mode |= CEPH_FILE_MODE_LAZY;
11356 in->get_open_ref(fh->mode);
11357 in->put_open_ref(orig_mode);
11358 check_caps(in, CHECK_CAPS_NODELAY);
11359 } else {
11360 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11361 in->get_open_ref(fh->mode);
11362 in->put_open_ref(orig_mode);
11363 check_caps(in, 0);
11364 }
11365
11366 return 0;
11367}
11368
11369int Client::lazyio(int fd, int enable)
11370{
f67539c2 11371 std::scoped_lock l(client_lock);
11fdf7f2
TL
11372 Fh *f = get_filehandle(fd);
11373 if (!f)
f67539c2 11374 return -CEPHFS_EBADF;
11fdf7f2
TL
11375
11376 return _lazyio(f, enable);
11377}
11378
11379int Client::ll_lazyio(Fh *fh, int enable)
11380{
11fdf7f2
TL
11381 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11382 tout(cct) << __func__ << std::endl;
11383
f67539c2 11384 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11385 return _lazyio(fh, enable);
11386}
7c673cae 11387
92f5a8d4 11388int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 11389{
f67539c2 11390 std::scoped_lock l(client_lock);
92f5a8d4 11391 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
11392 << ", " << offset << ", " << count << ")" << dendl;
11393
11394 Fh *f = get_filehandle(fd);
11395 if (!f)
f67539c2 11396 return -CEPHFS_EBADF;
7c673cae
FG
11397
11398 // for now
11399 _fsync(f, true);
11400
11401 return 0;
11402}
11403
11404int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11405{
f67539c2 11406 std::scoped_lock l(client_lock);
7c673cae
FG
11407 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11408 << ", " << offset << ", " << count << ")" << dendl;
11409
11410 Fh *f = get_filehandle(fd);
11411 if (!f)
f67539c2 11412 return -CEPHFS_EBADF;
7c673cae
FG
11413 Inode *in = f->inode.get();
11414
11415 _fsync(f, true);
92f5a8d4
TL
11416 if (_release(in)) {
11417 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11418 if (r < 0)
11419 return r;
11420 }
7c673cae
FG
11421 return 0;
11422}
11423
11424
11425// =============================
11426// snaps
11427
f67539c2
TL
11428int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11429 mode_t mode, const std::map<std::string, std::string> &metadata)
7c673cae 11430{
f67539c2
TL
11431 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11432 if (!mref_reader.is_state_satisfied())
11433 return -CEPHFS_ENOTCONN;
181888fb 11434
f67539c2 11435 std::scoped_lock l(client_lock);
181888fb 11436
7c673cae
FG
11437 filepath path(relpath);
11438 InodeRef in;
11439 int r = path_walk(path, &in, perm);
11440 if (r < 0)
11441 return r;
11442 if (cct->_conf->client_permissions) {
11443 r = may_create(in.get(), perm);
11444 if (r < 0)
11445 return r;
11446 }
11447 Inode *snapdir = open_snapdir(in.get());
f67539c2 11448 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
7c673cae 11449}
181888fb 11450
f67539c2 11451int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
7c673cae 11452{
f67539c2
TL
11453 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11454 if (!mref_reader.is_state_satisfied())
11455 return -CEPHFS_ENOTCONN;
181888fb 11456
f67539c2 11457 std::scoped_lock l(client_lock);
181888fb 11458
7c673cae
FG
11459 filepath path(relpath);
11460 InodeRef in;
11461 int r = path_walk(path, &in, perms);
11462 if (r < 0)
11463 return r;
f67539c2 11464 Inode *snapdir = open_snapdir(in.get());
7c673cae 11465 if (cct->_conf->client_permissions) {
f67539c2 11466 r = may_delete(snapdir, check_perms ? name : NULL, perms);
7c673cae
FG
11467 if (r < 0)
11468 return r;
11469 }
7c673cae
FG
11470 return _rmdir(snapdir, name, perms);
11471}
11472
11473// =============================
11474// expose caps
11475
f67539c2
TL
11476int Client::get_caps_issued(int fd)
11477{
11478 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11479 if (!mref_reader.is_state_satisfied())
11480 return -CEPHFS_ENOTCONN;
7c673cae 11481
f67539c2 11482 std::scoped_lock lock(client_lock);
181888fb 11483
7c673cae
FG
11484 Fh *f = get_filehandle(fd);
11485 if (!f)
f67539c2 11486 return -CEPHFS_EBADF;
7c673cae
FG
11487
11488 return f->inode->caps_issued();
11489}
11490
11491int Client::get_caps_issued(const char *path, const UserPerm& perms)
11492{
f67539c2
TL
11493 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11494 if (!mref_reader.is_state_satisfied())
11495 return -CEPHFS_ENOTCONN;
181888fb 11496
f67539c2 11497 std::scoped_lock lock(client_lock);
181888fb 11498
7c673cae
FG
11499 filepath p(path);
11500 InodeRef in;
11501 int r = path_walk(p, &in, perms, true);
11502 if (r < 0)
11503 return r;
11504 return in->caps_issued();
11505}
11506
11507// =========================================
11508// low level
11509
11510Inode *Client::open_snapdir(Inode *diri)
11511{
11512 Inode *in;
11513 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11514 if (!inode_map.count(vino)) {
11515 in = new Inode(this, vino, &diri->layout);
11516
11517 in->ino = diri->ino;
11518 in->snapid = CEPH_SNAPDIR;
11519 in->mode = diri->mode;
11520 in->uid = diri->uid;
11521 in->gid = diri->gid;
494da23a 11522 in->nlink = 1;
7c673cae
FG
11523 in->mtime = diri->mtime;
11524 in->ctime = diri->ctime;
11525 in->btime = diri->btime;
f6b5b4d7 11526 in->atime = diri->atime;
7c673cae
FG
11527 in->size = diri->size;
11528 in->change_attr = diri->change_attr;
11529
11530 in->dirfragtree.clear();
11531 in->snapdir_parent = diri;
11532 diri->flags |= I_SNAPDIR_OPEN;
11533 inode_map[vino] = in;
11534 if (use_faked_inos())
11535 _assign_faked_ino(in);
11536 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11537 } else {
11538 in = inode_map[vino];
11539 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11540 }
11541 return in;
11542}
11543
11544int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11545 Inode **out, const UserPerm& perms)
11546{
f67539c2
TL
11547 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11548 if (!mref_reader.is_state_satisfied())
11549 return -CEPHFS_ENOTCONN;
11550
31f18b77 11551 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
11552 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11553 tout(cct) << __func__ << std::endl;
7c673cae
FG
11554 tout(cct) << name << std::endl;
11555
f67539c2 11556 std::scoped_lock lock(client_lock);
181888fb 11557
7c673cae 11558 int r = 0;
11fdf7f2
TL
11559 if (!fuse_default_permissions) {
11560 if (strcmp(name, ".") && strcmp(name, "..")) {
11561 r = may_lookup(parent, perms);
11562 if (r < 0)
11563 return r;
11564 }
7c673cae
FG
11565 }
11566
11567 string dname(name);
11568 InodeRef in;
11569
11570 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11571 if (r < 0) {
11572 attr->st_ino = 0;
11573 goto out;
11574 }
11575
11fdf7f2 11576 ceph_assert(in);
7c673cae
FG
11577 fill_stat(in, attr);
11578 _ll_get(in.get());
11579
11580 out:
11fdf7f2 11581 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11582 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11583 tout(cct) << attr->st_ino << std::endl;
11584 *out = in.get();
11585 return r;
11586}
11587
f67539c2
TL
11588int Client::ll_lookup_vino(
11589 vinodeno_t vino,
1adf2230
AA
11590 const UserPerm& perms,
11591 Inode **inode)
11592{
81eedcae 11593 ceph_assert(inode != NULL);
f67539c2
TL
11594 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11595 if (!mref_reader.is_state_satisfied())
11596 return -CEPHFS_ENOTCONN;
81eedcae 11597
b3b6e05e
TL
11598 if (is_reserved_vino(vino))
11599 return -CEPHFS_ESTALE;
11600
f67539c2
TL
11601 std::scoped_lock lock(client_lock);
11602 ldout(cct, 3) << __func__ << " " << vino << dendl;
1adf2230 11603
f67539c2
TL
11604 // Check the cache first
11605 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11606 if (p != inode_map.end()) {
11607 *inode = p->second;
11608 _ll_get(*inode);
81eedcae
TL
11609 return 0;
11610 }
11611
f67539c2 11612 uint64_t snapid = vino.snapid;
81eedcae 11613
f67539c2
TL
11614 // for snapdir, find the non-snapped dir inode
11615 if (snapid == CEPH_SNAPDIR)
11616 vino.snapid = CEPH_NOSNAP;
11617
11618 int r = _lookup_vino(vino, perms, inode);
11619 if (r)
1adf2230 11620 return r;
f67539c2 11621 ceph_assert(*inode != NULL);
81eedcae 11622
f67539c2
TL
11623 if (snapid == CEPH_SNAPDIR) {
11624 Inode *tmp = *inode;
1adf2230 11625
f67539c2
TL
11626 // open the snapdir and put the inode ref
11627 *inode = open_snapdir(tmp);
11628 _ll_forget(tmp, 1);
11629 _ll_get(*inode);
1adf2230 11630 }
1adf2230
AA
11631 return 0;
11632}
11633
f67539c2
TL
11634int Client::ll_lookup_inode(
11635 struct inodeno_t ino,
11636 const UserPerm& perms,
11637 Inode **inode)
11638{
11639 vinodeno_t vino(ino, CEPH_NOSNAP);
11640 return ll_lookup_vino(vino, perms, inode);
11641}
11642
7c673cae
FG
11643int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11644 struct ceph_statx *stx, unsigned want, unsigned flags,
11645 const UserPerm& perms)
11646{
f67539c2
TL
11647 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11648 if (!mref_reader.is_state_satisfied())
11649 return -CEPHFS_ENOTCONN;
11650
31f18b77 11651 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 11652 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
11653 tout(cct) << "ll_lookupx" << std::endl;
11654 tout(cct) << name << std::endl;
11655
f67539c2 11656 std::scoped_lock lock(client_lock);
181888fb 11657
7c673cae 11658 int r = 0;
11fdf7f2 11659 if (!fuse_default_permissions) {
7c673cae
FG
11660 r = may_lookup(parent, perms);
11661 if (r < 0)
11662 return r;
11663 }
11664
11665 string dname(name);
11666 InodeRef in;
11667
11668 unsigned mask = statx_to_mask(flags, want);
11669 r = _lookup(parent, dname, mask, &in, perms);
11670 if (r < 0) {
11671 stx->stx_ino = 0;
11672 stx->stx_mask = 0;
11673 } else {
11fdf7f2 11674 ceph_assert(in);
7c673cae
FG
11675 fill_statx(in, mask, stx);
11676 _ll_get(in.get());
11677 }
11678
11fdf7f2 11679 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11680 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11681 tout(cct) << stx->stx_ino << std::endl;
11682 *out = in.get();
11683 return r;
11684}
11685
11686int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11687 unsigned int want, unsigned int flags, const UserPerm& perms)
11688{
f67539c2
TL
11689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11690 if (!mref_reader.is_state_satisfied())
11691 return -CEPHFS_ENOTCONN;
181888fb 11692
7c673cae
FG
11693 filepath fp(name, 0);
11694 InodeRef in;
11695 int rc;
11696 unsigned mask = statx_to_mask(flags, want);
11697
11fdf7f2
TL
11698 ldout(cct, 3) << __func__ << " " << name << dendl;
11699 tout(cct) << __func__ << std::endl;
7c673cae
FG
11700 tout(cct) << name << std::endl;
11701
f67539c2 11702 std::scoped_lock lock(client_lock);
7c673cae
FG
11703 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11704 if (rc < 0) {
11705 /* zero out mask, just in case... */
11706 stx->stx_mask = 0;
11707 stx->stx_ino = 0;
11708 *out = NULL;
11709 return rc;
11710 } else {
11fdf7f2 11711 ceph_assert(in);
7c673cae
FG
11712 fill_statx(in, mask, stx);
11713 _ll_get(in.get());
11714 *out = in.get();
11715 return 0;
11716 }
11717}
11718
11719void Client::_ll_get(Inode *in)
11720{
11721 if (in->ll_ref == 0) {
b3b6e05e 11722 in->iget();
11fdf7f2
TL
11723 if (in->is_dir() && !in->dentries.empty()) {
11724 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
11725 in->get_first_parent()->get(); // pin dentry
11726 }
11fdf7f2
TL
11727 if (in->snapid != CEPH_NOSNAP)
11728 ll_snap_ref[in->snapid]++;
7c673cae
FG
11729 }
11730 in->ll_get();
11fdf7f2 11731 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
11732}
11733
494da23a 11734int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
11735{
11736 in->ll_put(num);
11fdf7f2 11737 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 11738 if (in->ll_ref == 0) {
11fdf7f2
TL
11739 if (in->is_dir() && !in->dentries.empty()) {
11740 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
11741 in->get_first_parent()->put(); // unpin dentry
11742 }
11fdf7f2
TL
11743 if (in->snapid != CEPH_NOSNAP) {
11744 auto p = ll_snap_ref.find(in->snapid);
11745 ceph_assert(p != ll_snap_ref.end());
11746 ceph_assert(p->second > 0);
11747 if (--p->second == 0)
11748 ll_snap_ref.erase(p);
11749 }
7c673cae
FG
11750 put_inode(in);
11751 return 0;
11752 } else {
11753 return in->ll_ref;
11754 }
11755}
11756
11757void Client::_ll_drop_pins()
11758{
11fdf7f2 11759 ldout(cct, 10) << __func__ << dendl;
1adf2230 11760 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
11761 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11762 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11763 it != inode_map.end();
11764 it = next) {
11765 Inode *in = it->second;
11766 next = it;
11767 ++next;
1adf2230
AA
11768 if (in->ll_ref){
11769 to_be_put.insert(in);
7c673cae 11770 _ll_put(in, in->ll_ref);
1adf2230 11771 }
7c673cae
FG
11772 }
11773}
11774
494da23a 11775bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 11776{
11fdf7f2 11777 inodeno_t ino = in->ino;
7c673cae 11778
11fdf7f2
TL
11779 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11780 tout(cct) << __func__ << std::endl;
7c673cae
FG
11781 tout(cct) << ino.val << std::endl;
11782 tout(cct) << count << std::endl;
11783
181888fb 11784 // Ignore forget if we're no longer mounted
f67539c2
TL
11785 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11786 if (!mref_reader.is_state_satisfied())
181888fb
FG
11787 return true;
11788
7c673cae
FG
11789 if (ino == 1) return true; // ignore forget on root.
11790
11791 bool last = false;
11792 if (in->ll_ref < count) {
11793 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11794 << ", which only has ll_ref=" << in->ll_ref << dendl;
11795 _ll_put(in, in->ll_ref);
11796 last = true;
11797 } else {
11798 if (_ll_put(in, count) == 0)
11799 last = true;
11800 }
11801
11802 return last;
11803}
11804
494da23a 11805bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 11806{
f67539c2 11807 std::scoped_lock lock(client_lock);
1adf2230
AA
11808 return _ll_forget(in, count);
11809}
11810
7c673cae
FG
11811bool Client::ll_put(Inode *in)
11812{
11813 /* ll_forget already takes the lock */
11814 return ll_forget(in, 1);
11815}
11816
11fdf7f2
TL
11817int Client::ll_get_snap_ref(snapid_t snap)
11818{
f67539c2 11819 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11820 auto p = ll_snap_ref.find(snap);
11821 if (p != ll_snap_ref.end())
11822 return p->second;
11823 return 0;
11824}
11825
7c673cae
FG
11826snapid_t Client::ll_get_snapid(Inode *in)
11827{
f67539c2 11828 std::scoped_lock lock(client_lock);
7c673cae
FG
11829 return in->snapid;
11830}
11831
11832Inode *Client::ll_get_inode(ino_t ino)
11833{
f67539c2
TL
11834 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11835 if (!mref_reader.is_state_satisfied())
181888fb
FG
11836 return NULL;
11837
f67539c2
TL
11838 std::scoped_lock lock(client_lock);
11839
7c673cae
FG
11840 vinodeno_t vino = _map_faked_ino(ino);
11841 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11842 if (p == inode_map.end())
11843 return NULL;
11844 Inode *in = p->second;
11845 _ll_get(in);
11846 return in;
11847}
11848
11849Inode *Client::ll_get_inode(vinodeno_t vino)
11850{
f67539c2
TL
11851 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11852 if (!mref_reader.is_state_satisfied())
181888fb
FG
11853 return NULL;
11854
b3b6e05e
TL
11855 if (is_reserved_vino(vino))
11856 return NULL;
11857
f67539c2
TL
11858 std::scoped_lock lock(client_lock);
11859
7c673cae
FG
11860 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11861 if (p == inode_map.end())
11862 return NULL;
11863 Inode *in = p->second;
11864 _ll_get(in);
11865 return in;
11866}
11867
11868int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11869{
11870 vinodeno_t vino = _get_vino(in);
11871
11fdf7f2
TL
11872 ldout(cct, 8) << __func__ << " " << vino << dendl;
11873 tout(cct) << __func__ << std::endl;
7c673cae
FG
11874 tout(cct) << vino.ino.val << std::endl;
11875
11876 if (vino.snapid < CEPH_NOSNAP)
11877 return 0;
11878 else
11879 return _getattr(in, caps, perms);
11880}
11881
11882int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11883{
f67539c2
TL
11884 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11885 if (!mref_reader.is_state_satisfied())
11886 return -CEPHFS_ENOTCONN;
7c673cae 11887
f67539c2 11888 std::scoped_lock lock(client_lock);
181888fb 11889
7c673cae
FG
11890 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11891
11892 if (res == 0)
11893 fill_stat(in, attr);
11fdf7f2 11894 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11895 return res;
11896}
11897
11898int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11899 unsigned int flags, const UserPerm& perms)
11900{
f67539c2
TL
11901 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11902 if (!mref_reader.is_state_satisfied())
11903 return -CEPHFS_ENOTCONN;
7c673cae 11904
f67539c2 11905 std::scoped_lock lock(client_lock);
181888fb 11906
7c673cae
FG
11907 int res = 0;
11908 unsigned mask = statx_to_mask(flags, want);
11909
94b18763 11910 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
11911 res = _ll_getattr(in, mask, perms);
11912
11913 if (res == 0)
11914 fill_statx(in, mask, stx);
11fdf7f2 11915 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11916 return res;
11917}
11918
11919int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11920 const UserPerm& perms, InodeRef *inp)
11921{
11922 vinodeno_t vino = _get_vino(in);
11923
11fdf7f2 11924 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 11925 << dendl;
11fdf7f2 11926 tout(cct) << __func__ << std::endl;
7c673cae
FG
11927 tout(cct) << vino.ino.val << std::endl;
11928 tout(cct) << stx->stx_mode << std::endl;
11929 tout(cct) << stx->stx_uid << std::endl;
11930 tout(cct) << stx->stx_gid << std::endl;
11931 tout(cct) << stx->stx_size << std::endl;
11932 tout(cct) << stx->stx_mtime << std::endl;
11933 tout(cct) << stx->stx_atime << std::endl;
11934 tout(cct) << stx->stx_btime << std::endl;
11935 tout(cct) << mask << std::endl;
11936
11fdf7f2 11937 if (!fuse_default_permissions) {
7c673cae
FG
11938 int res = may_setattr(in, stx, mask, perms);
11939 if (res < 0)
11940 return res;
11941 }
11942
11943 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11944
11945 return __setattrx(in, stx, mask, perms, inp);
11946}
11947
11948int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11949 const UserPerm& perms)
11950{
f67539c2
TL
11951 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11952 if (!mref_reader.is_state_satisfied())
11953 return -CEPHFS_ENOTCONN;
181888fb 11954
f67539c2 11955 std::scoped_lock lock(client_lock);
181888fb 11956
7c673cae
FG
11957 InodeRef target(in);
11958 int res = _ll_setattrx(in, stx, mask, perms, &target);
11959 if (res == 0) {
11fdf7f2 11960 ceph_assert(in == target.get());
7c673cae
FG
11961 fill_statx(in, in->caps_issued(), stx);
11962 }
11963
11fdf7f2 11964 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11965 return res;
11966}
11967
11968int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11969 const UserPerm& perms)
11970{
11971 struct ceph_statx stx;
11972 stat_to_statx(attr, &stx);
11973
f67539c2
TL
11974 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11975 if (!mref_reader.is_state_satisfied())
11976 return -CEPHFS_ENOTCONN;
181888fb 11977
f67539c2 11978 std::scoped_lock lock(client_lock);
181888fb 11979
7c673cae
FG
11980 InodeRef target(in);
11981 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11982 if (res == 0) {
11fdf7f2 11983 ceph_assert(in == target.get());
7c673cae
FG
11984 fill_stat(in, attr);
11985 }
11986
11fdf7f2 11987 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11988 return res;
11989}
11990
11991
11992// ----------
11993// xattrs
11994
11995int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11996 const UserPerm& perms)
11997{
f67539c2
TL
11998 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11999 if (!mref_reader.is_state_satisfied())
12000 return -CEPHFS_ENOTCONN;
181888fb 12001
f67539c2 12002 std::scoped_lock lock(client_lock);
181888fb 12003
7c673cae
FG
12004 InodeRef in;
12005 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12006 if (r < 0)
12007 return r;
12008 return _getxattr(in, name, value, size, perms);
12009}
12010
12011int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12012 const UserPerm& perms)
12013{
f67539c2
TL
12014 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12015 if (!mref_reader.is_state_satisfied())
12016 return -CEPHFS_ENOTCONN;
181888fb 12017
f67539c2 12018 std::scoped_lock lock(client_lock);
181888fb 12019
7c673cae
FG
12020 InodeRef in;
12021 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12022 if (r < 0)
12023 return r;
12024 return _getxattr(in, name, value, size, perms);
12025}
12026
12027int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12028 const UserPerm& perms)
12029{
f67539c2
TL
12030 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12031 if (!mref_reader.is_state_satisfied())
12032 return -CEPHFS_ENOTCONN;
181888fb 12033
f67539c2 12034 std::scoped_lock lock(client_lock);
181888fb 12035
7c673cae
FG
12036 Fh *f = get_filehandle(fd);
12037 if (!f)
f67539c2 12038 return -CEPHFS_EBADF;
7c673cae
FG
12039 return _getxattr(f->inode, name, value, size, perms);
12040}
12041
12042int Client::listxattr(const char *path, char *list, size_t size,
12043 const UserPerm& perms)
12044{
f67539c2
TL
12045 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12046 if (!mref_reader.is_state_satisfied())
12047 return -CEPHFS_ENOTCONN;
181888fb 12048
f67539c2 12049 std::scoped_lock lock(client_lock);
181888fb 12050
7c673cae
FG
12051 InodeRef in;
12052 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12053 if (r < 0)
12054 return r;
12055 return Client::_listxattr(in.get(), list, size, perms);
12056}
12057
12058int Client::llistxattr(const char *path, char *list, size_t size,
12059 const UserPerm& perms)
12060{
f67539c2
TL
12061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12062 if (!mref_reader.is_state_satisfied())
12063 return -CEPHFS_ENOTCONN;
181888fb 12064
f67539c2 12065 std::scoped_lock lock(client_lock);
181888fb 12066
7c673cae
FG
12067 InodeRef in;
12068 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12069 if (r < 0)
12070 return r;
12071 return Client::_listxattr(in.get(), list, size, perms);
12072}
12073
12074int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12075{
f67539c2
TL
12076 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12077 if (!mref_reader.is_state_satisfied())
12078 return -CEPHFS_ENOTCONN;
181888fb 12079
f67539c2 12080 std::scoped_lock lock(client_lock);
181888fb 12081
7c673cae
FG
12082 Fh *f = get_filehandle(fd);
12083 if (!f)
f67539c2 12084 return -CEPHFS_EBADF;
7c673cae
FG
12085 return Client::_listxattr(f->inode.get(), list, size, perms);
12086}
12087
12088int Client::removexattr(const char *path, const char *name,
12089 const UserPerm& perms)
12090{
f67539c2
TL
12091 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12092 if (!mref_reader.is_state_satisfied())
12093 return -CEPHFS_ENOTCONN;
181888fb 12094
f67539c2 12095 std::scoped_lock lock(client_lock);
181888fb 12096
7c673cae
FG
12097 InodeRef in;
12098 int r = Client::path_walk(path, &in, perms, true);
12099 if (r < 0)
12100 return r;
12101 return _removexattr(in, name, perms);
12102}
12103
12104int Client::lremovexattr(const char *path, const char *name,
12105 const UserPerm& perms)
12106{
f67539c2
TL
12107 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12108 if (!mref_reader.is_state_satisfied())
12109 return -CEPHFS_ENOTCONN;
181888fb 12110
f67539c2 12111 std::scoped_lock lock(client_lock);
181888fb 12112
7c673cae
FG
12113 InodeRef in;
12114 int r = Client::path_walk(path, &in, perms, false);
12115 if (r < 0)
12116 return r;
12117 return _removexattr(in, name, perms);
12118}
12119
12120int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12121{
f67539c2
TL
12122 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12123 if (!mref_reader.is_state_satisfied())
12124 return -CEPHFS_ENOTCONN;
181888fb 12125
f67539c2 12126 std::scoped_lock lock(client_lock);
181888fb 12127
7c673cae
FG
12128 Fh *f = get_filehandle(fd);
12129 if (!f)
f67539c2 12130 return -CEPHFS_EBADF;
7c673cae
FG
12131 return _removexattr(f->inode, name, perms);
12132}
12133
12134int Client::setxattr(const char *path, const char *name, const void *value,
12135 size_t size, int flags, const UserPerm& perms)
12136{
f67539c2
TL
12137 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12138 if (!mref_reader.is_state_satisfied())
12139 return -CEPHFS_ENOTCONN;
12140
7c673cae
FG
12141 _setxattr_maybe_wait_for_osdmap(name, value, size);
12142
f67539c2 12143 std::scoped_lock lock(client_lock);
181888fb 12144
7c673cae
FG
12145 InodeRef in;
12146 int r = Client::path_walk(path, &in, perms, true);
12147 if (r < 0)
12148 return r;
12149 return _setxattr(in, name, value, size, flags, perms);
12150}
12151
12152int Client::lsetxattr(const char *path, const char *name, const void *value,
12153 size_t size, int flags, const UserPerm& perms)
12154{
f67539c2
TL
12155 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12156 if (!mref_reader.is_state_satisfied())
12157 return -CEPHFS_ENOTCONN;
7c673cae 12158
f67539c2 12159 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12160
f67539c2 12161 std::scoped_lock lock(client_lock);
181888fb 12162
7c673cae
FG
12163 InodeRef in;
12164 int r = Client::path_walk(path, &in, perms, false);
12165 if (r < 0)
12166 return r;
12167 return _setxattr(in, name, value, size, flags, perms);
12168}
12169
12170int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12171 int flags, const UserPerm& perms)
12172{
f67539c2
TL
12173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12174 if (!mref_reader.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN;
7c673cae 12176
f67539c2 12177 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12178
f67539c2 12179 std::scoped_lock lock(client_lock);
181888fb 12180
7c673cae
FG
12181 Fh *f = get_filehandle(fd);
12182 if (!f)
f67539c2 12183 return -CEPHFS_EBADF;
7c673cae
FG
12184 return _setxattr(f->inode, name, value, size, flags, perms);
12185}
12186
12187int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12188 const UserPerm& perms)
12189{
12190 int r;
12191
12192 const VXattr *vxattr = _match_vxattr(in, name);
12193 if (vxattr) {
f67539c2 12194 r = -CEPHFS_ENODATA;
7c673cae
FG
12195
12196 // Do a force getattr to get the latest quota before returning
12197 // a value to userspace.
28e407b8
AA
12198 int flags = 0;
12199 if (vxattr->flags & VXATTR_RSTAT) {
12200 flags |= CEPH_STAT_RSTAT;
12201 }
adb31ebb
TL
12202 if (vxattr->flags & VXATTR_DIRSTAT) {
12203 flags |= CEPH_CAP_FILE_SHARED;
12204 }
f67539c2 12205 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
7c673cae
FG
12206 if (r != 0) {
12207 // Error from getattr!
12208 return r;
12209 }
12210
12211 // call pointer-to-member function
12212 char buf[256];
12213 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12214 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12215 } else {
f67539c2 12216 r = -CEPHFS_ENODATA;
7c673cae
FG
12217 }
12218
12219 if (size != 0) {
12220 if (r > (int)size) {
f67539c2 12221 r = -CEPHFS_ERANGE;
7c673cae
FG
12222 } else if (r > 0) {
12223 memcpy(value, buf, r);
12224 }
12225 }
12226 goto out;
12227 }
12228
12229 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
f67539c2 12230 r = -CEPHFS_EOPNOTSUPP;
7c673cae
FG
12231 goto out;
12232 }
12233
12234 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12235 if (r == 0) {
12236 string n(name);
f67539c2 12237 r = -CEPHFS_ENODATA;
7c673cae
FG
12238 if (in->xattrs.count(n)) {
12239 r = in->xattrs[n].length();
12240 if (r > 0 && size != 0) {
12241 if (size >= (unsigned)r)
12242 memcpy(value, in->xattrs[n].c_str(), r);
12243 else
f67539c2 12244 r = -CEPHFS_ERANGE;
7c673cae
FG
12245 }
12246 }
12247 }
12248 out:
1adf2230 12249 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
12250 return r;
12251}
12252
12253int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12254 const UserPerm& perms)
12255{
12256 if (cct->_conf->client_permissions) {
12257 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12258 if (r < 0)
12259 return r;
12260 }
12261 return _getxattr(in.get(), name, value, size, perms);
12262}
12263
12264int Client::ll_getxattr(Inode *in, const char *name, void *value,
12265 size_t size, const UserPerm& perms)
12266{
f67539c2
TL
12267 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12268 if (!mref_reader.is_state_satisfied())
12269 return -CEPHFS_ENOTCONN;
181888fb 12270
7c673cae
FG
12271 vinodeno_t vino = _get_vino(in);
12272
11fdf7f2
TL
12273 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12274 tout(cct) << __func__ << std::endl;
7c673cae
FG
12275 tout(cct) << vino.ino.val << std::endl;
12276 tout(cct) << name << std::endl;
12277
f67539c2 12278 std::scoped_lock lock(client_lock);
11fdf7f2 12279 if (!fuse_default_permissions) {
7c673cae
FG
12280 int r = xattr_permission(in, name, MAY_READ, perms);
12281 if (r < 0)
12282 return r;
12283 }
12284
12285 return _getxattr(in, name, value, size, perms);
12286}
12287
12288int Client::_listxattr(Inode *in, char *name, size_t size,
12289 const UserPerm& perms)
12290{
81eedcae 12291 bool len_only = (size == 0);
7c673cae 12292 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
81eedcae
TL
12293 if (r != 0) {
12294 goto out;
12295 }
7c673cae 12296
81eedcae 12297 r = 0;
f67539c2
TL
12298 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12299 if (xattr_name.rfind("ceph.", 0) == 0) {
12300 continue;
12301 }
12302
12303 size_t this_len = xattr_name.length() + 1;
81eedcae
TL
12304 r += this_len;
12305 if (len_only)
12306 continue;
7c673cae 12307
81eedcae 12308 if (this_len > size) {
f67539c2 12309 r = -CEPHFS_ERANGE;
81eedcae
TL
12310 goto out;
12311 }
12312
f67539c2 12313 memcpy(name, xattr_name.c_str(), this_len);
81eedcae
TL
12314 name += this_len;
12315 size -= this_len;
12316 }
81eedcae 12317out:
11fdf7f2 12318 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
12319 return r;
12320}
12321
12322int Client::ll_listxattr(Inode *in, char *names, size_t size,
12323 const UserPerm& perms)
12324{
f67539c2
TL
12325 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12326 if (!mref_reader.is_state_satisfied())
12327 return -CEPHFS_ENOTCONN;
181888fb 12328
7c673cae
FG
12329 vinodeno_t vino = _get_vino(in);
12330
11fdf7f2
TL
12331 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12332 tout(cct) << __func__ << std::endl;
7c673cae
FG
12333 tout(cct) << vino.ino.val << std::endl;
12334 tout(cct) << size << std::endl;
12335
f67539c2 12336 std::scoped_lock lock(client_lock);
7c673cae
FG
12337 return _listxattr(in, names, size, perms);
12338}
12339
12340int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12341 size_t size, int flags, const UserPerm& perms)
12342{
12343
12344 int xattr_flags = 0;
12345 if (!value)
12346 xattr_flags |= CEPH_XATTR_REMOVE;
12347 if (flags & XATTR_CREATE)
12348 xattr_flags |= CEPH_XATTR_CREATE;
12349 if (flags & XATTR_REPLACE)
12350 xattr_flags |= CEPH_XATTR_REPLACE;
12351
12352 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12353 filepath path;
12354 in->make_nosnap_relative_path(path);
12355 req->set_filepath(path);
12356 req->set_string2(name);
12357 req->set_inode(in);
12358 req->head.args.setxattr.flags = xattr_flags;
12359
12360 bufferlist bl;
11fdf7f2 12361 assert (value || size == 0);
7c673cae
FG
12362 bl.append((const char*)value, size);
12363 req->set_data(bl);
12364
12365 int res = make_request(req, perms);
12366
12367 trim_cache();
11fdf7f2 12368 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
12369 res << dendl;
12370 return res;
12371}
12372
12373int Client::_setxattr(Inode *in, const char *name, const void *value,
12374 size_t size, int flags, const UserPerm& perms)
12375{
12376 if (in->snapid != CEPH_NOSNAP) {
f67539c2 12377 return -CEPHFS_EROFS;
7c673cae
FG
12378 }
12379
f6b5b4d7
TL
12380 if (size == 0) {
12381 value = "";
12382 } else if (value == NULL) {
f67539c2 12383 return -CEPHFS_EINVAL;
f6b5b4d7
TL
12384 }
12385
7c673cae
FG
12386 bool posix_acl_xattr = false;
12387 if (acl_type == POSIX_ACL)
12388 posix_acl_xattr = !strncmp(name, "system.", 7);
12389
12390 if (strncmp(name, "user.", 5) &&
12391 strncmp(name, "security.", 9) &&
12392 strncmp(name, "trusted.", 8) &&
12393 strncmp(name, "ceph.", 5) &&
12394 !posix_acl_xattr)
f67539c2 12395 return -CEPHFS_EOPNOTSUPP;
7c673cae 12396
11fdf7f2
TL
12397 bool check_realm = false;
12398
7c673cae
FG
12399 if (posix_acl_xattr) {
12400 if (!strcmp(name, ACL_EA_ACCESS)) {
12401 mode_t new_mode = in->mode;
12402 if (value) {
12403 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12404 if (ret < 0)
12405 return ret;
12406 if (ret == 0) {
12407 value = NULL;
12408 size = 0;
12409 }
12410 if (new_mode != in->mode) {
12411 struct ceph_statx stx;
12412 stx.stx_mode = new_mode;
12413 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12414 if (ret < 0)
12415 return ret;
12416 }
12417 }
12418 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12419 if (value) {
12420 if (!S_ISDIR(in->mode))
f67539c2 12421 return -CEPHFS_EACCES;
7c673cae
FG
12422 int ret = posix_acl_check(value, size);
12423 if (ret < 0)
f67539c2 12424 return -CEPHFS_EINVAL;
7c673cae
FG
12425 if (ret == 0) {
12426 value = NULL;
12427 size = 0;
12428 }
12429 }
12430 } else {
f67539c2 12431 return -CEPHFS_EOPNOTSUPP;
7c673cae
FG
12432 }
12433 } else {
12434 const VXattr *vxattr = _match_vxattr(in, name);
11fdf7f2
TL
12435 if (vxattr) {
12436 if (vxattr->readonly)
f67539c2 12437 return -CEPHFS_EOPNOTSUPP;
11fdf7f2
TL
12438 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12439 check_realm = true;
12440 }
7c673cae
FG
12441 }
12442
11fdf7f2
TL
12443 int ret = _do_setxattr(in, name, value, size, flags, perms);
12444 if (ret >= 0 && check_realm) {
12445 // check if snaprealm was created for quota inode
12446 if (in->quota.is_enable() &&
12447 !(in->snaprealm && in->snaprealm->ino == in->ino))
f67539c2 12448 ret = -CEPHFS_EOPNOTSUPP;
11fdf7f2
TL
12449 }
12450
12451 return ret;
7c673cae
FG
12452}
12453
12454int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12455 size_t size, int flags, const UserPerm& perms)
12456{
12457 if (cct->_conf->client_permissions) {
12458 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12459 if (r < 0)
12460 return r;
12461 }
12462 return _setxattr(in.get(), name, value, size, flags, perms);
12463}
12464
// Validate the data pool named by a "layout" or "layout.pool" xattr value
// against the supplied OSDMap.
//
// @param name   vxattr suffix being set: "layout" (full key=value string)
//               or "layout.pool" (bare pool name/id)
// @param value  the xattr value to inspect
// @param osdmap map used to resolve the pool by id or by name
// @return 0 if no pool is referenced or it exists; -CEPHFS_EINVAL on a
//         malformed layout string; -CEPHFS_ENOENT if the pool is unknown
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout value: parse "key=value ..." pairs and pull out "pool".
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    // Reject trailing garbage the grammar did not consume.
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Try the value as a numeric pool id first ...
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      // ... and fall back to resolving it as a pool name.
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
12504
// If a layout/pool xattr is being set, make sure this client's osdmap is
// recent enough to know the referenced data pool.
//
// Setting the pool of a layout requires an osdmap epoch in the MetaRequest.
// There is a race where a new data pool was just created and neither the
// client nor the MDS has the osdmap containing it yet; by fetching the
// latest osdmap here, the MDS can quickly judge whether it needs a newer
// map. This blocks the caller while waiting for the map.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix: check helper expects
    // "layout" or "layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -CEPHFS_ENOENT) {
      // pool unknown to our current map; block until we have the latest one
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
12527
// Low-level (libcephfs/FUSE) setxattr entry point.
// Requires the client to be at least mounting; may block waiting for a new
// osdmap when a layout pool is being set (done before taking client_lock).
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // must happen before client_lock is held: this can block on the objecter
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; enforce MAY_WRITE here
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
12552
// Remove an xattr by sending a CEPH_MDS_OP_RMXATTR request to the MDS.
// Rejects snapshot inodes, unknown namespaces, and read-only vxattrs.
// Caller must hold client_lock.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    // snapshots are immutable
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  // read-only virtual xattrs cannot be removed
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);   // xattr name travels in filepath2
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
12584
12585int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12586{
12587 if (cct->_conf->client_permissions) {
12588 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12589 if (r < 0)
12590 return r;
12591 }
12592 return _removexattr(in.get(), name, perms);
12593}
12594
// Low-level (libcephfs/FUSE) removexattr entry point.
// Requires the client to be at least mounting.
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; enforce MAY_WRITE here
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
12617
12618bool Client::_vxattrcb_quota_exists(Inode *in)
12619{
11fdf7f2 12620 return in->quota.is_enable() &&
f6b5b4d7
TL
12621 (in->snapid != CEPH_NOSNAP ||
12622 (in->snaprealm && in->snaprealm->ino == in->ino));
7c673cae
FG
12623}
12624size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12625{
12626 return snprintf(val, size,
12627 "max_bytes=%lld max_files=%lld",
12628 (long long int)in->quota.max_bytes,
12629 (long long int)in->quota.max_files);
12630}
12631size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12632{
12633 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12634}
12635size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12636{
12637 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12638}
12639
12640bool Client::_vxattrcb_layout_exists(Inode *in)
12641{
12642 return in->layout != file_layout_t();
12643}
12644size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12645{
12646 int r = snprintf(val, size,
11fdf7f2 12647 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
12648 (unsigned long long)in->layout.stripe_unit,
12649 (unsigned long long)in->layout.stripe_count,
12650 (unsigned long long)in->layout.object_size);
12651 objecter->with_osdmap([&](const OSDMap& o) {
12652 if (o.have_pg_pool(in->layout.pool_id))
12653 r += snprintf(val + r, size - r, "%s",
12654 o.get_pool_name(in->layout.pool_id).c_str());
12655 else
12656 r += snprintf(val + r, size - r, "%" PRIu64,
12657 (uint64_t)in->layout.pool_id);
12658 });
12659 if (in->layout.pool_ns.length())
12660 r += snprintf(val + r, size - r, " pool_namespace=%s",
12661 in->layout.pool_ns.c_str());
12662 return r;
12663}
12664size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12665{
11fdf7f2 12666 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
7c673cae
FG
12667}
12668size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12669{
11fdf7f2 12670 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
7c673cae
FG
12671}
12672size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12673{
11fdf7f2 12674 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
7c673cae
FG
12675}
12676size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12677{
12678 size_t r;
12679 objecter->with_osdmap([&](const OSDMap& o) {
12680 if (o.have_pg_pool(in->layout.pool_id))
12681 r = snprintf(val, size, "%s", o.get_pool_name(
12682 in->layout.pool_id).c_str());
12683 else
12684 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12685 });
12686 return r;
12687}
12688size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12689{
12690 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12691}
12692size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12693{
11fdf7f2 12694 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
7c673cae
FG
12695}
12696size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12697{
11fdf7f2 12698 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
7c673cae
FG
12699}
12700size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12701{
11fdf7f2 12702 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
7c673cae
FG
12703}
12704size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
12705{
11fdf7f2 12706 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
7c673cae
FG
12707}
12708size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
12709{
11fdf7f2 12710 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
7c673cae
FG
12711}
12712size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
12713{
11fdf7f2 12714 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
7c673cae 12715}
f67539c2
TL
12716size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
12717{
12718 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
12719}
7c673cae
FG
12720size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
12721{
11fdf7f2 12722 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
7c673cae
FG
12723}
12724size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
12725{
81eedcae 12726 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
7c673cae
FG
12727 (long)in->rstat.rctime.nsec());
12728}
11fdf7f2
TL
// "ceph.dir.pin" exists only when a pin has been set; -CEPHFS_ENODATA is
// the sentinel for "no pin".
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -CEPHFS_ENODATA;
}
12733size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
12734{
12735 return snprintf(val, size, "%ld", (long)in->dir_pin);
12736}
7c673cae 12737
81eedcae
TL
// "ceph.snap.btime" exists only when the snapshot birth time is non-zero.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}
12742
12743size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
12744{
12745 return snprintf(val, size, "%llu.%09lu",
12746 (long long unsigned)in->snap_btime.sec(),
12747 (long unsigned)in->snap_btime.nsec());
12748}
12749
f67539c2
TL
// "ceph.mirror.info" exists when the backing mirror-info xattrs are set.
bool Client::_vxattrcb_mirror_info_exists(Inode *in)
{
  // checking one of the xattrs would suffice
  return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
}
12755
12756size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
12757{
12758 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
12759 in->xattrs["ceph.mirror.info.cluster_id"].length(),
12760 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
12761 in->xattrs["ceph.mirror.info.fs_id"].length(),
12762 in->xattrs["ceph.mirror.info.fs_id"].c_str());
12763}
12764
adb31ebb
TL
12765size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
12766{
12767 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
12768}
12769
12770size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
12771{
12772 auto name = messenger->get_myname();
12773 return snprintf(val, size, "%s%ld", name.type_str(), name.num());
12774}
12775
7c673cae
FG
// Build vxattr name literals at preprocessing time:
//   CEPH_XATTR_NAME(dir, entries)       -> "ceph.dir.entries"
//   CEPH_XATTR_NAME2(dir, layout, pool) -> "ceph.dir.layout.pool"
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Table entry for a read-only "ceph.<type>.<name>" vxattr backed by the
// matching _vxattrcb_<type>_<name> getter; no exists callback (always shown).
#define XATTR_NAME_CEPH(_type, _name, _flags)			 \
{								 \
  name: CEPH_XATTR_NAME(_type, _name),				 \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	 \
  readonly: true,						 \
  exists_cb: NULL,						 \
  flags: _flags,						 \
}
// Table entry for a writable "ceph.<type>.layout.<field>" vxattr; listed
// only when the inode has a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		 \
{								 \
  name: CEPH_XATTR_NAME2(_type, _name, _field),			 \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	 \
  readonly: false,						 \
  exists_cb: &Client::_vxattrcb_layout_exists,			 \
  flags: 0,							 \
}
// Table entry for a writable "ceph.<type>.<name>" quota vxattr; listed only
// when a quota is enabled on the inode.
#define XATTR_QUOTA_FIELD(_type, _name)				 \
{								 \
  name: CEPH_XATTR_NAME(_type, _name),				 \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	 \
  readonly: false,						 \
  exists_cb: &Client::_vxattrcb_quota_exists,			 \
  flags: 0,							 \
}
12803
// Virtual xattrs exposed on directory inodes: layout, dirstat/rstat
// counters, quota, MDS pin, snapshot birth time and mirror info.
// Scanned linearly by _match_vxattr(); terminated by an empty name.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12858
// Virtual xattrs exposed on regular-file inodes: layout fields and the
// snapshot birth time. Terminated by an empty name.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12881
adb31ebb
TL
// Virtual xattrs available on every inode regardless of type; consulted by
// _match_vxattr() after the type-specific table. Terminated by an empty name.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12899
7c673cae
FG
12900const Client::VXattr *Client::_get_vxattrs(Inode *in)
12901{
12902 if (in->is_dir())
12903 return _dir_vxattrs;
12904 else if (in->is_file())
12905 return _file_vxattrs;
12906 return NULL;
12907}
12908
12909const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12910{
12911 if (strncmp(name, "ceph.", 5) == 0) {
12912 const VXattr *vxattr = _get_vxattrs(in);
12913 if (vxattr) {
12914 while (!vxattr->name.empty()) {
12915 if (vxattr->name == name)
12916 return vxattr;
12917 vxattr++;
12918 }
12919 }
adb31ebb
TL
12920
12921 // for common vxattrs
12922 vxattr = _common_vxattrs;
12923 while (!vxattr->name.empty()) {
12924 if (vxattr->name == name)
12925 return vxattr;
12926 vxattr++;
12927 }
7c673cae 12928 }
adb31ebb 12929
7c673cae
FG
12930 return NULL;
12931}
12932
7c673cae
FG
// Low-level readlink: copy the symlink target into buf (up to buflen).
// Touches the inode's dentries to keep them warm in the LRU cache.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
12954
// Create a device/special file via CEPH_MDS_OP_MKNOD.
//
// @param dir   parent directory inode
// @param name  new entry name (<= NAME_MAX)
// @param mode  file mode; may be adjusted by _posix_acl_create()
// @param rdev  device number
// @param inp   out: the new inode on success
// @return 0 or a negative CEPHFS_* error
// Caller must hold client_lock.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // default ACLs may both adjust 'mode' and produce an xattr payload
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // error before make_request(): drop our ref on the unsent request
  put_request(req);
  return res;
}
13008
// Low-level mknod: create a special file under 'parent' and return the new
// inode (with an ll ref taken) plus its stat in *attr on success.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; check create permission
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13045
// Low-level mknod, statx flavor: like ll_mknod() but fills a ceph_statx
// with the mask derived from want/flags.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; check create permission
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());    // caller owns an ll reference on success
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13086
// Create and (optionally) open a regular file via CEPH_MDS_OP_CREATE.
//
// @param dir        parent directory inode
// @param name       new entry name (<= NAME_MAX)
// @param flags      POSIX open flags (normalized to wire flags internally)
// @param mode       file mode; S_IFREG is OR'd in, ACLs may adjust it
// @param inp        out: the created inode
// @param fhp        if non-NULL, an open Fh for the new file is returned
// @param stripe_*, object_size, data_pool  layout overrides (0/NULL = default)
// @param created    out (via make_request): whether the MDS created it
// @param alternate_name  alternate dentry name (encrypted names support)
// @return 0 or a negative CEPHFS_* error
// Caller must hold client_lock.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve an explicit data pool name to its id before building the request
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE; // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // default ACLs may both adjust 'mode' and produce an xattr payload
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // error before make_request(): drop our ref on the unsent request
  put_request(req);
  return res;
}
13182
7c673cae 13183int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
f67539c2
TL
13184 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13185 std::string alternate_name)
7c673cae 13186{
1adf2230 13187 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13188 << mode << dec << ", uid " << perm.uid()
13189 << ", gid " << perm.gid() << ")" << dendl;
13190
13191 if (strlen(name) > NAME_MAX)
f67539c2 13192 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13193
13194 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13195 return -CEPHFS_EROFS;
7c673cae
FG
13196 }
13197 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13198 return -CEPHFS_EDQUOT;
7c673cae 13199 }
f67539c2
TL
13200
13201 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13202 MetaRequest *req = new MetaRequest(is_snap_op ?
7c673cae
FG
13203 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13204
13205 filepath path;
13206 dir->make_nosnap_relative_path(path);
13207 path.push_dentry(name);
13208 req->set_filepath(path);
13209 req->set_inode(dir);
13210 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13211 req->dentry_unless = CEPH_CAP_FILE_EXCL;
f67539c2 13212 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13213
13214 mode |= S_IFDIR;
f67539c2
TL
13215 bufferlist bl;
13216 int res = _posix_acl_create(dir, &mode, bl, perm);
7c673cae
FG
13217 if (res < 0)
13218 goto fail;
13219 req->head.args.mkdir.mode = mode;
f67539c2
TL
13220 if (is_snap_op) {
13221 SnapPayload payload;
13222 // clear the bufferlist that may have been populated by the call
13223 // to _posix_acl_create(). MDS mksnap does not make use of it.
13224 // So, reuse it to pass metadata payload.
13225 bl.clear();
13226 payload.metadata = metadata;
13227 encode(payload, bl);
13228 }
13229 if (bl.length() > 0) {
13230 req->set_data(bl);
13231 }
7c673cae
FG
13232
13233 Dentry *de;
13234 res = get_or_create(dir, name, &de);
13235 if (res < 0)
13236 goto fail;
13237 req->set_dentry(de);
13238
13239 ldout(cct, 10) << "_mkdir: making request" << dendl;
13240 res = make_request(req, perm, inp);
13241 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13242
13243 trim_cache();
13244
1adf2230 13245 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
13246 return res;
13247
13248 fail:
13249 put_request(req);
13250 return res;
13251}
13252
// Low-level mkdir: create a directory under 'parent' and return the new
// inode (with an ll ref taken) plus its stat in *attr on success.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; check create permission
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13288
// Low-level mkdir, statx flavor: like ll_mkdir() but fills a ceph_statx;
// on error the statx ino/mask are zeroed.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; check create permission
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());    // caller owns an ll reference on success
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13328
// Create a symlink via CEPH_MDS_OP_SYMLINK.
//
// @param dir     parent directory inode
// @param name    new entry name (<= NAME_MAX)
// @param target  symlink target path (sent as string2 in the request)
// @param alternate_name  alternate dentry name (encrypted names support)
// @param inp     out: the new inode
// @return 0 or a negative CEPHFS_* error
// Caller must hold client_lock.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // error before make_request(): drop our ref on the unsent request
  put_request(req);
  return res;
}
13375
// Low-level symlink: create 'name' -> 'value' under 'parent' and return the
// new inode (with an ll ref taken) plus its stat in *attr on success.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    // FUSE is not doing permission checks for us; check create permission
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13412
13413int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13414 Inode **out, struct ceph_statx *stx, unsigned want,
13415 unsigned flags, const UserPerm& perms)
13416{
f67539c2
TL
13417 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13418 if (!mref_reader.is_state_satisfied())
13419 return -CEPHFS_ENOTCONN;
181888fb 13420
7c673cae
FG
13421 vinodeno_t vparent = _get_vino(parent);
13422
13423 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13424 << dendl;
13425 tout(cct) << "ll_symlinkx" << std::endl;
13426 tout(cct) << vparent.ino.val << std::endl;
13427 tout(cct) << name << std::endl;
13428 tout(cct) << value << std::endl;
13429
f67539c2
TL
13430 std::scoped_lock lock(client_lock);
13431
11fdf7f2 13432 if (!fuse_default_permissions) {
7c673cae
FG
13433 int r = may_create(parent, perms);
13434 if (r < 0)
13435 return r;
13436 }
13437
13438 InodeRef in;
f67539c2 13439 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13440 if (r == 0) {
13441 fill_statx(in, statx_to_mask(flags, want), stx);
13442 _ll_get(in.get());
13443 }
13444 tout(cct) << stx->stx_ino << std::endl;
13445 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13446 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13447 *out = in.get();
13448 return r;
13449}
13450
/**
 * Internal unlink: send a CEPH_MDS_OP_UNLINK for @name under @dir.
 *
 * Caller must hold client_lock. Refuses to modify snapshotted directories.
 * On the failure path the MetaRequest is released via put_request().
 *
 * @return result of make_request(), or a negative CEPHFS_* error.
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  // ask the MDS to drop these caps with the reply to save a later revoke
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // resolve the inode being unlinked so we can attach it to the request
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  // revoke any delegations before the link count changes
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request consumes the request on the success path; we only need to
  // drop it when bailing out early.
  put_request(req);
  return res;
}
13500
13501int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13502{
f67539c2
TL
13503 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13504 if (!mref_reader.is_state_satisfied())
13505 return -CEPHFS_ENOTCONN;
181888fb 13506
7c673cae
FG
13507 vinodeno_t vino = _get_vino(in);
13508
13509 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13510 tout(cct) << "ll_unlink" << std::endl;
13511 tout(cct) << vino.ino.val << std::endl;
13512 tout(cct) << name << std::endl;
13513
f67539c2
TL
13514 std::scoped_lock lock(client_lock);
13515
11fdf7f2 13516 if (!fuse_default_permissions) {
7c673cae
FG
13517 int r = may_delete(in, name, perm);
13518 if (r < 0)
13519 return r;
13520 }
13521 return _unlink(in, name, perm);
13522}
13523
/**
 * Internal rmdir: remove directory @name under @dir, or remove a snapshot
 * when @dir is the snapdir (CEPH_SNAPDIR -> CEPH_MDS_OP_RMSNAP).
 *
 * Caller must hold client_lock. On failure the MetaRequest is released.
 *
 * @return result of make_request(), or a negative CEPHFS_* error.
 */
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // only live dirs and the snapdir itself may be modified
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }

  // removing an entry of the snapdir means deleting a snapshot
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  // ask the MDS to drop these caps with the reply to save a later revoke
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    // RMSNAP: keep our own ref since the request doesn't take one
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate ours up front
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13576
13577int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13578{
f67539c2
TL
13579 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13580 if (!mref_reader.is_state_satisfied())
13581 return -CEPHFS_ENOTCONN;
181888fb 13582
7c673cae
FG
13583 vinodeno_t vino = _get_vino(in);
13584
13585 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13586 tout(cct) << "ll_rmdir" << std::endl;
13587 tout(cct) << vino.ino.val << std::endl;
13588 tout(cct) << name << std::endl;
13589
f67539c2
TL
13590 std::scoped_lock lock(client_lock);
13591
11fdf7f2 13592 if (!fuse_default_permissions) {
7c673cae
FG
13593 int r = may_delete(in, name, perms);
13594 if (r < 0)
13595 return r;
13596 }
13597
13598 return _rmdir(in, name, perms);
13599}
13600
/**
 * Internal rename: move @fromname in @fromdir to @toname in @todir, or
 * rename a snapshot when both directories are the same snapdir.
 *
 * Caller must hold client_lock. Cross-snapshot and cross-quota-root
 * renames are rejected with CEPHFS_EXDEV. On failure the MetaRequest
 * is released via put_request().
 *
 * @param alternate_name alternate (e.g. encrypted) name to store alongside
 *        the dentry; moved into the request.
 * @return result of make_request(), or a negative CEPHFS_* error.
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // cannot move between different snapshots
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // renaming within one snapdir renames the snapshot itself
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }
  if (fromdir != todir) {
    // renames may not cross quota roots (would invalidate accounting)
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    // ask the MDS to drop these caps with the reply to save later revokes
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    // revoke delegations on the inode being moved
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // the destination may or may not already exist; only ENOENT is benign
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13708
13709int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13710 const char *newname, const UserPerm& perm)
13711{
f67539c2
TL
13712 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13713 if (!mref_reader.is_state_satisfied())
13714 return -CEPHFS_ENOTCONN;
181888fb 13715
7c673cae
FG
13716 vinodeno_t vparent = _get_vino(parent);
13717 vinodeno_t vnewparent = _get_vino(newparent);
13718
13719 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13720 << vnewparent << " " << newname << dendl;
13721 tout(cct) << "ll_rename" << std::endl;
13722 tout(cct) << vparent.ino.val << std::endl;
13723 tout(cct) << name << std::endl;
13724 tout(cct) << vnewparent.ino.val << std::endl;
13725 tout(cct) << newname << std::endl;
13726
f67539c2
TL
13727 std::scoped_lock lock(client_lock);
13728
11fdf7f2 13729 if (!fuse_default_permissions) {
7c673cae
FG
13730 int r = may_delete(parent, name, perm);
13731 if (r < 0)
13732 return r;
13733 r = may_delete(newparent, newname, perm);
f67539c2 13734 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
13735 return r;
13736 }
13737
f67539c2 13738 return _rename(parent, name, newparent, newname, perm, "");
7c673cae
FG
13739}
13740
/**
 * Internal hard link: create @newname in @dir pointing at existing inode
 * @in via CEPH_MDS_OP_LINK.
 *
 * Caller must hold client_lock. Snapshots are read-only; quota file-count
 * limits are enforced. On failure the MetaRequest is released.
 *
 * @param alternate_name alternate name stored alongside the dentry (moved).
 * @param inp out: the linked inode as returned by make_request().
 * @return result of make_request(), or a negative CEPHFS_* error.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // both the source inode and the target dir must be live (not snapshots)
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // revoke delegations before the link count changes
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  // ask the MDS to drop these caps with the reply to save a later revoke
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13786
13787int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13788 const UserPerm& perm)
13789{
f67539c2
TL
13790 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13791 if (!mref_reader.is_state_satisfied())
13792 return -CEPHFS_ENOTCONN;
181888fb 13793
7c673cae
FG
13794 vinodeno_t vino = _get_vino(in);
13795 vinodeno_t vnewparent = _get_vino(newparent);
13796
31f18b77 13797 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
13798 newname << dendl;
13799 tout(cct) << "ll_link" << std::endl;
13800 tout(cct) << vino.ino.val << std::endl;
13801 tout(cct) << vnewparent << std::endl;
13802 tout(cct) << newname << std::endl;
13803
7c673cae
FG
13804 InodeRef target;
13805
f67539c2
TL
13806 std::scoped_lock lock(client_lock);
13807
11fdf7f2 13808 if (!fuse_default_permissions) {
7c673cae 13809 if (S_ISDIR(in->mode))
f67539c2 13810 return -CEPHFS_EPERM;
7c673cae 13811
11fdf7f2 13812 int r = may_hardlink(in, perm);
7c673cae
FG
13813 if (r < 0)
13814 return r;
13815
13816 r = may_create(newparent, perm);
13817 if (r < 0)
13818 return r;
13819 }
13820
f67539c2 13821 return _link(in, newparent, newname, perm, "", &target);
7c673cae
FG
13822}
13823
13824int Client::ll_num_osds(void)
13825{
f67539c2 13826 std::scoped_lock lock(client_lock);
7c673cae
FG
13827 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13828}
13829
13830int Client::ll_osdaddr(int osd, uint32_t *addr)
13831{
f67539c2 13832 std::scoped_lock lock(client_lock);
181888fb 13833
7c673cae
FG
13834 entity_addr_t g;
13835 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13836 if (!o.exists(osd))
13837 return false;
11fdf7f2 13838 g = o.get_addrs(osd).front();
7c673cae
FG
13839 return true;
13840 });
13841 if (!exists)
13842 return -1;
13843 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13844 *addr = ntohl(nb_addr);
13845 return 0;
13846}
181888fb 13847
7c673cae
FG
13848uint32_t Client::ll_stripe_unit(Inode *in)
13849{
f67539c2 13850 std::scoped_lock lock(client_lock);
7c673cae
FG
13851 return in->layout.stripe_unit;
13852}
13853
13854uint64_t Client::ll_snap_seq(Inode *in)
13855{
f67539c2 13856 std::scoped_lock lock(client_lock);
7c673cae
FG
13857 return in->snaprealm->seq;
13858}
13859
13860int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13861{
f67539c2 13862 std::scoped_lock lock(client_lock);
7c673cae
FG
13863 *layout = in->layout;
13864 return 0;
13865}
13866
13867int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13868{
13869 return ll_file_layout(fh->inode.get(), layout);
13870}
13871
13872/* Currently we cannot take advantage of redundancy in reads, since we
13873 would have to go through all possible placement groups (a
13874 potentially quite large number determined by a hash), and use CRUSH
13875 to calculate the appropriate set of OSDs for each placement group,
13876 then index into that. An array with one entry per OSD is much more
13877 tractable and works for demonstration purposes. */
13878
13879int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13880 file_layout_t* layout)
13881{
f67539c2 13882 std::scoped_lock lock(client_lock);
181888fb 13883
28e407b8 13884 inodeno_t ino = in->ino;
7c673cae
FG
13885 uint32_t object_size = layout->object_size;
13886 uint32_t su = layout->stripe_unit;
13887 uint32_t stripe_count = layout->stripe_count;
13888 uint64_t stripes_per_object = object_size / su;
11fdf7f2 13889 uint64_t stripeno = 0, stripepos = 0;
7c673cae 13890
11fdf7f2
TL
13891 if(stripe_count) {
13892 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13893 stripepos = blockno % stripe_count; // which object in the object set (X)
13894 }
7c673cae
FG
13895 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13896 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13897
13898 object_t oid = file_object_t(ino, objectno);
13899 return objecter->with_osdmap([&](const OSDMap& o) {
13900 ceph_object_layout olayout =
13901 o.file_to_object_layout(oid, *layout);
13902 pg_t pg = (pg_t)olayout.ol_pgid;
13903 vector<int> osds;
13904 int primary;
13905 o.pg_to_acting_osds(pg, &osds, &primary);
13906 return primary;
13907 });
13908}
13909
13910/* Return the offset of the block, internal to the object */
13911
13912uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13913{
f67539c2 13914 std::scoped_lock lock(client_lock);
7c673cae
FG
13915 file_layout_t *layout=&(in->layout);
13916 uint32_t object_size = layout->object_size;
13917 uint32_t su = layout->stripe_unit;
13918 uint64_t stripes_per_object = object_size / su;
13919
13920 return (blockno % stripes_per_object) * su;
13921}
13922
13923int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13924 const UserPerm& perms)
13925{
f67539c2
TL
13926 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13927 if (!mref_reader.is_state_satisfied())
13928 return -CEPHFS_ENOTCONN;
181888fb 13929
7c673cae
FG
13930 vinodeno_t vino = _get_vino(in);
13931
13932 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13933 tout(cct) << "ll_opendir" << std::endl;
13934 tout(cct) << vino.ino.val << std::endl;
13935
f67539c2
TL
13936 std::scoped_lock lock(client_lock);
13937
11fdf7f2 13938 if (!fuse_default_permissions) {
7c673cae
FG
13939 int r = may_open(in, flags, perms);
13940 if (r < 0)
13941 return r;
13942 }
13943
13944 int r = _opendir(in, dirpp, perms);
f67539c2 13945 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
13946
13947 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13948 << dendl;
13949 return r;
13950}
13951
13952int Client::ll_releasedir(dir_result_t *dirp)
13953{
f67539c2
TL
13954 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13955 if (!mref_reader.is_state_satisfied())
13956 return -CEPHFS_ENOTCONN;
13957
7c673cae
FG
13958 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13959 tout(cct) << "ll_releasedir" << std::endl;
f67539c2 13960 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 13961
f67539c2 13962 std::scoped_lock lock(client_lock);
181888fb 13963
7c673cae
FG
13964 _closedir(dirp);
13965 return 0;
13966}
13967
13968int Client::ll_fsyncdir(dir_result_t *dirp)
13969{
f67539c2
TL
13970 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13971 if (!mref_reader.is_state_satisfied())
13972 return -CEPHFS_ENOTCONN;
13973
7c673cae
FG
13974 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13975 tout(cct) << "ll_fsyncdir" << std::endl;
f67539c2 13976 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 13977
f67539c2 13978 std::scoped_lock lock(client_lock);
7c673cae
FG
13979 return _fsync(dirp->inode.get(), false);
13980}
13981
13982int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13983{
11fdf7f2 13984 ceph_assert(!(flags & O_CREAT));
7c673cae 13985
f67539c2
TL
13986 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13987 if (!mref_reader.is_state_satisfied())
13988 return -CEPHFS_ENOTCONN;
181888fb 13989
7c673cae
FG
13990 vinodeno_t vino = _get_vino(in);
13991
13992 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13993 tout(cct) << "ll_open" << std::endl;
13994 tout(cct) << vino.ino.val << std::endl;
13995 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13996
f67539c2
TL
13997 std::scoped_lock lock(client_lock);
13998
7c673cae 13999 int r;
11fdf7f2 14000 if (!fuse_default_permissions) {
7c673cae
FG
14001 r = may_open(in, flags, perms);
14002 if (r < 0)
14003 goto out;
14004 }
14005
14006 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14007
14008 out:
14009 Fh *fhptr = fhp ? *fhp : NULL;
14010 if (fhptr) {
14011 ll_unclosed_fh_set.insert(fhptr);
14012 }
f67539c2 14013 tout(cct) << (uintptr_t)fhptr << std::endl;
7c673cae
FG
14014 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14015 " = " << r << " (" << fhptr << ")" << dendl;
14016 return r;
14017}
14018
/**
 * Shared implementation behind ll_create/ll_createx: look up @name under
 * @parent, create it if absent and O_CREAT is set, and open it.
 *
 * Caller must hold client_lock.
 *
 * @param caps cap mask to request on the lookup (e.g. CEPH_STAT_CAP_INODE_ALL).
 * @param in   out: the looked-up/created inode.
 * @param fhp  out: the opened file handle (set to NULL first; registered in
 *             ll_unclosed_fh_set on success).
 * @return 0 on success, negative CEPHFS_* error otherwise
 *         (CEPHFS_EEXIST when O_CREAT|O_EXCL and the name already exists).
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // exclusive create of an existing name fails immediately
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  // name absent and O_CREAT requested: create (and implicitly open) it
  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: check open permission, then open if _create/_lookup
    // didn't already hand us a file handle
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // track the handle (if any) so unclosed handles can be detected at unmount
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14100
14101int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14102 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14103 const UserPerm& perms)
14104{
f67539c2
TL
14105 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14106 if (!mref_reader.is_state_satisfied())
14107 return -CEPHFS_ENOTCONN;
7c673cae 14108
f67539c2
TL
14109 std::scoped_lock lock(client_lock);
14110 InodeRef in;
181888fb 14111
7c673cae
FG
14112 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14113 fhp, perms);
14114 if (r >= 0) {
11fdf7f2 14115 ceph_assert(in);
7c673cae
FG
14116
14117 // passing an Inode in outp requires an additional ref
14118 if (outp) {
14119 _ll_get(in.get());
14120 *outp = in.get();
14121 }
14122 fill_stat(in, attr);
14123 } else {
14124 attr->st_ino = 0;
14125 }
14126
14127 return r;
14128}
14129
14130int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14131 int oflags, Inode **outp, Fh **fhp,
14132 struct ceph_statx *stx, unsigned want, unsigned lflags,
14133 const UserPerm& perms)
14134{
14135 unsigned caps = statx_to_mask(lflags, want);
f67539c2
TL
14136 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14137 if (!mref_reader.is_state_satisfied())
14138 return -CEPHFS_ENOTCONN;
7c673cae 14139
f67539c2
TL
14140 std::scoped_lock lock(client_lock);
14141 InodeRef in;
7c673cae
FG
14142
14143 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14144 if (r >= 0) {
11fdf7f2 14145 ceph_assert(in);
7c673cae
FG
14146
14147 // passing an Inode in outp requires an additional ref
14148 if (outp) {
14149 _ll_get(in.get());
14150 *outp = in.get();
14151 }
14152 fill_statx(in, caps, stx);
14153 } else {
14154 stx->stx_ino = 0;
14155 stx->stx_mask = 0;
14156 }
14157
14158 return r;
14159}
14160
14161loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14162{
f67539c2
TL
14163 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14164 if (!mref_reader.is_state_satisfied())
14165 return -CEPHFS_ENOTCONN;
14166
7c673cae
FG
14167 tout(cct) << "ll_lseek" << std::endl;
14168 tout(cct) << offset << std::endl;
14169 tout(cct) << whence << std::endl;
14170
f67539c2 14171 std::scoped_lock lock(client_lock);
7c673cae
FG
14172 return _lseek(fh, offset, whence);
14173}
14174
14175int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14176{
f67539c2
TL
14177 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14178 if (!mref_reader.is_state_satisfied())
14179 return -CEPHFS_ENOTCONN;
14180
7c673cae
FG
14181 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14182 tout(cct) << "ll_read" << std::endl;
f67539c2 14183 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14184 tout(cct) << off << std::endl;
14185 tout(cct) << len << std::endl;
14186
11fdf7f2
TL
14187 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14188 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14189 std::scoped_lock lock(client_lock);
14190
f6b5b4d7
TL
14191 int r = _read(fh, off, len, bl);
14192 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14193 << dendl;
14194 return r;
7c673cae
FG
14195}
14196
14197int Client::ll_read_block(Inode *in, uint64_t blockid,
14198 char *buf,
14199 uint64_t offset,
14200 uint64_t length,
14201 file_layout_t* layout)
14202{
f67539c2
TL
14203 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14204 if (!mref_reader.is_state_satisfied())
14205 return -CEPHFS_ENOTCONN;
181888fb 14206
b32b8144 14207 vinodeno_t vino = _get_vino(in);
7c673cae
FG
14208 object_t oid = file_object_t(vino.ino, blockid);
14209 C_SaferCond onfinish;
14210 bufferlist bl;
14211
14212 objecter->read(oid,
14213 object_locator_t(layout->pool_id),
14214 offset,
14215 length,
14216 vino.snapid,
14217 &bl,
14218 CEPH_OSD_FLAG_READ,
14219 &onfinish);
14220
7c673cae 14221 int r = onfinish.wait();
7c673cae 14222 if (r >= 0) {
9f95a23c 14223 bl.begin().copy(bl.length(), buf);
7c673cae
FG
14224 r = bl.length();
14225 }
14226
14227 return r;
14228}
14229
14230/* It appears that the OSD doesn't return success unless the entire
14231 buffer was written, return the write length on success. */
14232
14233int Client::ll_write_block(Inode *in, uint64_t blockid,
14234 char* buf, uint64_t offset,
14235 uint64_t length, file_layout_t* layout,
14236 uint64_t snapseq, uint32_t sync)
14237{
7c673cae 14238 vinodeno_t vino = ll_get_vino(in);
7c673cae 14239 int r = 0;
11fdf7f2 14240 std::unique_ptr<C_SaferCond> onsafe = nullptr;
f67539c2
TL
14241
14242 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14243 if (!mref_reader.is_state_satisfied())
14244 return -CEPHFS_ENOTCONN;
14245
7c673cae 14246 if (length == 0) {
f67539c2 14247 return -CEPHFS_EINVAL;
7c673cae
FG
14248 }
14249 if (true || sync) {
14250 /* if write is stable, the epilogue is waiting on
14251 * flock */
11fdf7f2 14252 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
14253 }
14254 object_t oid = file_object_t(vino.ino, blockid);
14255 SnapContext fakesnap;
11fdf7f2
TL
14256 ceph::bufferlist bl;
14257 if (length > 0) {
14258 bl.push_back(buffer::copy(buf, length));
14259 }
7c673cae
FG
14260
14261 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14262 << dendl;
14263
14264 fakesnap.seq = snapseq;
14265
14266 /* lock just in time */
7c673cae
FG
14267 objecter->write(oid,
14268 object_locator_t(layout->pool_id),
14269 offset,
14270 length,
14271 fakesnap,
14272 bl,
14273 ceph::real_clock::now(),
14274 0,
11fdf7f2 14275 onsafe.get());
7c673cae 14276
11fdf7f2
TL
14277 if (nullptr != onsafe) {
14278 r = onsafe->wait();
7c673cae
FG
14279 }
14280
14281 if (r < 0) {
14282 return r;
14283 } else {
14284 return length;
14285 }
14286}
14287
/**
 * Stub: the barrier-based block-commit machinery below is disabled (kept
 * commented out for reference); the call currently always reports success.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
14313
14314int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14315{
7c673cae
FG
14316 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14317 "~" << len << dendl;
14318 tout(cct) << "ll_write" << std::endl;
f67539c2 14319 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14320 tout(cct) << off << std::endl;
14321 tout(cct) << len << std::endl;
14322
f67539c2
TL
14323 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14324 if (!mref_reader.is_state_satisfied())
14325 return -CEPHFS_ENOTCONN;
181888fb 14326
11fdf7f2
TL
14327 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14328 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14329 std::scoped_lock lock(client_lock);
14330
7c673cae
FG
14331 int r = _write(fh, off, len, data, NULL, 0);
14332 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14333 << dendl;
14334 return r;
14335}
14336
11fdf7f2
TL
14337int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14338{
f67539c2
TL
14339 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14340 if (!mref_reader.is_state_satisfied())
14341 return -CEPHFS_ENOTCONN;
14342
14343 std::unique_lock cl(client_lock);
14344 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false, cl);
11fdf7f2
TL
14345}
14346
14347int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14348{
f67539c2
TL
14349 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14350 if (!mref_reader.is_state_satisfied())
14351 return -CEPHFS_ENOTCONN;
14352
14353 std::unique_lock cl(client_lock);
14354 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false, cl);
11fdf7f2
TL
14355}
14356
7c673cae
FG
// Low-level flush of an open file handle (flush dirty data/caps).
// Returns 0 or a negative CEPHFS_* error code.
int Client::ll_flush(Fh *fh)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Log/trace before taking the lock; fh is only dereferenced for the ino.
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);
  return _flush(fh);
}
14370
// Low-level fsync on a file handle.
// syncdataonly: if true, only file data (not metadata) must be durable.
// Returns 0 or a negative CEPHFS_* error code.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);
  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH so a later
    // fsync/close does not report the same async write error twice.
    fh->take_async_err();
  }
  return r;
}
14389
28e407b8
AA
// Low-level fsync by inode (no file handle), e.g. for fsync on a
// path/inode reference. syncdataonly as in ll_fsync.
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (uintptr_t)in << std::endl;

  std::scoped_lock lock(client_lock);
  return _fsync(in, syncdataonly);
}
14403
7c673cae
FG
#ifdef FALLOC_FL_PUNCH_HOLE

// Core fallocate implementation (caller holds client_lock).
// Supports only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (punch
// requires keep-size, mirroring the Linux fallocate(2) contract).
// NOTE: client_lock is dropped and re-taken while waiting for the
// uninline/zero operations to complete.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // A full pool only blocks allocation; punching a hole frees space.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // Plain allocation past EOF may grow the file; enforce quota first.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by
      // rebuilding the inline blob with a zeroed middle section.
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);          // prefix before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;                   // clamp hole to inline data
        if (size > 0)
          bl.append_zero(size);                  // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl); // suffix after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // No local buffer path: uninline first (if needed), then zero the
      // object range via the Filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      // Hold a BUFFER cap ref across the zero so the cap can't be revoked.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop the client lock while the OSD zero op completes.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the (sparse) file size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  // If we kicked off an uninline above, wait for it (lock dropped again).
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -CEPHFS_EOPNOTSUPP;
}

#endif
14535
14536
// Low-level fallocate entry point; validates mount state, traces, and
// delegates to _fallocate() under client_lock.
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);
  return _fallocate(fh, mode, offset, length);
}
14550
14551int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14552{
f67539c2
TL
14553 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14554 if (!mref_reader.is_state_satisfied())
14555 return -CEPHFS_ENOTCONN;
7c673cae 14556
f67539c2 14557 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
181888fb 14558
f67539c2 14559 std::scoped_lock lock(client_lock);
7c673cae
FG
14560 Fh *fh = get_filehandle(fd);
14561 if (!fh)
f67539c2 14562 return -CEPHFS_EBADF;
7c673cae
FG
14563#if defined(__linux__) && defined(O_PATH)
14564 if (fh->flags & O_PATH)
f67539c2 14565 return -CEPHFS_EBADF;
7c673cae
FG
14566#endif
14567 return _fallocate(fh, mode, offset, length);
14568}
14569
// Release (close) a low-level file handle. Drops it from the
// unclosed-handle tracking set and releases the underlying Fh.
int Client::ll_release(Fh *fh)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
    dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);

  // ll_unclosed_fh_set tracks handles the ll_ interface opened but has
  // not yet closed; erase is a no-op if fh is absent.
  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
14587
14588int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14589{
f67539c2
TL
14590 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14591 if (!mref_reader.is_state_satisfied())
14592 return -CEPHFS_ENOTCONN;
7c673cae
FG
14593
14594 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
f67539c2 14595 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
181888fb 14596
f67539c2 14597 std::scoped_lock lock(client_lock);
7c673cae
FG
14598 return _getlk(fh, fl, owner);
14599}
14600
// Low-level POSIX advisory-lock set (fcntl F_SETLK/F_SETLKW semantics).
// sleep: non-zero means block until the lock can be acquired.
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);
  return _setlk(fh, fl, owner, sleep);
}
14613
// Low-level BSD flock(2)-style whole-file lock.
// cmd: LOCK_SH/LOCK_EX/LOCK_UN, optionally |LOCK_NB.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;

  std::scoped_lock lock(client_lock);
  return _flock(fh, cmd, owner);
}
14626
b32b8144
FG
// Set the delegation-return timeout (seconds). Rejected if it would
// exceed the MDS session autoclose interval.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::scoped_lock lock(client_lock);

  /*
   * The whole point is to prevent blocklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -CEPHFS_EINVAL;

  deleg_timeout = timeout;
  return 0;
}
14641
// Acquire or release a delegation on an open file handle.
// cmd: CEPH_DELEGATION_NONE releases; other values request via
// Inode::set_deleg. cb/priv: callback invoked when the delegation must
// be recalled. Returns 0 or a negative CEPHFS_* error code.
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -CEPHFS_EINVAL;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg may allocate; surface allocation failure as ENOMEM.
      ret = -CEPHFS_ENOMEM;
    }
    break;
  }
  return ret;
}
14669
7c673cae
FG
// Completion queued by ll_interrupt() to cancel an in-flight
// SETFILELOCK MDS request. Holds a ref on the request until finish().
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();   // pin the request until finish() runs
  }
  void finish(int r) override {
    std::scoped_lock l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
14685
// Asynchronously interrupt a blocked file-lock request. d is the
// opaque MetaRequest* handed out to the caller; the actual interrupt
// runs on the finisher thread (see C_Client_RequestInterrupt).
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
14693
14694// =========================================
14695// layout
14696
14697// expose file layouts
14698
// Look up the file layout of the inode at relpath into *lp.
// Returns 0, or a negative error from path_walk (e.g. lookup failure).
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
14719
// Like describe_layout(), but resolves the inode via an open fd.
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}
14738
d2e6a577
FG
// Return the id of the filesystem's default data pool,
// or -CEPHFS_ENOTCONN when not mounted.
int64_t Client::get_default_pool_id()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
14750
14751// expose osdmap
14752
// Resolve a pool name to its id via the current OSDMap.
// Returns the pool id, or a negative value if the name is unknown.
int64_t Client::get_pool_id(const char *pool_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}
14764
// Resolve a pool id to its name via the current OSDMap.
// Returns the empty string when unmounted or the pool does not exist.
string Client::get_pool_name(int64_t pool)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return string();

  std::scoped_lock lock(client_lock);

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}
14777
// Return the replication size of a pool, or -CEPHFS_ENOENT if the
// pool does not exist (-CEPHFS_ENOTCONN when unmounted).
int Client::get_pool_replication(int64_t pool)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
    });
}
14790
// Find the acting OSDs for the file extent at offset `off` of fd.
// On success fills `osds` and, if len != nullptr, the remaining length
// of the stripe unit containing `off`.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range at `off` to exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -CEPHFS_EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14837
// Return the ordered CRUSH location (bucket-type/name pairs) of an OSD.
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  if (id < 0)
    return -CEPHFS_EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
14852
// Return the network addresses of the acting OSDs that hold the stripe
// containing file offset `offset` of fd.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -CEPHFS_EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
14887
// Return the (first) network address of an OSD, or -CEPHFS_ENOENT if
// it does not exist in the current OSDMap.
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return -CEPHFS_ENOENT;

      addr = o.get_addrs(osd).front();
      return 0;
    });
}
14904
// Map a byte range [offset, offset+length) of fd onto its object
// extents (RADOS object + intra-object ranges) in `result`.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
14925
14926
/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
// Result is cached per OSDMap epoch in local_osd/local_osd_epoch and
// only recomputed when the map epoch changes.
int Client::get_local_osd()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
14944
14945
14946
14947
14948
14949
14950// ===============================
14951
// Messenger callback: a connection was established. Log only.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14956
// Messenger callback: our side detected a connection failure.
// Return false: we did not handle/queue anything for this connection.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14962
14963void Client::ms_handle_remote_reset(Connection *con)
14964{
f67539c2 14965 std::scoped_lock lock(client_lock);
11fdf7f2 14966 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
14967 switch (con->get_peer_type()) {
14968 case CEPH_ENTITY_TYPE_MDS:
14969 {
14970 // kludge to figure out which mds this is; fixme with a Connection* state
14971 mds_rank_t mds = MDS_RANK_NONE;
14972 MetaSession *s = NULL;
11fdf7f2 14973 for (auto &p : mds_sessions) {
b3b6e05e 14974 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
11fdf7f2
TL
14975 mds = p.first;
14976 s = &p.second;
7c673cae
FG
14977 }
14978 }
14979 if (mds >= 0) {
d2e6a577 14980 assert (s != NULL);
7c673cae
FG
14981 switch (s->state) {
14982 case MetaSession::STATE_CLOSING:
14983 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14984 _closed_mds_session(s);
14985 break;
14986
14987 case MetaSession::STATE_OPENING:
14988 {
14989 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14990 list<Context*> waiters;
14991 waiters.swap(s->waiting_for_open);
14992 _closed_mds_session(s);
14993 MetaSession *news = _get_or_open_mds_session(mds);
14994 news->waiting_for_open.swap(waiters);
14995 }
14996 break;
14997
14998 case MetaSession::STATE_OPEN:
14999 {
f67539c2 15000 objecter->maybe_request_map(); /* to check if we are blocklisted */
f6b5b4d7 15001 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
7c673cae
FG
15002 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
15003 _closed_mds_session(s);
15004 } else {
15005 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15006 s->state = MetaSession::STATE_STALE;
15007 }
15008 }
15009 break;
15010
15011 case MetaSession::STATE_NEW:
15012 case MetaSession::STATE_CLOSED:
15013 default:
15014 break;
15015 }
15016 }
15017 }
15018 break;
15019 }
15020}
15021
// Messenger callback: the peer actively refused the connection.
// Return false: no special handling here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15027
7c673cae
FG
// Walk up the snaprealm ancestry of `in` and return the nearest
// ancestor inode with quota enabled; falls back to root_ancestor if
// none is found (or an ancestor inode is not in the local cache).
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;   // ancestor not cached locally; stop at root_ancestor

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
15049
/**
 * Traverse quota ancestors of the Inode, return true
 * if any of them passes the passed function
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
				   std::function<bool (const Inode &in)> test)
{
  while (true) {
    ceph_assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }

  // unreachable: the loop only exits via the returns above
  return false;
}
15074
// True if any quota ancestor of `in` has hit its max_files limit
// (rsize counts files + subdirs recursively).
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}
15082
// True if adding `new_bytes` would push any quota ancestor of `in`
// over its max_bytes limit.
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [&new_bytes](const Inode &in) {
        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
               > in.quota.max_bytes;
      });
}
15092
// True if the not-yet-reported growth of `in` (size - reported_size)
// is getting close to a quota ancestor's byte limit: either the limit
// is already reached, or remaining space is less than 16x the pending
// growth (space >> 4). Used to decide when to flush caps eagerly.
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  ceph_assert(in->size >= in->reported_size);
  const uint64_t size = in->size - in->reported_size;
  return check_quota_condition(in, perms,
      [&size](const Inode &in) {
        if (in.quota.max_bytes) {
          if (in.rstat.rbytes >= in.quota.max_bytes) {
            return true;
          }

          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
          return (space >> 4) < size;
        } else {
          return false;
        }
      });
}
15111
// Bit flags cached per (pool, namespace) in pool_perms by
// check_pool_perm() below.
enum {
  POOL_CHECKED = 1,    // a permission probe has completed
  POOL_CHECKING = 2,   // a probe is in flight; others must wait
  POOL_READ = 4,       // client may read from the pool
  POOL_WRITE = 8,      // client may write to the pool
};
15118
// Verify the client's OSD capability to read/write the data pool of
// regular file `in` (need = CEPH_CAP_FILE_RD/WR bits). Results are
// probed once per (pool, namespace) by issuing a stat and a create on
// the file's first object, then cached in pool_perms. Caller holds
// client_lock; the lock is dropped while waiting for the probe ops.
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark in-flight so concurrent callers wait instead of re-probing.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat op (fails with EPERM if no read cap).
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: an exclusive create (EEXIST still proves write perm).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both OSD ops complete.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
15227
// Check `want` access against the inode's POSIX access ACL xattr.
// Returns the posix_acl_permits() verdict, or -CEPHFS_EAGAIN when no
// ACL applies (caller should fall back to mode-bit checks).
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -CEPHFS_EAGAIN;
}
15239
// Keep the POSIX access ACL in sync with a chmod: rewrite the ACL's
// mode-mirroring entries and store the updated xattr. No-op when ACLs
// are disabled or the inode has no access ACL.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Refresh xattrs from the MDS if we have never fetched them.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy; posix_acl_access_chmod mutates it in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
15265
// Compute inherited POSIX ACL xattrs for a new inode created in `dir`.
// May adjust *mode per the directory's default ACL (or the umask
// callback when none applies). On success, r > 0 means `xattrs_bl`
// holds an encoded xattr map to attach to the create request.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // Merge the default ACL into *mode; r > 0 means the ACL is not
      // fully representable by mode bits and must be stored.
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if set.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
15313
// Enable a global objecter op flag; only LOCALIZE_READS (or 0) is allowed.
void Client::set_filer_flags(int flags)
{
  std::scoped_lock l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
15321
15322void Client::clear_filer_flags(int flags)
15323{
f67539c2 15324 std::scoped_lock l(client_lock);
11fdf7f2 15325 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
15326 objecter->clear_global_op_flag(flags);
15327}
15328
11fdf7f2
TL
15329// called before mount
15330void Client::set_uuid(const std::string& uuid)
15331{
f67539c2
TL
15332 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15333 ceph_assert(iref_reader.is_state_satisfied());
15334
15335 std::scoped_lock l(client_lock);
11fdf7f2
TL
15336 assert(!uuid.empty());
15337
15338 metadata["uuid"] = uuid;
15339 _close_sessions();
15340}
15341
15342// called before mount. 0 means infinite
15343void Client::set_session_timeout(unsigned timeout)
15344{
f67539c2
TL
15345 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15346 ceph_assert(iref_reader.is_state_satisfied());
15347
15348 std::scoped_lock l(client_lock);
11fdf7f2
TL
15349
15350 metadata["timeout"] = stringify(timeout);
15351}
15352
/*
 * Reclaim the state of a dead client instance identified by @uuid.
 * Walks every in-map MDS, opens a session if needed, and sends a
 * RECLAIM request; then (unless CEPH_RECLAIM_RESET) waits for the OSD
 * map epoch reported by the MDSs and verifies the old session's addrs
 * are blocklisted before declaring the reclaim safe.
 *
 * Must be called after init() but before mount.
 *
 * @param uuid     uuid of the instance to reclaim (must be non-empty and
 *                 different from our own uuid)
 * @param flags    CEPH_RECLAIM_* flags (RESET skips the blocklist check)
 * @param fs_name  file system to subscribe the mdsmap for
 * @return 0 on success, negative CEPHFS error code otherwise
 */
// called before mount
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  // unique_lock (not scoped_lock): we must drop it around the blocking
  // OSD map wait below.
  std::unique_lock l(client_lock);
  {
    // Reclaiming our own uuid makes no sense.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Wait until we have received a real mdsmap (epoch 0 == none yet).
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: mds only advances on a completed (non-RECLAIMING) session; the
  // other branches wait and retry the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply() when this MDS answers.
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // GNU "Elvis" extension: return reclaim_errno unless it is 0.
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  // RESET mode: caller only wanted the old sessions torn down.
  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  // Drop client_lock while blocking on the objecter to avoid stalling
  // every other client operation.
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  // Remember the uuid; finish_reclaim() promotes it to our own "uuid".
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15453
15454void Client::finish_reclaim()
15455{
15456 auto it = metadata.find("reclaiming_uuid");
15457 if (it == metadata.end()) {
15458 for (auto &p : mds_sessions)
15459 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15460 return;
15461 }
15462
15463 for (auto &p : mds_sessions) {
15464 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
9f95a23c 15465 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
11fdf7f2
TL
15466 p.second.con->send_message2(std::move(m));
15467 }
15468
15469 metadata["uuid"] = it->second;
15470 metadata.erase(it);
15471}
15472
15473void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15474{
15475 mds_rank_t from = mds_rank_t(reply->get_source().num());
15476 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15477
f67539c2 15478 std::scoped_lock cl(client_lock);
11fdf7f2
TL
15479 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
15480 if (!session) {
15481 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15482 return;
15483 }
15484
15485 if (reply->get_result() >= 0) {
15486 session->reclaim_state = MetaSession::RECLAIM_OK;
15487 if (reply->get_epoch() > reclaim_osd_epoch)
15488 reclaim_osd_epoch = reply->get_epoch();
15489 if (!reply->get_addrs().empty())
15490 reclaim_target_addrs = reply->get_addrs();
15491 } else {
15492 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15493 reclaim_errno = reply->get_result();
15494 }
15495
15496 signal_cond_list(waiting_for_reclaim);
15497}
15498
7c673cae
FG
15499/**
15500 * This is included in cap release messages, to cause
15501 * the MDS to wait until this OSD map epoch. It is necessary
15502 * in corner cases where we cancel RADOS ops, so that
15503 * nobody else tries to do IO to the same objects in
15504 * the same epoch as the cancelled ops.
15505 */
15506void Client::set_cap_epoch_barrier(epoch_t e)
15507{
15508 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15509 cap_epoch_barrier = e;
15510}
15511
/*
 * Config-observer hook: the NULL-terminated list of option names for
 * which handle_conf_change() should be invoked at runtime.
 *
 * NOTE(review): "client_cache_size" and the client_deleg_* keys are
 * tracked here but not visibly acted on in handle_conf_change() in this
 * part of the file — presumably they are read on demand elsewhere;
 * confirm before removing.
 */
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    "client_oc_size",
    "client_oc_max_objects",
    "client_oc_max_dirty",
    "client_oc_target_dirty",
    "client_oc_max_dirty_age",
    NULL
  };
  return keys;
}
15529
11fdf7f2 15530void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
15531 const std::set <std::string> &changed)
15532{
f67539c2 15533 std::scoped_lock lock(client_lock);
7c673cae 15534
181888fb 15535 if (changed.count("client_cache_mid")) {
7c673cae
FG
15536 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15537 }
15538 if (changed.count("client_acl_type")) {
15539 acl_type = NO_ACL;
15540 if (cct->_conf->client_acl_type == "posix_acl")
15541 acl_type = POSIX_ACL;
15542 }
f67539c2
TL
15543 if (changed.count("client_oc_size")) {
15544 objectcacher->set_max_size(cct->_conf->client_oc_size);
15545 }
15546 if (changed.count("client_oc_max_objects")) {
15547 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15548 }
15549 if (changed.count("client_oc_max_dirty")) {
15550 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15551 }
15552 if (changed.count("client_oc_target_dirty")) {
15553 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15554 }
15555 if (changed.count("client_oc_max_dirty_age")) {
15556 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15557 }
7c673cae
FG
15558}
15559
7c673cae
FG
// boost::intrusive_ptr support: acquire a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
f67539c2 15564
7c673cae
FG
// boost::intrusive_ptr support: release a reference via the owning
// client; put_inode() performs any final-release bookkeeping.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15569
15570mds_rank_t Client::_get_random_up_mds() const
15571{
9f95a23c 15572 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
15573
15574 std::set<mds_rank_t> up;
15575 mdsmap->get_up_mds_set(up);
15576
15577 if (up.empty())
15578 return MDS_RANK_NONE;
15579 std::set<mds_rank_t>::const_iterator p = up.begin();
15580 for (int n = rand() % up.size(); n; n--)
15581 ++p;
15582 return *p;
15583}
15584
15585
f67539c2
TL
/*
 * Standalone (non-libcephfs-embedded) client: constructs its own
 * Objecter. The raw `new Objecter` is owned by this object and deleted
 * in ~StandaloneClient().
 */
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
15593
StandaloneClient::~StandaloneClient()
{
  // The objecter was allocated in our constructor; release it and null
  // the base-class pointer so nothing dangles during base destruction.
  delete objecter;
  objecter = nullptr;
}
15599
/*
 * Bring up a standalone client: objecter, dispatchers, and monclient.
 * On monclient init failure we are in a half-initialized state, so the
 * error path unwinds in a specific order (timer, objecter, object
 * cacher, monclient) before returning.
 *
 * @return 0 on success, negative error code from monclient->init()
 */
int StandaloneClient::init()
{
  // First (and only) writer moves initialize_state to INITIALIZING.
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  // Dispatch order matters: objecter first, then the client itself.
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    // Unlock before tearing down components that may take client_lock.
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  // Publish the fully-initialized state for readers (e.g. set_uuid()).
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
15636
void StandaloneClient::shutdown()
{
  // Tear down in reverse order of init(): client state first, then the
  // objecter we own, then the mon client.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}