]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph quincy 17.2.4
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
f67539c2 26#ifndef _WIN32
7c673cae 27#include <sys/utsname.h>
f67539c2 28#endif
7c673cae
FG
29#include <sys/uio.h>
30
31#include <boost/lexical_cast.hpp>
32#include <boost/fusion/include/std_pair.hpp>
33
f67539c2
TL
34#include "common/async/waiter.h"
35
36#if defined(__FreeBSD__) || defined(_WIN32)
7c673cae
FG
37#define XATTR_CREATE 0x1
38#define XATTR_REPLACE 0x2
39#else
40#include <sys/xattr.h>
41#endif
42
43#if defined(__linux__)
44#include <linux/falloc.h>
45#endif
46
47#include <sys/statvfs.h>
48
49#include "common/config.h"
50#include "common/version.h"
f67539c2 51#include "common/async/blocked_completion.h"
7c673cae 52
11fdf7f2
TL
53#include "mon/MonClient.h"
54
55#include "messages/MClientCaps.h"
56#include "messages/MClientLease.h"
57#include "messages/MClientQuota.h"
58#include "messages/MClientReclaim.h"
59#include "messages/MClientReclaimReply.h"
7c673cae 60#include "messages/MClientReconnect.h"
11fdf7f2 61#include "messages/MClientReply.h"
7c673cae
FG
62#include "messages/MClientRequest.h"
63#include "messages/MClientRequestForward.h"
11fdf7f2 64#include "messages/MClientSession.h"
7c673cae 65#include "messages/MClientSnap.h"
f67539c2 66#include "messages/MClientMetrics.h"
7c673cae 67#include "messages/MCommandReply.h"
7c673cae
FG
68#include "messages/MFSMap.h"
69#include "messages/MFSMapUser.h"
11fdf7f2
TL
70#include "messages/MMDSMap.h"
71#include "messages/MOSDMap.h"
7c673cae
FG
72
73#include "mds/flock.h"
11fdf7f2 74#include "mds/cephfs_features.h"
7c673cae
FG
75#include "osd/OSDMap.h"
76#include "osdc/Filer.h"
77
78#include "common/Cond.h"
7c673cae
FG
79#include "common/perf_counters.h"
80#include "common/admin_socket.h"
81#include "common/errno.h"
82#include "include/str_list.h"
83
84#define dout_subsys ceph_subsys_client
85
86#include "include/lru.h"
87#include "include/compat.h"
88#include "include/stringify.h"
f67539c2 89#include "include/random.h"
7c673cae
FG
90
91#include "Client.h"
92#include "Inode.h"
93#include "Dentry.h"
b32b8144 94#include "Delegation.h"
7c673cae
FG
95#include "Dir.h"
96#include "ClientSnapRealm.h"
97#include "Fh.h"
98#include "MetaSession.h"
99#include "MetaRequest.h"
100#include "ObjecterWriteback.h"
101#include "posix_acl.h"
102
11fdf7f2 103#include "include/ceph_assert.h"
7c673cae
FG
104#include "include/stat.h"
105
e306af50 106#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
107
108#if HAVE_GETGROUPLIST
109#include <grp.h>
110#include <pwd.h>
111#include <unistd.h>
112#endif
113
114#undef dout_prefix
115#define dout_prefix *_dout << "client." << whoami << " "
116
117#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119// FreeBSD fails to define this
120#ifndef O_DSYNC
121#define O_DSYNC 0x0
122#endif
123// Darwin fails to define this
124#ifndef O_RSYNC
125#define O_RSYNC 0x0
126#endif
127
128#ifndef O_DIRECT
129#define O_DIRECT 0x0
130#endif
131
f67539c2
TL
132// Windows doesn't define those values. While the Posix compatibilty layer
133// doesn't support those values, the Windows native functions do provide
134// similar flags. Special care should be taken if we're going to use those
135// flags in ceph-dokan. The current values are no-ops, while propagating
136// them to the rest of the code might cause the Windows functions to reject
137// them as invalid.
138#ifndef O_NOFOLLOW
139#define O_NOFOLLOW 0x0
140#endif
141
142#ifndef O_SYNC
143#define O_SYNC 0x0
144#endif
145
7c673cae
FG
146#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
b3b6e05e
TL
148#ifndef S_IXUGO
149#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150#endif
151
20effc67
TL
152using std::dec;
153using std::hex;
154using std::list;
155using std::oct;
156using std::pair;
157using std::string;
158using std::vector;
159
adb31ebb
TL
160using namespace TOPNSPC::common;
161
f67539c2
TL
162namespace bs = boost::system;
163namespace ca = ceph::async;
164
7c673cae
FG
165void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
166{
167 Client *client = static_cast<Client*>(p);
168 client->flush_set_callback(oset);
169}
170
b3b6e05e
TL
171bool Client::is_reserved_vino(vinodeno_t &vino) {
172 if (MDS_IS_PRIVATE_INO(vino.ino)) {
173 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
174 return true;
175 }
176 return false;
177}
178
2a845540
TL
179// running average and standard deviation -- presented in
180// Donald Knuth's TAoCP, Volume II.
181double calc_average(double old_avg, double value, uint64_t count) {
182 double new_avg;
183 if (count == 1) {
184 new_avg = value;
185 } else {
186 new_avg = old_avg + ((value - old_avg) / count);
187 }
188
189 return new_avg;
190}
191
// Incremental sum of squared deviations (Welford / Knuth): given the old
// and new running means, accumulate (value - old_mean)*(value - new_mean).
// count == 1 resets the accumulator for the first sample.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  if (count == 1)
    return 0.0;
  return old_sq_sum + (value - old_mean) * (value - new_mean);
}
7c673cae
FG
203
204// -------------
205
206Client::CommandHook::CommandHook(Client *client) :
207 m_client(client)
208{
209}
210
9f95a23c
TL
211int Client::CommandHook::call(
212 std::string_view command,
213 const cmdmap_t& cmdmap,
214 Formatter *f,
215 std::ostream& errss,
216 bufferlist& out)
7c673cae 217{
7c673cae 218 f->open_object_section("result");
9f95a23c 219 {
f67539c2 220 std::scoped_lock l{m_client->client_lock};
9f95a23c
TL
221 if (command == "mds_requests")
222 m_client->dump_mds_requests(f);
adb31ebb
TL
223 else if (command == "mds_sessions") {
224 bool cap_dump = false;
225 cmd_getval(cmdmap, "cap_dump", cap_dump);
226 m_client->dump_mds_sessions(f, cap_dump);
227 } else if (command == "dump_cache")
9f95a23c
TL
228 m_client->dump_cache(f);
229 else if (command == "kick_stale_sessions")
230 m_client->_kick_stale_sessions();
231 else if (command == "status")
232 m_client->dump_status(f);
233 else
234 ceph_abort_msg("bad command registered");
235 }
7c673cae 236 f->close_section();
9f95a23c 237 return 0;
7c673cae
FG
238}
239
240
241// -------------
242
b3b6e05e
TL
243int Client::get_fd_inode(int fd, InodeRef *in) {
244 int r = 0;
245 if (fd == CEPHFS_AT_FDCWD) {
246 *in = cwd;
247 } else {
248 Fh *f = get_filehandle(fd);
249 if (!f) {
250 r = -CEPHFS_EBADF;
251 } else {
252 *in = f->inode;
253 }
254 }
255 return r;
256}
257
7c673cae
FG
258dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
259 : inode(in), offset(0), next_offset(2),
260 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
261 perms(perms)
262 { }
263
264void Client::_reset_faked_inos()
265{
266 ino_t start = 1024;
267 free_faked_inos.clear();
268 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
269 last_used_faked_ino = 0;
11fdf7f2 270 last_used_faked_root = 0;
f67539c2
TL
271 #ifdef _WIN32
272 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
273 // Windows structures, including Dokan ones, are using 64B identifiers.
274 _use_faked_inos = false;
275 #else
7c673cae 276 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
f67539c2 277 #endif
7c673cae
FG
278}
279
280void Client::_assign_faked_ino(Inode *in)
281{
11fdf7f2
TL
282 if (0 == last_used_faked_ino)
283 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
284 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
285 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 286 last_used_faked_ino = 2048;
7c673cae
FG
287 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
288 }
11fdf7f2 289 ceph_assert(it != free_faked_inos.end());
7c673cae 290 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 291 ceph_assert(it.get_len() > 0);
7c673cae
FG
292 last_used_faked_ino = it.get_start();
293 } else {
294 ++last_used_faked_ino;
11fdf7f2 295 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
296 }
297 in->faked_ino = last_used_faked_ino;
298 free_faked_inos.erase(in->faked_ino);
299 faked_ino_map[in->faked_ino] = in->vino();
300}
301
11fdf7f2
TL
302/*
303 * In the faked mode, if you export multiple subdirectories,
304 * you will see that the inode numbers of the exported subdirectories
305 * are the same. so we distinguish the mount point by reserving
306 * the "fake ids" between "1024~2048" and combining the last
307 * 10bits(0x3ff) of the "root inodes".
308*/
309void Client::_assign_faked_root(Inode *in)
310{
311 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
312 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
313 last_used_faked_root = 0;
314 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
315 }
20effc67 316 ceph_assert(it != free_faked_inos.end());
11fdf7f2
TL
317 vinodeno_t inode_info = in->vino();
318 uint64_t inode_num = (uint64_t)inode_info.ino;
319 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
320 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
20effc67 321 ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);
11fdf7f2
TL
322
323 in->faked_ino = last_used_faked_root;
324 free_faked_inos.erase(in->faked_ino);
325 faked_ino_map[in->faked_ino] = in->vino();
326}
327
7c673cae
FG
328void Client::_release_faked_ino(Inode *in)
329{
330 free_faked_inos.insert(in->faked_ino);
331 faked_ino_map.erase(in->faked_ino);
332}
333
334vinodeno_t Client::_map_faked_ino(ino_t ino)
335{
336 vinodeno_t vino;
337 if (ino == 1)
338 vino = root->vino();
339 else if (faked_ino_map.count(ino))
340 vino = faked_ino_map[ino];
341 else
342 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 343 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
344 return vino;
345}
346
347vinodeno_t Client::map_faked_ino(ino_t ino)
348{
f67539c2 349 std::scoped_lock lock(client_lock);
7c673cae
FG
350 return _map_faked_ino(ino);
351}
352
353// cons/des
354
355Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
f67539c2
TL
356 : Dispatcher(m->cct->get()),
357 timer(m->cct, timer_lock, false),
11fdf7f2
TL
358 messenger(m),
359 monclient(mc),
360 objecter(objecter_),
361 whoami(mc->get_global_id()),
f67539c2
TL
362 mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
363 initialize_state(CLIENT_NEW, "Client::initstate_lock"),
364 cct_deleter{m->cct, [](CephContext *p) {p->put();}},
7c673cae
FG
365 async_ino_invalidator(m->cct),
366 async_dentry_invalidator(m->cct),
367 interrupt_finisher(m->cct),
368 remount_finisher(m->cct),
e306af50 369 async_ino_releasor(m->cct),
7c673cae 370 objecter_finisher(m->cct),
11fdf7f2
TL
371 m_command_hook(this),
372 fscid(0)
7c673cae
FG
373{
374 _reset_faked_inos();
7c673cae 375
7c673cae
FG
376 user_id = cct->_conf->client_mount_uid;
377 group_id = cct->_conf->client_mount_gid;
92f5a8d4
TL
378 fuse_default_permissions = cct->_conf.get_val<bool>(
379 "fuse_default_permissions");
7c673cae 380
33c7a0ef
TL
381 _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
382 "client_collect_and_send_global_metrics");
383
2a845540
TL
384 mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
385 "client_mount_timeout");
386
387 caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
388 "client_caps_release_delay");
389
7c673cae
FG
390 if (cct->_conf->client_acl_type == "posix_acl")
391 acl_type = POSIX_ACL;
392
7c673cae
FG
393 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
394
395 // file handles
396 free_fd_set.insert(10, 1<<30);
397
398 mdsmap.reset(new MDSMap);
399
400 // osd interfaces
401 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
402 &client_lock));
403 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
404 client_flush_set_callback, // all commit callback
405 (void*)this,
406 cct->_conf->client_oc_size,
407 cct->_conf->client_oc_max_objects,
408 cct->_conf->client_oc_max_dirty,
409 cct->_conf->client_oc_target_dirty,
410 cct->_conf->client_oc_max_dirty_age,
411 true));
7c673cae
FG
412}
413
414
415Client::~Client()
416{
9f95a23c 417 ceph_assert(ceph_mutex_is_not_locked(client_lock));
7c673cae 418
f67539c2
TL
419 // If the task is crashed or aborted and doesn't
420 // get any chance to run the umount and shutdow.
421 {
422 std::scoped_lock l{client_lock};
423 tick_thread_stopped = true;
424 upkeep_cond.notify_one();
425 }
426
427 if (upkeeper.joinable())
428 upkeeper.join();
429
31f18b77
FG
430 // It is necessary to hold client_lock, because any inode destruction
431 // may call into ObjectCacher, which asserts that it's lock (which is
432 // client_lock) is held.
f67539c2 433 std::scoped_lock l{client_lock};
7c673cae
FG
434 tear_down_cache();
435}
436
437void Client::tear_down_cache()
438{
439 // fd's
f67539c2
TL
440 for (auto &[fd, fh] : fd_map) {
441 ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
7c673cae
FG
442 _release_fh(fh);
443 }
444 fd_map.clear();
445
446 while (!opened_dirs.empty()) {
447 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 448 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
449 _closedir(dirp);
450 }
451
452 // caps!
453 // *** FIXME ***
454
455 // empty lru
7c673cae 456 trim_cache();
11fdf7f2 457 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
458
459 // close root ino
11fdf7f2 460 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae 461 if (root && inode_map.size() == 1 + root_parents.size()) {
b3b6e05e 462 root.reset();
7c673cae
FG
463 }
464
11fdf7f2 465 ceph_assert(inode_map.empty());
7c673cae
FG
466}
467
468inodeno_t Client::get_root_ino()
469{
f67539c2 470 std::scoped_lock l(client_lock);
7c673cae
FG
471 if (use_faked_inos())
472 return root->faked_ino;
473 else
474 return root->ino;
475}
476
477Inode *Client::get_root()
478{
f67539c2 479 std::scoped_lock l(client_lock);
7c673cae 480 root->ll_get();
b3b6e05e 481 return root.get();
7c673cae
FG
482}
483
484
485// debug crapola
486
487void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
488{
489 filepath path;
490 in->make_long_path(path);
491 ldout(cct, 1) << "dump_inode: "
492 << (disconnected ? "DISCONNECTED ":"")
493 << "inode " << in->ino
494 << " " << path
b3b6e05e 495 << " ref " << in->get_nref()
f67539c2 496 << " " << *in << dendl;
7c673cae
FG
497
498 if (f) {
499 f->open_object_section("inode");
500 f->dump_stream("path") << path;
501 if (disconnected)
502 f->dump_int("disconnected", 1);
503 in->dump(f);
504 f->close_section();
505 }
506
507 did.insert(in);
508 if (in->dir) {
509 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
510 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
511 it != in->dir->dentries.end();
512 ++it) {
513 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
514 if (f) {
515 f->open_object_section("dentry");
516 it->second->dump(f);
517 f->close_section();
518 }
519 if (it->second->inode)
520 dump_inode(f, it->second->inode.get(), did, false);
521 }
522 }
523}
524
525void Client::dump_cache(Formatter *f)
526{
527 set<Inode*> did;
528
11fdf7f2 529 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
530
531 if (f)
532 f->open_array_section("cache");
533
534 if (root)
b3b6e05e 535 dump_inode(f, root.get(), did, true);
7c673cae
FG
536
537 // make a second pass to catch anything disconnected
538 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
539 it != inode_map.end();
540 ++it) {
541 if (did.count(it->second))
542 continue;
543 dump_inode(f, it->second, did, true);
544 }
545
546 if (f)
547 f->close_section();
548}
549
550void Client::dump_status(Formatter *f)
551{
9f95a23c 552 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
553
554 ldout(cct, 1) << __func__ << dendl;
555
556 const epoch_t osd_epoch
557 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
558
559 if (f) {
560 f->open_object_section("metadata");
561 for (const auto& kv : metadata)
562 f->dump_string(kv.first.c_str(), kv.second);
563 f->close_section();
564
565 f->dump_int("dentry_count", lru.lru_get_size());
566 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
567 f->dump_int("id", get_nodeid().v);
11fdf7f2 568 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 569 f->dump_object("inst", inst);
11fdf7f2
TL
570 f->dump_object("addr", inst.addr);
571 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
572 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
573 f->dump_int("inode_count", inode_map.size());
574 f->dump_int("mds_epoch", mdsmap->get_epoch());
575 f->dump_int("osd_epoch", osd_epoch);
576 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f67539c2 577 f->dump_bool("blocklisted", blocklisted);
adb31ebb 578 f->dump_string("fs_name", mdsmap->get_fs_name());
7c673cae
FG
579 }
580}
581
e306af50 582void Client::_pre_init()
7c673cae
FG
583{
584 timer.init();
e306af50
TL
585
586 objecter_finisher.start();
587 filer.reset(new Filer(objecter, &objecter_finisher));
e306af50 588
7c673cae 589 objectcacher->start();
e306af50
TL
590}
591
592int Client::init()
593{
f67539c2
TL
594 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
595 ceph_assert(iref_writer.is_first_writer());
596
e306af50 597 _pre_init();
9f95a23c 598 {
f67539c2 599 std::scoped_lock l{client_lock};
9f95a23c
TL
600 messenger->add_dispatcher_tail(this);
601 }
7c673cae 602 _finish_init();
f67539c2 603 iref_writer.update_state(CLIENT_INITIALIZED);
7c673cae
FG
604 return 0;
605}
606
607void Client::_finish_init()
608{
9f95a23c 609 {
f67539c2 610 std::scoped_lock l{client_lock};
9f95a23c
TL
611 // logger
612 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
613 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
614 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
615 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
616 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
617 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
2a845540
TL
618 // average, standard deviation mds/r/w/ latencies
619 plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
620 plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
621 plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
622 plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
623 plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
624 plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
625 plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
626 plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
627 plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
9f95a23c
TL
628 logger.reset(plb.create_perf_counters());
629 cct->get_perfcounters_collection()->add(logger.get());
630 }
7c673cae 631
11fdf7f2 632 cct->_conf.add_observer(this);
7c673cae
FG
633
634 AdminSocket* admin_socket = cct->get_admin_socket();
635 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
636 &m_command_hook,
637 "show in-progress mds requests");
638 if (ret < 0) {
639 lderr(cct) << "error registering admin socket command: "
640 << cpp_strerror(-ret) << dendl;
641 }
adb31ebb
TL
642 ret = admin_socket->register_command("mds_sessions "
643 "name=cap_dump,type=CephBool,req=false",
7c673cae
FG
644 &m_command_hook,
645 "show mds session state");
646 if (ret < 0) {
647 lderr(cct) << "error registering admin socket command: "
648 << cpp_strerror(-ret) << dendl;
649 }
650 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
651 &m_command_hook,
652 "show in-memory metadata cache contents");
653 if (ret < 0) {
654 lderr(cct) << "error registering admin socket command: "
655 << cpp_strerror(-ret) << dendl;
656 }
657 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
658 &m_command_hook,
659 "kick sessions that were remote reset");
660 if (ret < 0) {
661 lderr(cct) << "error registering admin socket command: "
662 << cpp_strerror(-ret) << dendl;
663 }
664 ret = admin_socket->register_command("status",
7c673cae
FG
665 &m_command_hook,
666 "show overall client status");
667 if (ret < 0) {
668 lderr(cct) << "error registering admin socket command: "
669 << cpp_strerror(-ret) << dendl;
670 }
7c673cae
FG
671}
672
673void Client::shutdown()
674{
11fdf7f2 675 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
676
677 // If we were not mounted, but were being used for sending
678 // MDS commands, we may have sessions that need closing.
9f95a23c 679 {
f67539c2
TL
680 std::scoped_lock l{client_lock};
681
682 // To make sure the tick thread will be stoppped before
683 // destructing the Client, just in case like the _mount()
684 // failed but didn't not get a chance to stop the tick
685 // thread
686 tick_thread_stopped = true;
687 upkeep_cond.notify_one();
688
9f95a23c
TL
689 _close_sessions();
690 }
11fdf7f2 691 cct->_conf.remove_observer(this);
7c673cae 692
11fdf7f2 693 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
694
695 if (ino_invalidate_cb) {
696 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
697 async_ino_invalidator.wait_for_empty();
698 async_ino_invalidator.stop();
699 }
700
701 if (dentry_invalidate_cb) {
702 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
703 async_dentry_invalidator.wait_for_empty();
704 async_dentry_invalidator.stop();
705 }
706
707 if (switch_interrupt_cb) {
708 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
709 interrupt_finisher.wait_for_empty();
710 interrupt_finisher.stop();
711 }
712
713 if (remount_cb) {
714 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
715 remount_finisher.wait_for_empty();
716 remount_finisher.stop();
717 }
718
e306af50
TL
719 if (ino_release_cb) {
720 ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
721 async_ino_releasor.wait_for_empty();
722 async_ino_releasor.stop();
723 }
724
7c673cae 725 objectcacher->stop(); // outside of client_lock! this does a join.
f67539c2
TL
726
727 /*
728 * We are shuting down the client.
729 *
730 * Just declare the state to CLIENT_NEW to block and fail any
731 * new comming "reader" and then try to wait all the in-flight
732 * "readers" to finish.
733 */
734 RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
735 if (!iref_writer.is_first_writer())
736 return;
737 iref_writer.wait_readers_done();
738
9f95a23c 739 {
f67539c2 740 std::scoped_lock l(timer_lock);
9f95a23c
TL
741 timer.shutdown();
742 }
f67539c2 743
7c673cae
FG
744 objecter_finisher.wait_for_empty();
745 objecter_finisher.stop();
746
747 if (logger) {
748 cct->get_perfcounters_collection()->remove(logger.get());
749 logger.reset();
750 }
751}
752
2a845540
TL
753void Client::update_io_stat_metadata(utime_t latency) {
754 auto lat_nsec = latency.to_nsec();
755 // old values are used to compute new ones
756 auto o_avg = logger->tget(l_c_md_avg).to_nsec();
757 auto o_sqsum = logger->get(l_c_md_sqsum);
758
759 auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
760 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
761 nr_metadata_request);
762
763 logger->tinc(l_c_lat, latency);
764 logger->tinc(l_c_reply, latency);
765
766 utime_t avg;
767 avg.set_from_double(n_avg / 1000000000);
768 logger->tset(l_c_md_avg, avg);
769 logger->set(l_c_md_sqsum, n_sqsum);
770 logger->set(l_c_md_ops, nr_metadata_request);
771}
772
773void Client::update_io_stat_read(utime_t latency) {
774 auto lat_nsec = latency.to_nsec();
775 // old values are used to compute new ones
776 auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
777 auto o_sqsum = logger->get(l_c_rd_sqsum);
778
779 auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
780 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
781 nr_read_request);
782
783 logger->tinc(l_c_read, latency);
784
785 utime_t avg;
786 avg.set_from_double(n_avg / 1000000000);
787 logger->tset(l_c_rd_avg, avg);
788 logger->set(l_c_rd_sqsum, n_sqsum);
789 logger->set(l_c_rd_ops, nr_read_request);
790}
791
792void Client::update_io_stat_write(utime_t latency) {
793 auto lat_nsec = latency.to_nsec();
794 // old values are used to compute new ones
795 auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
796 auto o_sqsum = logger->get(l_c_wr_sqsum);
797
798 auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
799 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
800 nr_write_request);
801
802 logger->tinc(l_c_wrlat, latency);
803
804 utime_t avg;
805 avg.set_from_double(n_avg / 1000000000);
806 logger->tset(l_c_wr_avg, avg);
807 logger->set(l_c_wr_sqsum, n_sqsum);
808 logger->set(l_c_wr_ops, nr_write_request);
809}
7c673cae
FG
810
811// ===================
812// metadata cache stuff
813
814void Client::trim_cache(bool trim_kernel_dcache)
815{
181888fb
FG
816 uint64_t max = cct->_conf->client_cache_size;
817 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
818 unsigned last = 0;
819 while (lru.lru_get_size() != last) {
820 last = lru.lru_get_size();
821
f67539c2 822 if (!is_unmounting() && lru.lru_get_size() <= max) break;
7c673cae
FG
823
824 // trim!
31f18b77 825 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
826 if (!dn)
827 break; // done
f67539c2 828
7c673cae
FG
829 trim_dentry(dn);
830 }
831
181888fb 832 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
833 _invalidate_kernel_dcache();
834
835 // hose root?
b3b6e05e 836 if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
7c673cae 837 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
b3b6e05e 838 root.reset();
7c673cae
FG
839 }
840}
841
842void Client::trim_cache_for_reconnect(MetaSession *s)
843{
844 mds_rank_t mds = s->mds_num;
11fdf7f2 845 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
846
847 int trimmed = 0;
848 list<Dentry*> skipped;
849 while (lru.lru_get_size() > 0) {
850 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
851 if (!dn)
852 break;
853
854 if ((dn->inode && dn->inode->caps.count(mds)) ||
855 dn->dir->parent_inode->caps.count(mds)) {
856 trim_dentry(dn);
857 trimmed++;
858 } else
859 skipped.push_back(dn);
860 }
861
862 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
863 lru.lru_insert_mid(*p);
864
11fdf7f2 865 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
866 << " trimmed " << trimmed << " dentries" << dendl;
867
868 if (s->caps.size() > 0)
869 _invalidate_kernel_dcache();
870}
871
872void Client::trim_dentry(Dentry *dn)
873{
874 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
875 << " in dir "
876 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
877 << dendl;
878 if (dn->inode) {
879 Inode *diri = dn->dir->parent_inode;
7c673cae
FG
880 clear_dir_complete_and_ordered(diri, true);
881 }
882 unlink(dn, false, false); // drop dir, drop dentry
883}
884
885
1adf2230
AA
886void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
887 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 888{
7c673cae
FG
889 uint64_t prior_size = in->size;
890
7c673cae
FG
891 if (truncate_seq > in->truncate_seq ||
892 (truncate_seq == in->truncate_seq && size > in->size)) {
893 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
894 in->size = size;
895 in->reported_size = size;
896 if (truncate_seq != in->truncate_seq) {
897 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
898 << truncate_seq << dendl;
899 in->truncate_seq = truncate_seq;
900 in->oset.truncate_seq = truncate_seq;
901
902 // truncate cached file data
903 if (prior_size > size) {
904 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
905 }
906 }
907
908 // truncate inline data
909 if (in->inline_version < CEPH_INLINE_NONE) {
910 uint32_t len = in->inline_data.length();
911 if (size < len)
912 in->inline_data.splice(size, len - size);
913 }
914 }
915 if (truncate_seq >= in->truncate_seq &&
916 in->truncate_size != truncate_size) {
917 if (in->is_file()) {
918 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
919 << truncate_size << dendl;
920 in->truncate_size = truncate_size;
921 in->oset.truncate_size = truncate_size;
922 } else {
923 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
924 }
925 }
1adf2230
AA
926}
927
928void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
929 utime_t ctime, utime_t mtime, utime_t atime)
930{
931 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
932 << " ctime " << ctime << " mtime " << mtime << dendl;
933
934 if (time_warp_seq > in->time_warp_seq)
935 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
936 << " is higher than local time_warp_seq "
937 << in->time_warp_seq << dendl;
938
939 int warn = false;
7c673cae
FG
940 // be careful with size, mtime, atime
941 if (issued & (CEPH_CAP_FILE_EXCL|
942 CEPH_CAP_FILE_WR|
943 CEPH_CAP_FILE_BUFFER|
944 CEPH_CAP_AUTH_EXCL|
945 CEPH_CAP_XATTR_EXCL)) {
946 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
947 if (ctime > in->ctime)
948 in->ctime = ctime;
949 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
950 //the mds updated times, so take those!
951 in->mtime = mtime;
952 in->atime = atime;
953 in->time_warp_seq = time_warp_seq;
954 } else if (time_warp_seq == in->time_warp_seq) {
955 //take max times
956 if (mtime > in->mtime)
957 in->mtime = mtime;
958 if (atime > in->atime)
959 in->atime = atime;
960 } else if (issued & CEPH_CAP_FILE_EXCL) {
961 //ignore mds values as we have a higher seq
962 } else warn = true;
963 } else {
964 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
965 if (time_warp_seq >= in->time_warp_seq) {
966 in->ctime = ctime;
967 in->mtime = mtime;
968 in->atime = atime;
969 in->time_warp_seq = time_warp_seq;
970 } else warn = true;
971 }
972 if (warn) {
973 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
974 << time_warp_seq << " is lower than local time_warp_seq "
975 << in->time_warp_seq
976 << dendl;
977 }
978}
979
980void Client::_fragmap_remove_non_leaves(Inode *in)
981{
982 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
983 if (!in->dirfragtree.is_leaf(p->first))
984 in->fragmap.erase(p++);
985 else
986 ++p;
987}
988
989void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
990{
991 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
992 if (p->second == mds)
993 in->fragmap.erase(p++);
994 else
995 ++p;
996}
997
998Inode * Client::add_update_inode(InodeStat *st, utime_t from,
999 MetaSession *session,
1000 const UserPerm& request_perms)
1001{
1002 Inode *in;
1003 bool was_new = false;
1004 if (inode_map.count(st->vino)) {
1005 in = inode_map[st->vino];
11fdf7f2 1006 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
1007 } else {
1008 in = new Inode(this, st->vino, &st->layout);
1009 inode_map[st->vino] = in;
1010
1011 if (use_faked_inos())
1012 _assign_faked_ino(in);
1013
1014 if (!root) {
1015 root = in;
11fdf7f2 1016 if (use_faked_inos())
b3b6e05e 1017 _assign_faked_root(root.get());
7c673cae
FG
1018 root_ancestor = in;
1019 cwd = root;
f67539c2 1020 } else if (is_mounting()) {
7c673cae
FG
1021 root_parents[root_ancestor] = in;
1022 root_ancestor = in;
1023 }
1024
1025 // immutable bits
1026 in->ino = st->vino.ino;
1027 in->snapid = st->vino.snapid;
1028 in->mode = st->mode & S_IFMT;
1029 was_new = true;
1030 }
1031
1032 in->rdev = st->rdev;
1033 if (in->is_symlink())
1034 in->symlink = st->symlink;
1035
7c673cae 1036 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
1037 bool new_version = false;
1038 if (in->version == 0 ||
1039 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1040 (in->version & ~1) < st->version))
1041 new_version = true;
7c673cae 1042
1adf2230
AA
1043 int issued;
1044 in->caps_issued(&issued);
1045 issued |= in->caps_dirty();
1046 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 1047
1adf2230
AA
1048 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
1049 !(issued & CEPH_CAP_AUTH_EXCL)) {
1050 in->mode = st->mode;
1051 in->uid = st->uid;
1052 in->gid = st->gid;
1053 in->btime = st->btime;
81eedcae 1054 in->snap_btime = st->snap_btime;
f67539c2 1055 in->snap_metadata = st->snap_metadata;
1adf2230 1056 }
7c673cae 1057
1adf2230
AA
1058 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
1059 !(issued & CEPH_CAP_LINK_EXCL)) {
1060 in->nlink = st->nlink;
1061 }
7c673cae 1062
1adf2230
AA
1063 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
1064 update_inode_file_time(in, issued, st->time_warp_seq,
1065 st->ctime, st->mtime, st->atime);
1066 }
7c673cae 1067
1adf2230
AA
1068 if (new_version ||
1069 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 1070 in->layout = st->layout;
1adf2230
AA
1071 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
1072 }
7c673cae 1073
1adf2230
AA
1074 if (in->is_dir()) {
1075 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
1076 in->dirstat = st->dirstat;
1077 }
1078 // dir_layout/rstat/quota are not tracked by capability, update them only if
1079 // the inode stat is from auth mds
1080 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
1081 in->dir_layout = st->dir_layout;
1082 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
1083 in->rstat = st->rstat;
1084 in->quota = st->quota;
11fdf7f2 1085 in->dir_pin = st->dir_pin;
1adf2230
AA
1086 }
1087 // move me if/when version reflects fragtree changes.
1088 if (in->dirfragtree != st->dirfragtree) {
1089 in->dirfragtree = st->dirfragtree;
1090 _fragmap_remove_non_leaves(in);
7c673cae 1091 }
7c673cae
FG
1092 }
1093
1094 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
1095 st->xattrbl.length() &&
1096 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
1097 auto p = st->xattrbl.cbegin();
1098 decode(in->xattrs, p);
7c673cae
FG
1099 in->xattr_version = st->xattr_version;
1100 }
1101
1adf2230
AA
1102 if (st->inline_version > in->inline_version) {
1103 in->inline_data = st->inline_data;
1104 in->inline_version = st->inline_version;
7c673cae
FG
1105 }
1106
1adf2230
AA
1107 /* always take a newer change attr */
1108 if (st->change_attr > in->change_attr)
1109 in->change_attr = st->change_attr;
1110
1111 if (st->version > in->version)
1112 in->version = st->version;
1113
1114 if (was_new)
1115 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
1116
1117 if (!st->cap.caps)
1118 return in; // as with readdir returning indoes in different snaprealms (no caps!)
1119
7c673cae 1120 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
1121 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
1122 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
1123 st->cap.flags, request_perms);
28e407b8 1124 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 1125 in->max_size = st->max_size;
28e407b8
AA
1126 in->rstat = st->rstat;
1127 }
7c673cae 1128
1adf2230
AA
1129 // setting I_COMPLETE needs to happen after adding the cap
1130 if (in->is_dir() &&
1131 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
1132 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
1133 in->dirstat.nfiles == 0 &&
1134 in->dirstat.nsubdirs == 0) {
1135 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
1136 in->flags |= I_COMPLETE | I_DIR_ORDERED;
1137 if (in->dir) {
1138 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
1139 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
1140 in->dir->readdir_cache.clear();
1141 for (const auto& p : in->dir->dentries) {
1142 unlink(p.second, true, true); // keep dir, keep dentry
1143 }
1144 if (in->dir->dentries.empty())
1145 close_dir(in->dir);
7c673cae 1146 }
7c673cae 1147 }
1adf2230
AA
1148 } else {
1149 in->snap_caps |= st->cap.caps;
7c673cae
FG
1150 }
1151
f67539c2 1152 in->fscrypt = st->fscrypt;
7c673cae
FG
1153 return in;
1154}
1155
1156
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
1160Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
1161 Inode *in, utime_t from, MetaSession *session,
1162 Dentry *old_dentry)
1163{
1164 Dentry *dn = NULL;
1165 if (dir->dentries.count(dname))
1166 dn = dir->dentries[dname];
1167
11fdf7f2 1168 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
1169 << " in dir " << dir->parent_inode->vino() << " dn " << dn
1170 << dendl;
1171
1172 if (dn && dn->inode) {
1173 if (dn->inode->vino() == in->vino()) {
1174 touch_dn(dn);
1175 ldout(cct, 12) << " had dentry " << dname
1176 << " with correct vino " << dn->inode->vino()
1177 << dendl;
1178 } else {
1179 ldout(cct, 12) << " had dentry " << dname
1180 << " with WRONG vino " << dn->inode->vino()
1181 << dendl;
1182 unlink(dn, true, true); // keep dir, keep dentry
1183 }
1184 }
1185
1186 if (!dn || !dn->inode) {
1187 InodeRef tmp_ref(in);
1188 if (old_dentry) {
1189 if (old_dentry->dir != dir) {
1190 Inode *old_diri = old_dentry->dir->parent_inode;
7c673cae
FG
1191 clear_dir_complete_and_ordered(old_diri, false);
1192 }
1193 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
1194 }
1195 Inode *diri = dir->parent_inode;
7c673cae
FG
1196 clear_dir_complete_and_ordered(diri, false);
1197 dn = link(dir, dname, in, dn);
1198 }
1199
1200 update_dentry_lease(dn, dlease, from, session);
1201 return dn;
1202}
1203
1204void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1205{
1206 utime_t dttl = from;
1207 dttl += (float)dlease->duration_ms / 1000.0;
f67539c2
TL
1208
1209 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
7c673cae 1210
11fdf7f2 1211 ceph_assert(dn);
7c673cae 1212
9f95a23c 1213 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
1214 if (dttl > dn->lease_ttl) {
1215 ldout(cct, 10) << "got dentry lease on " << dn->name
1216 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1217 dn->lease_ttl = dttl;
1218 dn->lease_mds = session->mds_num;
1219 dn->lease_seq = dlease->seq;
1220 dn->lease_gen = session->cap_gen;
1221 }
1222 }
1223 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
1224 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1225 dn->mark_primary();
f67539c2 1226 dn->alternate_name = std::move(dlease->alternate_name);
7c673cae
FG
1227}
1228
1229
/*
 * update MDS location cache for a single inode
 */
522d829b 1233void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
7c673cae
FG
1234{
1235 // auth
1236 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1237 if (dst->auth >= 0) {
1238 in->fragmap[dst->frag] = dst->auth;
1239 } else {
1240 in->fragmap.erase(dst->frag);
1241 }
1242 if (!in->dirfragtree.is_leaf(dst->frag)) {
1243 in->dirfragtree.force_to_leaf(cct, dst->frag);
1244 _fragmap_remove_non_leaves(in);
1245 }
1246
522d829b
TL
1247 // replicated, only update from auth mds reply
1248 if (from == dst->auth) {
1249 in->dir_replicated = !dst->dist.empty();
1250 if (!dst->dist.empty())
1251 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1252 else
1253 in->frag_repmap.erase(dst->frag);
1254 }
7c673cae
FG
1255}
1256
1257void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1258{
f91f0fd5
TL
1259 if (complete)
1260 diri->dir_release_count++;
1261 else
1262 diri->dir_ordered_count++;
7c673cae
FG
1263 if (diri->flags & I_COMPLETE) {
1264 if (complete) {
1265 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1266 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1267 } else {
1268 if (diri->flags & I_DIR_ORDERED) {
1269 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1270 diri->flags &= ~I_DIR_ORDERED;
1271 }
1272 }
1273 if (diri->dir)
1274 diri->dir->readdir_cache.clear();
1275 }
1276}
1277
1278/*
1279 * insert results from readdir or lssnap into the metadata cache.
1280 */
1281void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1282
11fdf7f2 1283 auto& reply = request->reply;
7c673cae 1284 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1285 uint64_t features;
1286 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1287 features = (uint64_t)-1;
1288 }
1289 else {
1290 features = con->get_features();
1291 }
7c673cae
FG
1292
1293 dir_result_t *dirp = request->dirp;
11fdf7f2 1294 ceph_assert(dirp);
7c673cae
FG
1295
1296 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1297 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
1298 if (!p.end()) {
1299 // snapdir?
1300 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1301 ceph_assert(diri);
7c673cae
FG
1302 diri = open_snapdir(diri);
1303 }
1304
1305 // only open dir if we're actually adding stuff to it!
1306 Dir *dir = diri->open_dir();
11fdf7f2 1307 ceph_assert(dir);
7c673cae
FG
1308
1309 // dirstat
11fdf7f2 1310 DirStat dst(p, features);
7c673cae
FG
1311 __u32 numdn;
1312 __u16 flags;
11fdf7f2
TL
1313 decode(numdn, p);
1314 decode(flags, p);
7c673cae
FG
1315
1316 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1317 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1318
1319 frag_t fg = (unsigned)request->head.args.readdir.frag;
1320 unsigned readdir_offset = dirp->next_offset;
1321 string readdir_start = dirp->last_name;
11fdf7f2 1322 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
1323
1324 unsigned last_hash = 0;
1325 if (hash_order) {
1326 if (!readdir_start.empty()) {
1327 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1328 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1329 /* mds understands offset_hash */
1330 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1331 }
1332 }
1333
1334 if (fg != dst.frag) {
1335 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1336 fg = dst.frag;
1337 if (!hash_order) {
1338 readdir_offset = 2;
1339 readdir_start.clear();
1340 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1341 }
1342 }
1343
1344 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1345 << ", hash_order=" << hash_order
1346 << ", readdir_start " << readdir_start
1347 << ", last_hash " << last_hash
1348 << ", next_offset " << readdir_offset << dendl;
1349
1350 if (diri->snapid != CEPH_SNAPDIR &&
1351 fg.is_leftmost() && readdir_offset == 2 &&
1352 !(hash_order && last_hash)) {
1353 dirp->release_count = diri->dir_release_count;
1354 dirp->ordered_count = diri->dir_ordered_count;
1355 dirp->start_shared_gen = diri->shared_gen;
1356 dirp->cache_index = 0;
1357 }
1358
1359 dirp->buffer_frag = fg;
1360
1361 _readdir_drop_dirp_buffer(dirp);
1362 dirp->buffer.reserve(numdn);
1363
1364 string dname;
1365 LeaseStat dlease;
1366 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
1367 decode(dname, p);
1368 dlease.decode(p, features);
7c673cae
FG
1369 InodeStat ist(p, features);
1370
1371 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1372
1373 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1374 request->perms);
1375 Dentry *dn;
1376 if (diri->dir->dentries.count(dname)) {
1377 Dentry *olddn = diri->dir->dentries[dname];
1378 if (olddn->inode != in) {
1379 // replace incorrect dentry
1380 unlink(olddn, true, true); // keep dir, dentry
1381 dn = link(dir, dname, in, olddn);
11fdf7f2 1382 ceph_assert(dn == olddn);
7c673cae
FG
1383 } else {
1384 // keep existing dn
1385 dn = olddn;
1386 touch_dn(dn);
1387 }
1388 } else {
1389 // new dn
1390 dn = link(dir, dname, in, NULL);
1391 }
f67539c2 1392 dn->alternate_name = std::move(dlease.alternate_name);
7c673cae
FG
1393
1394 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1395 if (hash_order) {
1396 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1397 if (hash != last_hash)
1398 readdir_offset = 2;
1399 last_hash = hash;
1400 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1401 } else {
1402 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1403 }
1404 // add to readdir cache
1405 if (dirp->release_count == diri->dir_release_count &&
1406 dirp->ordered_count == diri->dir_ordered_count &&
1407 dirp->start_shared_gen == diri->shared_gen) {
1408 if (dirp->cache_index == dir->readdir_cache.size()) {
1409 if (i == 0) {
11fdf7f2 1410 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
1411 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1412 }
1413 dir->readdir_cache.push_back(dn);
1414 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1415 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1416 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
1417 else
1418 dir->readdir_cache[dirp->cache_index] = dn;
1419 } else {
11fdf7f2 1420 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
1421 }
1422 dirp->cache_index++;
1423 }
1424 // add to cached result list
f67539c2 1425 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
7c673cae
FG
1426 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1427 }
1428
1429 if (numdn > 0)
1430 dirp->last_name = dname;
1431 if (end)
1432 dirp->next_offset = 2;
1433 else
1434 dirp->next_offset = readdir_offset;
1435
1436 if (dir->is_empty())
1437 close_dir(dir);
1438 }
1439}
1440
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
1445Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1446{
11fdf7f2 1447 auto& reply = request->reply;
7c673cae
FG
1448 int op = request->get_op();
1449
1450 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1451 << " is_target=" << (int)reply->head.is_target
1452 << " is_dentry=" << (int)reply->head.is_dentry
1453 << dendl;
1454
11fdf7f2 1455 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
1456 if (request->got_unsafe) {
1457 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1458 ceph_assert(p.end());
7c673cae
FG
1459 return NULL;
1460 }
1461
1462 if (p.end()) {
1463 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1464
1465 Dentry *d = request->dentry();
1466 if (d) {
1467 Inode *diri = d->dir->parent_inode;
7c673cae
FG
1468 clear_dir_complete_and_ordered(diri, true);
1469 }
1470
1471 if (d && reply->get_result() == 0) {
1472 if (op == CEPH_MDS_OP_RENAME) {
1473 // rename
1474 Dentry *od = request->old_dentry();
1475 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1476 ceph_assert(od);
7c673cae
FG
1477 unlink(od, true, true); // keep dir, dentry
1478 } else if (op == CEPH_MDS_OP_RMDIR ||
1479 op == CEPH_MDS_OP_UNLINK) {
1480 // unlink, rmdir
1481 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1482 unlink(d, true, true); // keep dir, dentry
1483 }
1484 }
1485 return NULL;
1486 }
1487
1488 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1489 uint64_t features;
1490 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1491 features = (uint64_t)-1;
1492 }
1493 else {
1494 features = con->get_features();
1495 }
7c673cae
FG
1496 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1497
1498 // snap trace
1499 SnapRealm *realm = NULL;
1500 if (reply->snapbl.length())
1501 update_snap_trace(reply->snapbl, &realm);
1502
1503 ldout(cct, 10) << " hrm "
1504 << " is_target=" << (int)reply->head.is_target
1505 << " is_dentry=" << (int)reply->head.is_dentry
1506 << dendl;
1507
1508 InodeStat dirst;
1509 DirStat dst;
1510 string dname;
1511 LeaseStat dlease;
1512 InodeStat ist;
1513
1514 if (reply->head.is_dentry) {
1515 dirst.decode(p, features);
11fdf7f2
TL
1516 dst.decode(p, features);
1517 decode(dname, p);
1518 dlease.decode(p, features);
7c673cae
FG
1519 }
1520
1521 Inode *in = 0;
1522 if (reply->head.is_target) {
1523 ist.decode(p, features);
1524 if (cct->_conf->client_debug_getattr_caps) {
1525 unsigned wanted = 0;
1526 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1527 wanted = request->head.args.getattr.mask;
1528 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1529 wanted = request->head.args.open.mask;
1530
1531 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1532 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1533 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
1534 }
1535
1536 in = add_update_inode(&ist, request->sent_stamp, session,
1537 request->perms);
1538 }
1539
1540 Inode *diri = NULL;
1541 if (reply->head.is_dentry) {
1542 diri = add_update_inode(&dirst, request->sent_stamp, session,
1543 request->perms);
522d829b
TL
1544 mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
1545 update_dir_dist(diri, &dst, from_mds); // dir stat info is attached to ..
7c673cae
FG
1546
1547 if (in) {
1548 Dir *dir = diri->open_dir();
1549 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1550 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1551 } else {
1552 Dentry *dn = NULL;
1553 if (diri->dir && diri->dir->dentries.count(dname)) {
1554 dn = diri->dir->dentries[dname];
1555 if (dn->inode) {
7c673cae
FG
1556 clear_dir_complete_and_ordered(diri, false);
1557 unlink(dn, true, true); // keep dir, dentry
1558 }
1559 }
1560 if (dlease.duration_ms > 0) {
1561 if (!dn) {
1562 Dir *dir = diri->open_dir();
1563 dn = link(dir, dname, NULL, NULL);
1564 }
1565 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1566 }
1567 }
1568 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1569 op == CEPH_MDS_OP_MKSNAP) {
1570 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1571 // fake it for snap lookup
1572 vinodeno_t vino = ist.vino;
1573 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1574 ceph_assert(inode_map.count(vino));
7c673cae
FG
1575 diri = inode_map[vino];
1576
1577 string dname = request->path.last_dentry();
1578
1579 LeaseStat dlease;
1580 dlease.duration_ms = 0;
1581
1582 if (in) {
1583 Dir *dir = diri->open_dir();
1584 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1585 } else {
1586 if (diri->dir && diri->dir->dentries.count(dname)) {
1587 Dentry *dn = diri->dir->dentries[dname];
1588 if (dn->inode)
1589 unlink(dn, true, true); // keep dir, dentry
1590 }
1591 }
1592 }
1593
1594 if (in) {
1595 if (op == CEPH_MDS_OP_READDIR ||
1596 op == CEPH_MDS_OP_LSSNAP) {
1597 insert_readdir_results(request, session, in);
1598 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1599 // hack: return parent inode instead
1600 in = diri;
1601 }
1602
1603 if (request->dentry() == NULL && in != request->inode()) {
1604 // pin the target inode if its parent dentry is not pinned
1605 request->set_other_inode(in);
1606 }
1607 }
1608
1609 if (realm)
1610 put_snap_realm(realm);
1611
1612 request->target = in;
1613 return in;
1614}
1615
1616// -------
1617
1618mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1619{
1620 mds_rank_t mds = MDS_RANK_NONE;
1621 __u32 hash = 0;
1622 bool is_hash = false;
2a845540 1623 int issued = 0;
7c673cae
FG
1624
1625 Inode *in = NULL;
1626 Dentry *de = NULL;
7c673cae
FG
1627
1628 if (req->resend_mds >= 0) {
1629 mds = req->resend_mds;
1630 req->resend_mds = -1;
11fdf7f2 1631 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
1632 goto out;
1633 }
1634
1635 if (cct->_conf->client_use_random_mds)
1636 goto random_mds;
1637
1638 in = req->inode();
1639 de = req->dentry();
1640 if (in) {
11fdf7f2 1641 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
1642 if (req->path.depth()) {
1643 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1644 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1645 << " on " << req->path[0]
1646 << " => " << hash << dendl;
1647 is_hash = true;
1648 }
1649 } else if (de) {
1650 if (de->inode) {
1651 in = de->inode.get();
11fdf7f2 1652 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
1653 } else {
1654 in = de->dir->parent_inode;
1655 hash = in->hash_dentry_name(de->name);
11fdf7f2 1656 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1657 << " on " << de->name
1658 << " => " << hash << dendl;
1659 is_hash = true;
1660 }
1661 }
1662 if (in) {
1663 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1664 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
1665 while (in->snapid != CEPH_NOSNAP) {
1666 if (in->snapid == CEPH_SNAPDIR)
1667 in = in->snapdir_parent.get();
11fdf7f2 1668 else if (!in->dentries.empty())
7c673cae
FG
1669 /* In most cases there will only be one dentry, so getting it
1670 * will be the correct action. If there are multiple hard links,
1671 * I think the MDS should be able to redirect as needed*/
1672 in = in->get_first_parent()->dir->parent_inode;
1673 else {
1674 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1675 break;
1676 }
1677 }
1678 is_hash = false;
1679 }
1680
11fdf7f2 1681 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
1682 << " hash=" << hash << dendl;
1683
2a845540
TL
1684 if (req->get_op() == CEPH_MDS_OP_GETATTR)
1685 issued = req->inode()->caps_issued();
1686
f67539c2 1687 if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
7c673cae 1688 frag_t fg = in->dirfragtree[hash];
2a845540 1689 if (!req->auth_is_best(issued)) {
f67539c2
TL
1690 auto repmapit = in->frag_repmap.find(fg);
1691 if (repmapit != in->frag_repmap.end()) {
1692 auto& repmap = repmapit->second;
1693 auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
1694 mds = repmap.at(r);
1695 }
1696 } else if (in->fragmap.count(fg)) {
7c673cae
FG
1697 mds = in->fragmap[fg];
1698 if (phash_diri)
1699 *phash_diri = in;
91327a77 1700 } else if (in->auth_cap) {
f67539c2 1701 req->send_to_auth = true;
91327a77
AA
1702 mds = in->auth_cap->session->mds_num;
1703 }
1704 if (mds >= 0) {
11fdf7f2 1705 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
1706 goto out;
1707 }
1708 }
1709
2a845540 1710 if (in->auth_cap && req->auth_is_best(issued)) {
11fdf7f2
TL
1711 mds = in->auth_cap->session->mds_num;
1712 } else if (!in->caps.empty()) {
1713 mds = in->caps.begin()->second.session->mds_num;
1714 } else {
7c673cae 1715 goto random_mds;
11fdf7f2
TL
1716 }
1717 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
1718
1719 goto out;
1720 }
1721
1722random_mds:
1723 if (mds < 0) {
1724 mds = _get_random_up_mds();
1725 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1726 }
1727
1728out:
1729 ldout(cct, 20) << "mds is " << mds << dendl;
1730 return mds;
1731}
1732
7c673cae
FG
1733void Client::connect_mds_targets(mds_rank_t mds)
1734{
11fdf7f2
TL
1735 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1736 ceph_assert(mds_sessions.count(mds));
7c673cae 1737 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
f67539c2
TL
1738 for (const auto &rank : info.export_targets) {
1739 if (mds_sessions.count(rank) == 0 &&
1740 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
7c673cae 1741 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
f67539c2
TL
1742 << " export target mds." << rank << dendl;
1743 _open_mds_session(rank);
7c673cae
FG
1744 }
1745 }
1746}
1747
adb31ebb 1748void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1749{
1750 f->dump_int("id", get_nodeid().v);
11fdf7f2 1751 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1752 f->dump_object("inst", inst);
1753 f->dump_stream("inst_str") << inst;
1754 f->dump_stream("addr_str") << inst.addr;
7c673cae 1755 f->open_array_section("sessions");
11fdf7f2 1756 for (const auto &p : mds_sessions) {
7c673cae 1757 f->open_object_section("session");
20effc67 1758 p.second->dump(f, cap_dump);
7c673cae
FG
1759 f->close_section();
1760 }
1761 f->close_section();
1762 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1763}
f67539c2 1764
7c673cae
FG
1765void Client::dump_mds_requests(Formatter *f)
1766{
1767 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1768 p != mds_requests.end();
1769 ++p) {
1770 f->open_object_section("request");
1771 p->second->dump(f);
1772 f->close_section();
1773 }
1774}
1775
9f95a23c 1776int Client::verify_reply_trace(int r, MetaSession *session,
11fdf7f2 1777 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
1778 InodeRef *ptarget, bool *pcreated,
1779 const UserPerm& perms)
1780{
1781 // check whether this request actually did the create, and set created flag
1782 bufferlist extra_bl;
1783 inodeno_t created_ino;
1784 bool got_created_ino = false;
1785 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1786
11fdf7f2 1787 extra_bl = reply->get_extra_bl();
7c673cae 1788 if (extra_bl.length() >= 8) {
9f95a23c
TL
1789 if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
1790 struct openc_response_t ocres;
1791
1792 decode(ocres, extra_bl);
1793 created_ino = ocres.created_ino;
1794 /*
1795 * The userland cephfs client doesn't have a way to do an async create
1796 * (yet), so just discard delegated_inos for now. Eventually we should
1797 * store them and use them in create calls, even if they are synchronous,
1798 * if only for testing purposes.
1799 */
1800 ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
1801 } else {
1802 // u64 containing number of created ino
1803 decode(created_ino, extra_bl);
1804 }
7c673cae 1805 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
9f95a23c 1806 got_created_ino = true;
7c673cae
FG
1807 }
1808
1809 if (pcreated)
1810 *pcreated = got_created_ino;
1811
1812 if (request->target) {
1813 *ptarget = request->target;
1814 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1815 } else {
1816 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1817 (*ptarget) = p->second;
1818 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1819 } else {
1820 // we got a traceless reply, and need to look up what we just
1821 // created. for now, do this by name. someday, do this by the
1822 // ino... which we know! FIXME.
1823 InodeRef target;
1824 Dentry *d = request->dentry();
1825 if (d) {
1826 if (d->dir) {
1827 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1828 << d->dir->parent_inode->ino << "/" << d->name
1829 << " got_ino " << got_created_ino
1830 << " ino " << created_ino
1831 << dendl;
1832 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1833 &target, perms);
1834 } else {
1835 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1836 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
1837 }
1838 } else {
1839 Inode *in = request->inode();
1840 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1841 << in->ino << dendl;
1842 r = _getattr(in, request->regetattr_mask, perms, true);
1843 target = in;
1844 }
1845 if (r >= 0) {
1846 // verify ino returned in reply and trace_dist are the same
1847 if (got_created_ino &&
1848 created_ino.val != target->ino.val) {
1849 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
f67539c2 1850 r = -CEPHFS_EINTR;
7c673cae
FG
1851 }
1852 if (ptarget)
1853 ptarget->swap(target);
1854 }
1855 }
1856 }
1857
1858 return r;
1859}
1860
1861
1862/**
1863 * make a request
1864 *
1865 * Blocking helper to make an MDS request.
1866 *
1867 * If the ptarget flag is set, behavior changes slightly: the caller
1868 * expects to get a pointer to the inode we are creating or operating
1869 * on. As a result, we will follow up any traceless mutation reply
1870 * with a getattr or lookup to transparently handle a traceless reply
1871 * from the MDS (as when the MDS restarts and the client has to replay
1872 * a request).
1873 *
1874 * @param request the MetaRequest to execute
1875 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1876 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1877 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1878 * @param use_mds [optional] prefer a specific mds (-1 for default)
1879 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1880 */
/*
 * Register, send and synchronously wait out an MDS request.
 *
 * Runs under client_lock (temporarily handed to the condition variable
 * while waiting).  Loops choosing a target MDS, opening a session if
 * needed and (re)sending the request until either a reply arrives or
 * the request is aborted (e.g. blocklisting or a rejected session).
 *
 * See the comment block above for the ptarget/pcreated/pdirbl contract.
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();
  request->created = ceph::coarse_mono_clock::now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests can block indefinitely, so they are excluded
  // from the oldest_tid bookkeeping (mirrored in unregister_request()).
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSessionRef session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      // Target is not usable.  A NULL state beyond max_mds means the
      // rank was stopped; either drop it from the dir fragment map or
      // pick a random up MDS.  Otherwise wait for a newer mdsmap.
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-CEPHFS_EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      // re-check: session may have failed to reach OPEN/STALE
      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // Hand client_lock (already held) to the unique_lock for the wait,
    // then release() so we keep holding it afterwards without a second
    // unlock at scope exit.
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	           // reply
	      request->resend_mds >= 0 ||  // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // Aborted without ever getting a reply; hand back the abort code.
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!): handle_client_reply() is parked on
  // dispatch_cond until we have consumed the reply.
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // Traceless-reply handling (see comment above the function).
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;

  ++nr_metadata_request;
  update_io_stat_metadata(lat);

  put_request(request);
  return r;
}
2028
2029void Client::unregister_request(MetaRequest *req)
2030{
2031 mds_requests.erase(req->tid);
2032 if (req->tid == oldest_tid) {
2033 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2034 while (true) {
2035 if (p == mds_requests.end()) {
2036 oldest_tid = 0;
2037 break;
2038 }
2039 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2040 oldest_tid = p->first;
2041 break;
2042 }
2043 ++p;
2044 }
2045 }
2046 put_request(req);
2047}
2048
2049void Client::put_request(MetaRequest *request)
2050{
2051 if (request->_put()) {
2052 int op = -1;
2053 if (request->success)
2054 op = request->get_op();
2055 InodeRef other_in;
2056 request->take_other_inode(&other_in);
2057 delete request;
2058
2059 if (other_in &&
2060 (op == CEPH_MDS_OP_RMDIR ||
2061 op == CEPH_MDS_OP_RENAME ||
2062 op == CEPH_MDS_OP_RMSNAP)) {
2063 _try_to_trim_inode(other_in.get(), false);
2064 }
2065 }
2066}
2067
/*
 * Possibly append a cap release for 'in' to req->cap_releases.
 *
 * Drops the caps in 'drop' (minus anything dirty or in use) unless the
 * MDS also issued a cap in 'unless'.  With 'force', a release record is
 * emitted even when no caps were actually dropped.
 *
 * Returns 1 (or 'force') if a release record was appended, else 0.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
				 mds_rank_t mds, int drop,
				 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
		 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
		 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop caps that are dirty or currently in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      // no longer wanting any write cap on the auth MDS means any
      // previously requested max_size is moot
      if (&cap == in->auth_cap &&
	  !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
	in->requested_max_size = 0;
	ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // build the wire-format release record from the (updated) cap
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
		 << released << dendl;
  return released;
}
2113
/*
 * Append a dentry lease release to the request, piggybacked on a forced
 * inode release of the dentry's parent directory.  If we hold a lease on
 * this dentry from 'mds', attach the dentry name/seq to the release that
 * encode_inode_release() just pushed, and forget the lease locally.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
				   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
		 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);  // force=1 so a record always exists
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // fill in the dentry part of the release record appended above
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
    dn->lease_mds = -1;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
		 << dn << ")" << dendl;
}
2134
2135
2136/*
2137 * This requires the MClientRequest *request member to be set.
2138 * It will error out horribly without one.
2139 * Additionally, if you set any *drop member, you'd better have
2140 * set the corresponding dentry!
2141 */
2142void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2143{
11fdf7f2 2144 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
2145 << req << ", mds: " << mds << ")" << dendl;
2146 if (req->inode_drop && req->inode())
2147 encode_inode_release(req->inode(), req,
2148 mds, req->inode_drop,
2149 req->inode_unless);
2150
2151 if (req->old_inode_drop && req->old_inode())
2152 encode_inode_release(req->old_inode(), req,
2153 mds, req->old_inode_drop,
2154 req->old_inode_unless);
2155 if (req->other_inode_drop && req->other_inode())
2156 encode_inode_release(req->other_inode(), req,
2157 mds, req->other_inode_drop,
2158 req->other_inode_unless);
2159
2160 if (req->dentry_drop && req->dentry())
2161 encode_dentry_release(req->dentry(), req,
2162 mds, req->dentry_drop,
2163 req->dentry_unless);
2164
2165 if (req->old_dentry_drop && req->old_dentry())
2166 encode_dentry_release(req->old_dentry(), req,
2167 mds, req->old_dentry_drop,
2168 req->old_dentry_unless);
11fdf7f2 2169 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
2170 << req << ", mds " << mds <<dendl;
2171}
2172
2173bool Client::have_open_session(mds_rank_t mds)
2174{
11fdf7f2
TL
2175 const auto &it = mds_sessions.find(mds);
2176 return it != mds_sessions.end() &&
20effc67
TL
2177 (it->second->state == MetaSession::STATE_OPEN ||
2178 it->second->state == MetaSession::STATE_STALE);
7c673cae
FG
2179}
2180
20effc67 2181MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
7c673cae 2182{
11fdf7f2 2183 const auto &it = mds_sessions.find(mds);
20effc67 2184 if (it == mds_sessions.end() || it->second->con != con) {
7c673cae 2185 return NULL;
11fdf7f2 2186 } else {
20effc67 2187 return it->second;
11fdf7f2 2188 }
7c673cae
FG
2189}
2190
20effc67 2191MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
7c673cae 2192{
11fdf7f2 2193 auto it = mds_sessions.find(mds);
20effc67 2194 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
7c673cae
FG
2195}
2196
2197/**
2198 * Populate a map of strings with client-identifying metadata,
2199 * such as the hostname. Call this once at initialization.
2200 */
2201void Client::populate_metadata(const std::string &mount_root)
2202{
2203 // Hostname
f67539c2
TL
2204#ifdef _WIN32
2205 // TODO: move this to compat.h
2206 char hostname[64];
2207 DWORD hostname_sz = 64;
2208 GetComputerNameA(hostname, &hostname_sz);
2209 metadata["hostname"] = hostname;
2210#else
7c673cae
FG
2211 struct utsname u;
2212 int r = uname(&u);
2213 if (r >= 0) {
2214 metadata["hostname"] = u.nodename;
2215 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2216 } else {
2217 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2218 }
f67539c2 2219#endif
7c673cae
FG
2220
2221 metadata["pid"] = stringify(getpid());
2222
2223 // Ceph entity id (the '0' in "client.0")
2224 metadata["entity_id"] = cct->_conf->name.get_id();
2225
2226 // Our mount position
2227 if (!mount_root.empty()) {
2228 metadata["root"] = mount_root;
2229 }
2230
2231 // Ceph version
2232 metadata["ceph_version"] = pretty_version_to_str();
2233 metadata["ceph_sha1"] = git_version_to_str();
2234
2235 // Apply any metadata from the user's configured overrides
2236 std::vector<std::string> tokens;
2237 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2238 for (const auto &i : tokens) {
2239 auto eqpos = i.find("=");
2240 // Throw out anything that isn't of the form "<str>=<str>"
2241 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2242 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2243 continue;
2244 }
2245 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2246 }
2247}
2248
2249/**
2250 * Optionally add or override client metadata fields.
2251 */
2252void Client::update_metadata(std::string const &k, std::string const &v)
2253{
f67539c2
TL
2254 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2255 ceph_assert(iref_reader.is_state_satisfied());
2256
2257 std::scoped_lock l(client_lock);
7c673cae 2258
11fdf7f2
TL
2259 auto it = metadata.find(k);
2260 if (it != metadata.end()) {
7c673cae 2261 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2262 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2263 }
2264
2265 metadata[k] = v;
2266}
2267
/*
 * Create a new session object for 'mds' and send the REQUEST_OPEN
 * handshake carrying our metadata and supported feature bits.  The
 * session starts out un-open; handle_client_session() moves it to OPEN
 * (or the caller sees REJECTED).  Asserts no session already exists.
 */
MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
  ceph_assert(em.second); /* not already present */
  auto session = em.first->second;

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
  session->con->send_message2(std::move(m));
  return session;
}
2285
/*
 * Ask the MDS to close this session: mark it CLOSING and send
 * REQUEST_CLOSE.  Final teardown happens in _closed_mds_session()
 * when the MDS acks (or the session is torn down forcibly).
 */
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2292
/*
 * Tear down a session locally: drop the connection, wake any waiters,
 * release the session's caps (failing them with 'err') and kick its
 * outstanding requests.  A 'rejected' session that we were not already
 * closing is kept in mds_sessions in the REJECTED state so callers can
 * observe the rejection; otherwise the entry is erased.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // only fully-closed sessions are forgotten; REJECTED entries remain
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2309
/*
 * Dispatch an incoming MClientSession message from an MDS.  Messages
 * arriving on a connection that no longer matches our session for that
 * rank are discarded.  Runs under client_lock.
 */
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse the session if the MDS lacks features we require.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	_close_mds_session(session.get());
	_closed_mds_session(session.get(), -CEPHFS_EPERM, true);
	break;
      }
      session->mds_features = std::move(m->supported_features);
      session->mds_metric_flags = std::move(m->metric_spec.metric_flags);

      renew_caps(session.get());
      session->state = MetaSession::STATE_OPEN;
      if (is_unmounting())
	mount_cond.notify_all();
      else
	connect_mds_targets(from);
      // wake anyone blocked in make_request() on session open
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session.get());
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our latest renewal attempt
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session.get(), false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases: bump cap_gen and force the ttl
    // into the past, then try to renew
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session.get());
    break;

  case CEPH_SESSION_RECALL_STATE:
    /*
     * Call the renew caps and flush cap releases just before
     * trimming the caps in case the tick() won't get a chance
     * to run them, which could cause the client to be blocklisted
     * and MDS daemons trying to recall the caps again and
     * again.
     *
     * In most cases it will do nothing, and the new cap releases
     * added by trim_caps() followed will be deferred flushing
     * by tick().
     */
    renew_and_flush_cap_releases();
    trim_caps(session.get(), m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NB: the local 'm' here deliberately shadows the incoming message;
    // it is the pending release message stashed on the session.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session.get());
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2415
2416bool Client::_any_stale_sessions() const
2417{
9f95a23c 2418 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2419
11fdf7f2 2420 for (const auto &p : mds_sessions) {
20effc67 2421 if (p.second->state == MetaSession::STATE_STALE) {
7c673cae
FG
2422 return true;
2423 }
2424 }
2425
2426 return false;
2427}
2428
2429void Client::_kick_stale_sessions()
2430{
11fdf7f2 2431 ldout(cct, 1) << __func__ << dendl;
7c673cae 2432
11fdf7f2 2433 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
20effc67
TL
2434 auto s = it->second;
2435 if (s->state == MetaSession::STATE_REJECTED) {
2436 mds_sessions.erase(it->first);
f6b5b4d7
TL
2437 continue;
2438 }
20effc67
TL
2439 if (s->state == MetaSession::STATE_STALE)
2440 _closed_mds_session(s.get());
7c673cae
FG
2441 }
2442}
2443
/*
 * Build the wire message for a MetaRequest and send it to the given
 * session.  Replayed (previously-unsafe) requests are flagged so the
 * MDS can match them; fresh requests carry our pending cap releases
 * unless drop_cap_releases asks us to discard them (pre-reconnect).
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already saw an unsafe reply: this is a replay of the same op
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  // setxattr may change the file layout; the MDS needs our osdmap epoch
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // only stamp the first send; resends keep the original latency base
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq we sent against, if we hold a cap
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2492
/*
 * Translate a MetaRequest into an MClientRequest wire message.  If the
 * request has no filepath yet, derive one from its inode or dentry.
 * Bumps retry_attempt on every build (each (re)send counts as a retry).
 */
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// no inode yet: path is parent dir + the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
	   << " No path, inode, or appropriately-endowed dentry given!"
	   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
	   << " No path, inode, or dentry given!"
	   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_alternate_name(request->alternate_name);
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2531
2532
2533
11fdf7f2 2534void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2535{
2536 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
f67539c2
TL
2537
2538 std::scoped_lock cl(client_lock);
20effc67 2539 auto session = _get_mds_session(mds, fwd->get_connection().get());
7c673cae 2540 if (!session) {
7c673cae
FG
2541 return;
2542 }
2543 ceph_tid_t tid = fwd->get_tid();
2544
2545 if (mds_requests.count(tid) == 0) {
11fdf7f2 2546 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2547 return;
2548 }
2549
2550 MetaRequest *request = mds_requests[tid];
11fdf7f2 2551 ceph_assert(request);
7c673cae 2552
33c7a0ef
TL
2553 /*
2554 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2555 * is 'int32_t', while in 'ceph_mds_request_head' the
2556 * type is '__u8'. So in case the request bounces between
2557 * MDSes exceeding 256 times, the client will get stuck.
2558 *
2559 * In this case it's ususally a bug in MDS and continue
2560 * bouncing the request makes no sense.
2561 *
2562 * In future this could be fixed in ceph code, so avoid
2563 * using the hardcode here.
2564 */
2565 int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
2566 max_fwd = 1 << (max_fwd * CHAR_BIT) - 1;
2567 auto num_fwd = fwd->get_num_fwd();
2568 if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
2569 if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
2570 request->abort(-EMULTIHOP);
2571 request->caller_cond->notify_all();
2572 ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
2573 << ", abort it" << dendl;
2574 } else {
2575 ldout(cct, 10) << __func__ << " tid " << tid
2576 << " old fwd seq " << fwd->get_num_fwd()
2577 << " <= req fwd " << request->num_fwd
2578 << ", ignore it" << dendl;
2579 }
2580 return;
2581 }
2582
7c673cae
FG
2583 // reset retry counter
2584 request->retry_attempt = 0;
2585
2586 // request not forwarded, or dest mds has no session.
2587 // resend.
11fdf7f2 2588 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2589 << " fwd " << fwd->get_num_fwd()
2590 << " to mds." << fwd->get_dest_mds()
2591 << ", resending to " << fwd->get_dest_mds()
2592 << dendl;
2593
2594 request->mds = -1;
2595 request->item.remove_myself();
33c7a0ef 2596 request->num_fwd = num_fwd;
7c673cae 2597 request->resend_mds = fwd->get_dest_mds();
9f95a23c 2598 request->caller_cond->notify_all();
7c673cae
FG
2599}
2600
2601bool Client::is_dir_operation(MetaRequest *req)
2602{
2603 int op = req->get_op();
2604 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2605 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2606 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2607 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2608 return true;
2609 return false;
2610}
2611
/*
 * Handle an MClientReply for a pending request.  An "unsafe" reply means
 * the MDS has applied the change but not journaled it; a later "safe"
 * reply confirms durability.  On the first reply we wake the caller in
 * make_request() and then park on dispatch_cond until the caller has
 * consumed the reply (a strict two-step handshake).  Runs under
 * client_lock, which is temporarily handed to the condvar while parked.
 */
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  // a second unsafe reply for the same tid is a duplicate; drop it
  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session.get());

  // Handle unsafe reply: track it so we can replay on MDS failover and
  // so fsync/unmount can wait for the safe ack.
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back: make_request() clears dispatch_cond and
    // notifies once it has consumed the reply
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
	ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
		       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    l.release();
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (is_unmounting())
    mount_cond.notify_all();
}
2698
/*
 * React to an OSD pool (or, with pool == -1, the whole cluster) going
 * full: cancel outstanding writes with -CEPHFS_ENOSPC, purge the
 * corresponding dirty page-cache data, and raise a cap epoch barrier.
 */
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
		<< "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
	&& (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
		    << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-CEPHFS_ENOSPC);
    }
  }

  // (epoch_t)-1 means op_cancel_writes found nothing to cancel
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2737
/*
 * Process a new OSDMap: detect blocklisting of this client (aborting
 * MDS sessions and cancelling OSD writes when it first appears), and
 * propagate cluster-wide or per-pool FULL flags to _handle_full_flag().
 */
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::scoped_lock cl(client_lock);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blocklist = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.is_blocklisted(myaddrs);
    });

  // transition: we just became blocklisted
  if (new_blocklist && !blocklisted) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
    blocklisted = true;

    _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemptively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);

  }

  if (blocklisted) {
    // Handle case where we were blocklisted but no longer are
    blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blocklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blocklisted client
  // until this client is not blocklisted.
  if (blocklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // global full flag: cancel writes in every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2804
2805
2806// ------------------------
2807// incoming messages
2808
2809
/**
 * Messenger entry point: route one incoming message to its handler.
 *
 * Returns true when the message was consumed (including the case where the
 * client is not yet initialized and the message is deliberately dropped),
 * false when the message type is not ours so the messenger can offer it to
 * another dispatcher.
 */
bool Client::ms_dispatch2(const MessageRef &m)
{
  // Drop everything until the client has finished initializing.
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied()) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Command replies can come from several daemon types; only MDS
    // command replies belong to us.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While an unmount is in flight, use each incoming message as an
  // opportunity to trim the cache, and poke unmount() when the trim
  // made progress so it can re-check its exit condition.
  std::scoped_lock cl(client_lock);
  if (is_unmounting()) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    uint64_t size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2892
11fdf7f2 2893void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae 2894{
f67539c2 2895 std::scoped_lock cl(client_lock);
7c673cae 2896 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2897
2898 signal_cond_list(waiting_for_fsmap);
2899
2900 monclient->sub_got("fsmap", fsmap->get_epoch());
2901}
2902
11fdf7f2 2903void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae 2904{
f67539c2 2905 std::scoped_lock cl(client_lock);
7c673cae
FG
2906 fsmap_user.reset(new FSMapUser);
2907 *fsmap_user = m->get_fsmap();
7c673cae
FG
2908
2909 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2910 signal_cond_list(waiting_for_fsmap);
2911}
2912
f67539c2
TL
2913// Cancel all the commands for missing or laggy GIDs
2914void Client::cancel_commands(const MDSMap& newmap)
7c673cae 2915{
f67539c2 2916 std::vector<ceph_tid_t> cancel_ops;
7c673cae 2917
f67539c2 2918 std::scoped_lock cmd_lock(command_lock);
7c673cae 2919 auto &commands = command_table.get_commands();
f67539c2 2920 for (const auto &[tid, op] : commands) {
7c673cae 2921 const mds_gid_t op_mds_gid = op.mds_gid;
f67539c2
TL
2922 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2923 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2924 cancel_ops.push_back(tid);
7c673cae
FG
2925 if (op.outs) {
2926 std::ostringstream ss;
2927 ss << "MDS " << op_mds_gid << " went away";
2928 *(op.outs) = ss.str();
2929 }
f67539c2
TL
2930 /*
2931 * No need to make the con->mark_down under
2932 * client_lock here, because the con will
2933 * has its own lock.
2934 */
7c673cae 2935 op.con->mark_down();
f67539c2
TL
2936 if (op.on_finish)
2937 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
7c673cae
FG
2938 }
2939 }
2940
f67539c2
TL
2941 for (const auto &tid : cancel_ops)
2942 command_table.erase(tid);
2943}
2944
/**
 * Process a newly received MDSMap.
 *
 * Decodes the map, cancels in-flight MDS commands whose target GID is gone
 * or laggy (done without client_lock held), swaps the new map in, then walks
 * every open MetaSession and reacts to per-rank state changes: marking
 * connections down, trimming caches before reconnect, sending reconnects,
 * closing dead sessions, and kicking requests/caps when a rank goes active.
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  // Ignore stale or duplicate epochs.
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // cancel_commands() takes command_lock; drop client_lock around it to
  // avoid holding both here.
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After the swap, _mdsmap holds the OLD map and mdsmap the new one.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSessionRef session = p->second;
    ++p;  // advance before any operation that may drop the session

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // The rank moved to a different address (replacement daemon).
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        // Force the state-transition logic below to treat this as a
        // brand-new daemon.
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session.get());
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      // Reconnect window: re-establish the connection and replay our state.
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session.get());
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        // We never saw the reconnect window, so the MDS has already
        // dropped our state; the session is unrecoverable.
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session.get());
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session.get());
          kick_flushing_caps(session.get());
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session.get(), true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists in the shrunken cluster.
      _closed_mds_session(session.get());
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
3022
/**
 * Send our cap/snaprealm state to an MDS that entered the RECONNECT state.
 *
 * Resets per-session and per-cap sequence state, replays unsafe requests,
 * then encodes one MClientReconnect entry per cap held from this MDS.  When
 * the MDS supports CEPHFS_FEATURE_MULTI_RECONNECT the payload is split into
 * multiple messages so no single message exceeds ~INT_MAX/2 bytes.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Drop any queued cap releases; they are meaningless to the new instance.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // Flush the current message and start a new one if this one is
      // getting close to the maximum safe size.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      // Include any POSIX/flock locks so the MDS can restore lock state.
      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        // Cap was invalidated by a session reset; claim only PIN.
        cap.gen = session->cap_gen;
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3116
3117
3118void Client::kick_requests(MetaSession *session)
3119{
11fdf7f2 3120 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3121 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3122 p != mds_requests.end();
3123 ++p) {
31f18b77
FG
3124 MetaRequest *req = p->second;
3125 if (req->got_unsafe)
3126 continue;
3127 if (req->aborted()) {
3128 if (req->caller_cond) {
3129 req->kick = true;
9f95a23c 3130 req->caller_cond->notify_all();
31f18b77 3131 }
7c673cae 3132 continue;
31f18b77
FG
3133 }
3134 if (req->retry_attempt > 0)
7c673cae 3135 continue; // new requests only
31f18b77 3136 if (req->mds == session->mds_num) {
7c673cae
FG
3137 send_request(p->second, session);
3138 }
3139 }
3140}
3141
3142void Client::resend_unsafe_requests(MetaSession *session)
3143{
3144 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3145 !iter.end();
3146 ++iter)
3147 send_request(*iter, session);
3148
3149 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3150 // process completed requests in clientreplay stage.
3151 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3152 p != mds_requests.end();
3153 ++p) {
3154 MetaRequest *req = p->second;
3155 if (req->got_unsafe)
3156 continue;
31f18b77
FG
3157 if (req->aborted())
3158 continue;
7c673cae
FG
3159 if (req->retry_attempt == 0)
3160 continue; // old requests only
3161 if (req->mds == session->mds_num)
3162 send_request(req, session, true);
3163 }
3164}
3165
3166void Client::wait_unsafe_requests()
3167{
3168 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2 3169 for (const auto &p : mds_sessions) {
20effc67
TL
3170 const auto s = p.second;
3171 if (!s->unsafe_requests.empty()) {
3172 MetaRequest *req = s->unsafe_requests.back();
7c673cae
FG
3173 req->get();
3174 last_unsafe_reqs.push_back(req);
3175 }
3176 }
3177
3178 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3179 p != last_unsafe_reqs.end();
3180 ++p) {
3181 MetaRequest *req = *p;
3182 if (req->unsafe_item.is_on_list())
3183 wait_on_list(req->waitfor_safe);
3184 put_request(req);
3185 }
3186}
3187
/**
 * Fail and unregister all requests bound to a session being closed.
 *
 * Wakes blocked callers, and for requests that already got an unsafe reply
 * marks the affected inodes with -CEPHFS_EIO (the safe commit will never
 * arrive) before unregistering them.  Afterwards asserts the session holds
 * no requests at all.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() below may erase this entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        if (is_dir_operation(req)) {
          // Surface the lost-commit error on the directory inode.
          Inode *dir = req->inode();
          ceph_assert(dir);
          dir->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     <<  dir->ino  << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        if (req->target) {
          // Likewise on the request's target inode, if any.
          InodeRef &in = req->target;
          in->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     <<  in->ino  << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
3227
3228
3229
3230
3231/************
3232 * leases
3233 */
3234
3235void Client::got_mds_push(MetaSession *s)
3236{
3237 s->seq++;
3238 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3239 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 3240 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
3241 }
3242}
3243
/**
 * Handle a dentry-lease revoke from an MDS.
 *
 * Invalidates the named dentry's lease if we still hold the inode and
 * dentry, then always replies with CEPH_MDS_LEASE_RELEASE so the MDS can
 * drop its state.  Only CEPH_MDS_LEASE_REVOKE is expected here.
 */
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // Message from a connection we no longer track; nothing to release.
    return;
  }

  got_mds_push(session.get());

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;  // still must ack the revoke below
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark the dentry lease as no longer held
  }

 revoke:
  {
    // Unconditionally acknowledge so the MDS can retire its lease state.
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3287
/**
 * Drop @n references on an inode; when only the inode_map reference would
 * remain, tear the inode down: release caps, evict it from the object
 * cacher (asserting nothing dirty remains), remove it from inode_map and
 * the faked-ino table, and drop that final reference.
 */
void Client::_put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  // Snapshot the count before dropping; caller must hold at least n refs
  // plus the inode_map's own ref.
  int left = in->get_nref();
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);  // no dirty buffers may survive to this point
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (root == nullptr) {
      // Root is already gone: clear the quota-root bookkeeping too.
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    in->iput();  // drop the inode_map reference; destroys the inode
  }
}
3316
f67539c2
TL
3317void Client::delay_put_inodes(bool wakeup)
3318{
3319 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3320
3321 std::map<Inode*,int> release;
3322 {
3323 std::scoped_lock dl(delay_i_lock);
3324 release.swap(delay_i_release);
3325 }
3326
3327 if (release.empty())
3328 return;
3329
3330 for (auto &[in, cnt] : release)
3331 _put_inode(in, cnt);
3332
3333 if (wakeup)
3334 mount_cond.notify_all();
3335}
3336
3337void Client::put_inode(Inode *in, int n)
3338{
3339 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3340
3341 std::scoped_lock dl(delay_i_lock);
3342 delay_i_release[in] += n;
3343}
3344
7c673cae
FG
3345void Client::close_dir(Dir *dir)
3346{
3347 Inode *in = dir->parent_inode;
11fdf7f2
TL
3348 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3349 ceph_assert(dir->is_empty());
3350 ceph_assert(in->dir == dir);
3351 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3352 if (!in->dentries.empty())
7c673cae
FG
3353 in->get_first_parent()->put(); // unpin dentry
3354
3355 delete in->dir;
3356 in->dir = 0;
3357 put_inode(in); // unpin inode
3358}
3359
 /**
 * Link an inode into a directory under the given name.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the dentry (newly created if @dn was NULL).  If @in is a
 * directory that is already linked elsewhere, the old link is removed
 * first — directories have at most one parent.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);  // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ceph_assert(!dn->inode);  // pre-created dentry must still be unlinked
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in;  // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // Old parent's listing is no longer complete once we move the entry.
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3400
/**
 * Detach a dentry from its inode and (optionally) remove the dentry itself.
 *
 * @param keepdir     do not close the containing Dir even if it becomes empty
 * @param keepdentry  keep the (now negative) dentry in place, clearing its
 *                    lease, instead of destroying it
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Hold a local ref so the inode survives until we've finished logging.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;  // negative dentry: its lease is no longer valid
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // Closing the dir may recursively drop the parent inode's pin.
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3431
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Completion context for async buffer flushes: on failure, records the
 * error on the inode (via set_async_err) so a later fsync/close can
 * report it.  The InodeRef keeps the inode alive until the flush ends.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // Completions are expected to run with client_lock already held.
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3453
3454
3455/****
3456 * caps
3457 */
3458
3459void Client::get_cap_ref(Inode *in, int cap)
3460{
3461 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3462 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3463 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
b3b6e05e 3464 in->iget();
7c673cae
FG
3465 }
3466 if ((cap & CEPH_CAP_FILE_CACHE) &&
3467 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3468 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
b3b6e05e 3469 in->iget();
7c673cae
FG
3470 }
3471 in->get_cap_ref(cap);
3472}
3473
/**
 * Release references on the given cap bits.  When the last reference on a
 * bit drops: finish a pending cap_snap (WR/BUFFER), wake writers waiting on
 * buffer commit (BUFFER), re-check caps if anything we held is no longer
 * issued, and drop the inode pins taken by get_cap_ref().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  // last = bits whose refcount just hit zero.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;  // inode pins to release (one per BUFFER/CACHE bit)
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // All buffered data reached the cacher; cap_snaps no longer dirty.
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3507
// get caps for a given file handle -- the inode should have @need caps
// issued by the mds and @want caps not revoked (or not under revocation).
// this routine blocks till the cap requirement is satisfied. also account
// (track) for capability hit when required (when cap requirement succeeds).
//
// On success *phave is set to need | (have & want), a cap ref is taken on
// @need, and 0 is returned.  May return -CEPHFS_EBADF (mode/gen mismatch or
// caps no longer wanted), -CEPHFS_EIO (file-lock error state) or
// -CEPHFS_EROFS (write wanted on a read-only session).
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // A stale writable handle (fd generation changed) must not proceed.
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // Ask the MDS for a larger max_size when this write approaches
        // or exceeds the current limit.
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // Writes must not race with an in-flight snapshot of this inode.
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        // We can only take the caps if none of the *wanted* bits are
        // currently being revoked by the MDS.
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // The MDS dropped our caps; re-request them before waiting.
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3616
3617int Client::get_caps_used(Inode *in)
3618{
3619 unsigned used = in->caps_used();
3620 if (!(used & CEPH_CAP_FILE_CACHE) &&
3621 !objectcacher->set_is_empty(&in->oset))
3622 used |= CEPH_CAP_FILE_CACHE;
3623 return used;
3624}
3625
3626void Client::cap_delay_requeue(Inode *in)
3627{
11fdf7f2 3628 ldout(cct, 10) << __func__ << " on " << *in << dendl;
2a845540
TL
3629
3630 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
28e407b8 3631 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3632}
3633
/**
 * Send a CEPH_CAP_OP_UPDATE message for one cap to its MDS.
 *
 * Trims our issued/implemented bits down to @retain/@used, then encodes the
 * inode's current metadata (size, times, xattrs when flushed, etc.) and the
 * dirty bits being flushed.  With client_inject_release_failure set, it
 * deliberately fails to release caps (except xattr caps) to exercise MDS
 * revoke handling.
 *
 * @param flags     MClientCaps::FLAG_* to set on the message
 * @param used      cap bits currently in active use
 * @param want      cap bits we want to keep wanting
 * @param retain    cap bits we are willing to keep holding
 * @param flush     dirty cap bits being flushed with this message
 * @param flush_tid tid identifying this flush
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never claim to retain bits under revocation
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  // Attribute the dirtying to whoever last made the inode dirty.
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // Tell the MDS a cap_snap flush is still pending for this inode.
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth cap negotiates max_size with the MDS.
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3751
31f18b77
FG
3752static bool is_max_size_approaching(Inode *in)
3753{
3754 /* mds will adjust max size according to the reported size */
3755 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3756 return false;
3757 if (in->size >= in->max_size)
3758 return true;
3759 /* half of previous max_size increment has been used */
3760 if (in->max_size > in->reported_size &&
3761 (in->size << 1) >= in->max_size + in->reported_size)
3762 return true;
3763 return false;
3764}
7c673cae 3765
11fdf7f2
TL
3766static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3767{
3768 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3769 return used;
3770 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3771 return used;
3772
3773 if (issued & CEPH_CAP_FILE_LAZYIO) {
3774 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3775 used &= ~CEPH_CAP_FILE_CACHE;
3776 used |= CEPH_CAP_FILE_LAZYIO;
3777 }
3778 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3779 used &= ~CEPH_CAP_FILE_BUFFER;
3780 used |= CEPH_CAP_FILE_LAZYIO;
3781 }
3782 } else {
3783 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3784 used &= ~CEPH_CAP_FILE_CACHE;
3785 used |= CEPH_CAP_FILE_LAZYIO;
3786 }
3787 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3788 used &= ~CEPH_CAP_FILE_BUFFER;
3789 used |= CEPH_CAP_FILE_LAZYIO;
3790 }
3791 }
3792 return used;
3793}
3794
7c673cae
FG
3795/**
3796 * check_caps
3797 *
3798 * Examine currently used and wanted versus held caps. Release, flush or ack
3799 * revoked caps to the MDS as appropriate.
3800 *
3801 * @param in the inode to check
3802 * @param flags flags to apply to cap check
3803 */
3804void Client::check_caps(Inode *in, unsigned flags)
3805{
3806 unsigned wanted = in->caps_wanted();
3807 unsigned used = get_caps_used(in);
3808 unsigned cap_used;
3809
7c673cae
FG
3810 int implemented;
3811 int issued = in->caps_issued(&implemented);
3812 int revoking = implemented & ~issued;
3813
11fdf7f2
TL
3814 int orig_used = used;
3815 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3816
7c673cae 3817 int retain = wanted | used | CEPH_CAP_PIN;
f67539c2 3818 if (!is_unmounting() && in->nlink > 0) {
a8e16298 3819 if (wanted) {
7c673cae 3820 retain |= CEPH_CAP_ANY;
a8e16298
TL
3821 } else if (in->is_dir() &&
3822 (issued & CEPH_CAP_FILE_SHARED) &&
3823 (in->flags & I_COMPLETE)) {
3824 // we do this here because we don't want to drop to Fs (and then
3825 // drop the Fs if we do a create!) if that alone makes us send lookups
3826 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3827 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3828 retain |= wanted;
3829 } else {
7c673cae 3830 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3831 // keep RD only if we didn't have the file open RW,
3832 // because then the mds would revoke it anyway to
3833 // journal max_size=0.
3834 if (in->max_size == 0)
3835 retain |= CEPH_CAP_ANY_RD;
3836 }
7c673cae
FG
3837 }
3838
11fdf7f2 3839 ldout(cct, 10) << __func__ << " on " << *in
7c673cae
FG
3840 << " wanted " << ccap_string(wanted)
3841 << " used " << ccap_string(used)
3842 << " issued " << ccap_string(issued)
3843 << " revoking " << ccap_string(revoking)
3844 << " flags=" << flags
3845 << dendl;
3846
3847 if (in->snapid != CEPH_NOSNAP)
3848 return; //snap caps last forever, can't write
3849
3850 if (in->caps.empty())
3851 return; // guard if at end of func
3852
11fdf7f2
TL
3853 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3854 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
94b18763 3855 if (_release(in))
11fdf7f2 3856 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
94b18763 3857 }
7c673cae 3858
20effc67
TL
3859 for (auto &[mds, cap] : in->caps) {
3860 auto session = mds_sessions.at(mds);
7c673cae
FG
3861
3862 cap_used = used;
11fdf7f2 3863 if (in->auth_cap && &cap != in->auth_cap)
7c673cae
FG
3864 cap_used &= ~in->auth_cap->issued;
3865
11fdf7f2 3866 revoking = cap.implemented & ~cap.issued;
20effc67 3867
7c673cae 3868 ldout(cct, 10) << " cap mds." << mds
11fdf7f2
TL
3869 << " issued " << ccap_string(cap.issued)
3870 << " implemented " << ccap_string(cap.implemented)
7c673cae
FG
3871 << " revoking " << ccap_string(revoking) << dendl;
3872
3873 if (in->wanted_max_size > in->max_size &&
3874 in->wanted_max_size > in->requested_max_size &&
11fdf7f2 3875 &cap == in->auth_cap)
7c673cae
FG
3876 goto ack;
3877
3878 /* approaching file_max? */
11fdf7f2
TL
3879 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3880 &cap == in->auth_cap &&
31f18b77 3881 is_max_size_approaching(in)) {
7c673cae 3882 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3883 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3884 goto ack;
3885 }
3886
3887 /* completed revocation? */
3888 if (revoking && (revoking & cap_used) == 0) {
11fdf7f2 3889 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
7c673cae
FG
3890 goto ack;
3891 }
3892
3893 /* want more caps from mds? */
11fdf7f2 3894 if (wanted & ~(cap.wanted | cap.issued))
7c673cae
FG
3895 goto ack;
3896
f67539c2 3897 if (!revoking && is_unmounting() && (cap_used == 0))
7c673cae
FG
3898 goto ack;
3899
11fdf7f2 3900 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
a8e16298 3901 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3902 continue;
3903
11fdf7f2 3904 if (!(flags & CHECK_CAPS_NODELAY)) {
7c673cae 3905 ldout(cct, 10) << "delaying cap release" << dendl;
11fdf7f2 3906 cap_delay_requeue(in);
7c673cae
FG
3907 continue;
3908 }
3909
3910 ack:
eafe8130
TL
3911 if (&cap == in->auth_cap) {
3912 if (in->flags & I_KICK_FLUSH) {
3913 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3914 << " to mds." << mds << dendl;
20effc67 3915 kick_flushing_caps(in, session.get());
eafe8130
TL
3916 }
3917 if (!in->cap_snaps.empty() &&
3918 in->cap_snaps.rbegin()->second.flush_tid == 0)
3919 flush_snaps(in);
7c673cae
FG
3920 }
3921
3922 int flushing;
e306af50 3923 int msg_flags = 0;
7c673cae 3924 ceph_tid_t flush_tid;
11fdf7f2 3925 if (in->auth_cap == &cap && in->dirty_caps) {
7c673cae 3926 flushing = mark_caps_flushing(in, &flush_tid);
e306af50
TL
3927 if (flags & CHECK_CAPS_SYNCHRONOUS)
3928 msg_flags |= MClientCaps::FLAG_SYNC;
7c673cae
FG
3929 } else {
3930 flushing = 0;
3931 flush_tid = 0;
3932 }
3933
20effc67
TL
3934 in->delay_cap_item.remove_myself();
3935 send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
eafe8130 3936 flushing, flush_tid);
7c673cae
FG
3937 }
3938}
3939
3940
3941void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3942{
3943 int used = get_caps_used(in);
3944 int dirty = in->caps_dirty();
11fdf7f2 3945 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3946
3947 if (in->cap_snaps.size() &&
3948 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3949 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3950 return;
3951 } else if (in->caps_dirty() ||
3952 (used & CEPH_CAP_FILE_WR) ||
3953 (dirty & CEPH_CAP_ANY_WR)) {
3954 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3955 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3956 CapSnap &capsnap = capsnapem.first->second;
3957 capsnap.context = old_snapc;
3958 capsnap.issued = in->caps_issued();
3959 capsnap.dirty = in->caps_dirty();
f67539c2 3960
7c673cae 3961 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
f67539c2 3962
7c673cae
FG
3963 capsnap.uid = in->uid;
3964 capsnap.gid = in->gid;
3965 capsnap.mode = in->mode;
3966 capsnap.btime = in->btime;
3967 capsnap.xattrs = in->xattrs;
3968 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3969 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3970 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
f67539c2 3971
7c673cae 3972 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3973 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3974 capsnap.writing = 1;
3975 } else {
3976 finish_cap_snap(in, capsnap, used);
3977 }
3978 } else {
11fdf7f2 3979 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3980 }
3981}
3982
3983void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3984{
11fdf7f2 3985 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3986 capsnap.size = in->size;
3987 capsnap.mtime = in->mtime;
3988 capsnap.atime = in->atime;
3989 capsnap.ctime = in->ctime;
3990 capsnap.time_warp_seq = in->time_warp_seq;
3991 capsnap.change_attr = in->change_attr;
7c673cae
FG
3992 capsnap.dirty |= in->caps_dirty();
3993
11fdf7f2
TL
3994 /* Only reset it if it wasn't set before */
3995 if (capsnap.cap_dirtier_uid == -1) {
3996 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3997 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3998 }
3999
7c673cae
FG
4000 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4001 capsnap.inline_data = in->inline_data;
4002 capsnap.inline_version = in->inline_version;
4003 }
4004
4005 if (used & CEPH_CAP_FILE_BUFFER) {
f67539c2 4006 capsnap.writing = 1;
11fdf7f2 4007 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
4008 << " WRBUFFER, delaying" << dendl;
4009 } else {
4010 capsnap.dirty_data = 0;
4011 flush_snaps(in);
4012 }
4013}
4014
eafe8130
TL
4015void Client::send_flush_snap(Inode *in, MetaSession *session,
4016 snapid_t follows, CapSnap& capsnap)
4017{
9f95a23c
TL
4018 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4019 in->ino, in->snaprealm->ino, 0,
4020 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
4021 m->caller_uid = capsnap.cap_dirtier_uid;
4022 m->caller_gid = capsnap.cap_dirtier_gid;
4023
4024 m->set_client_tid(capsnap.flush_tid);
4025 m->head.snap_follows = follows;
4026
4027 m->head.caps = capsnap.issued;
4028 m->head.dirty = capsnap.dirty;
4029
4030 m->head.uid = capsnap.uid;
4031 m->head.gid = capsnap.gid;
4032 m->head.mode = capsnap.mode;
4033 m->btime = capsnap.btime;
4034
4035 m->size = capsnap.size;
4036
4037 m->head.xattr_version = capsnap.xattr_version;
4038 encode(capsnap.xattrs, m->xattrbl);
4039
4040 m->ctime = capsnap.ctime;
4041 m->btime = capsnap.btime;
4042 m->mtime = capsnap.mtime;
4043 m->atime = capsnap.atime;
4044 m->time_warp_seq = capsnap.time_warp_seq;
4045 m->change_attr = capsnap.change_attr;
4046
4047 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4048 m->inline_version = in->inline_version;
4049 m->inline_data = in->inline_data;
4050 }
4051
4052 ceph_assert(!session->flushing_caps_tids.empty());
4053 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4054
4055 session->con->send_message2(std::move(m));
4056}
4057
4058void Client::flush_snaps(Inode *in)
7c673cae 4059{
eafe8130 4060 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
11fdf7f2 4061 ceph_assert(in->cap_snaps.size());
7c673cae
FG
4062
4063 // pick auth mds
11fdf7f2 4064 ceph_assert(in->auth_cap);
7c673cae 4065 MetaSession *session = in->auth_cap->session;
7c673cae
FG
4066
4067 for (auto &p : in->cap_snaps) {
4068 CapSnap &capsnap = p.second;
eafe8130
TL
4069 // only do new flush
4070 if (capsnap.flush_tid > 0)
4071 continue;
7c673cae
FG
4072
4073 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
4074 << " follows " << p.first
4075 << " size " << capsnap.size
4076 << " mtime " << capsnap.mtime
4077 << " dirty_data=" << capsnap.dirty_data
4078 << " writing=" << capsnap.writing
4079 << " on " << *in << dendl;
4080 if (capsnap.dirty_data || capsnap.writing)
eafe8130 4081 break;
f67539c2 4082
eafe8130
TL
4083 capsnap.flush_tid = ++last_flush_tid;
4084 session->flushing_caps_tids.insert(capsnap.flush_tid);
4085 in->flushing_cap_tids[capsnap.flush_tid] = 0;
4086 if (!in->flushing_cap_item.is_on_list())
4087 session->flushing_caps.push_back(&in->flushing_cap_item);
7c673cae 4088
eafe8130 4089 send_flush_snap(in, session, p.first, capsnap);
7c673cae
FG
4090 }
4091}
4092
9f95a23c 4093void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 4094{
9f95a23c 4095 ceph::condition_variable cond;
7c673cae 4096 ls.push_back(&cond);
9f95a23c
TL
4097 std::unique_lock l{client_lock, std::adopt_lock};
4098 cond.wait(l);
4099 l.release();
7c673cae
FG
4100 ls.remove(&cond);
4101}
4102
9f95a23c 4103void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 4104{
9f95a23c
TL
4105 for (auto cond : ls) {
4106 cond->notify_all();
4107 }
7c673cae
FG
4108}
4109
4110void Client::wait_on_context_list(list<Context*>& ls)
4111{
9f95a23c 4112 ceph::condition_variable cond;
7c673cae
FG
4113 bool done = false;
4114 int r;
9f95a23c
TL
4115 ls.push_back(new C_Cond(cond, &done, &r));
4116 std::unique_lock l{client_lock, std::adopt_lock};
4117 cond.wait(l, [&done] { return done;});
4118 l.release();
7c673cae
FG
4119}
4120
4121void Client::signal_context_list(list<Context*>& ls)
4122{
4123 while (!ls.empty()) {
4124 ls.front()->complete(0);
4125 ls.pop_front();
4126 }
4127}
4128
a8e16298 4129void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 4130{
11fdf7f2
TL
4131 for (const auto &cap : s->caps) {
4132 auto &in = cap->inode;
a8e16298 4133 if (reconnect) {
11fdf7f2
TL
4134 in.requested_max_size = 0;
4135 in.wanted_max_size = 0;
a8e16298
TL
4136 } else {
4137 if (cap->gen < s->cap_gen) {
4138 // mds did not re-issue stale cap.
4139 cap->issued = cap->implemented = CEPH_CAP_PIN;
4140 // make sure mds knows what we want.
11fdf7f2
TL
4141 if (in.caps_file_wanted() & ~cap->wanted)
4142 in.flags |= I_CAP_DROPPED;
a8e16298
TL
4143 }
4144 }
11fdf7f2 4145 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4146 }
4147}
4148
4149
4150// flush dirty data (from objectcache)
4151
4152class C_Client_CacheInvalidate : public Context {
4153private:
4154 Client *client;
4155 vinodeno_t ino;
4156 int64_t offset, length;
4157public:
4158 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4159 client(c), offset(off), length(len) {
4160 if (client->use_faked_inos())
4161 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4162 else
4163 ino = in->vino();
4164 }
4165 void finish(int r) override {
4166 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
9f95a23c 4167 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
4168 client->_async_invalidate(ino, offset, length);
4169 }
4170};
4171
4172void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4173{
f67539c2
TL
4174 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4175 if (!mref_reader.is_state_satisfied())
7c673cae 4176 return;
f67539c2 4177
11fdf7f2 4178 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
4179 ino_invalidate_cb(callback_handle, ino, off, len);
4180}
4181
4182void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4183
4184 if (ino_invalidate_cb)
4185 // we queue the invalidate, which calls the callback and decrements the ref
4186 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4187}
4188
4189void Client::_invalidate_inode_cache(Inode *in)
4190{
11fdf7f2 4191 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
4192
4193 // invalidate our userspace inode cache
94b18763 4194 if (cct->_conf->client_oc) {
7c673cae 4195 objectcacher->release_set(&in->oset);
94b18763
FG
4196 if (!objectcacher->set_is_empty(&in->oset))
4197 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4198 }
7c673cae
FG
4199
4200 _schedule_invalidate_callback(in, 0, 0);
4201}
4202
4203void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4204{
11fdf7f2 4205 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
4206
4207 // invalidate our userspace inode cache
4208 if (cct->_conf->client_oc) {
4209 vector<ObjectExtent> ls;
4210 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 4211 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
4212 }
4213
4214 _schedule_invalidate_callback(in, off, len);
4215}
4216
4217bool Client::_release(Inode *in)
4218{
4219 ldout(cct, 20) << "_release " << *in << dendl;
4220 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4221 _invalidate_inode_cache(in);
4222 return true;
4223 }
4224 return false;
4225}
4226
4227bool Client::_flush(Inode *in, Context *onfinish)
4228{
4229 ldout(cct, 10) << "_flush " << *in << dendl;
4230
4231 if (!in->oset.dirty_or_tx) {
4232 ldout(cct, 10) << " nothing to flush" << dendl;
4233 onfinish->complete(0);
4234 return true;
4235 }
4236
4237 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 4238 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
4239 objectcacher->purge_set(&in->oset);
4240 if (onfinish) {
f67539c2 4241 onfinish->complete(-CEPHFS_ENOSPC);
7c673cae
FG
4242 }
4243 return true;
4244 }
4245
4246 return objectcacher->flush_set(&in->oset, onfinish);
4247}
4248
4249void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4250{
f67539c2 4251 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
4252 if (!in->oset.dirty_or_tx) {
4253 ldout(cct, 10) << " nothing to flush" << dendl;
4254 return;
4255 }
4256
11fdf7f2 4257 C_SaferCond onflush("Client::_flush_range flock");
7c673cae 4258 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
11fdf7f2 4259 offset, size, &onflush);
7c673cae
FG
4260 if (!ret) {
4261 // wait for flush
9f95a23c 4262 client_lock.unlock();
11fdf7f2 4263 onflush.wait();
9f95a23c 4264 client_lock.lock();
7c673cae
FG
4265 }
4266}
4267
4268void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4269{
f67539c2
TL
4270 // std::scoped_lock l(client_lock);
4271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
7c673cae 4272 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 4273 ceph_assert(in);
7c673cae
FG
4274 _flushed(in);
4275}
4276
4277void Client::_flushed(Inode *in)
4278{
4279 ldout(cct, 10) << "_flushed " << *in << dendl;
4280
4281 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4282}
4283
4284
4285
4286// checks common to add_update_cap, handle_cap_grant
11fdf7f2 4287void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
4288{
4289 unsigned had = in->caps_issued();
4290
4291 if ((issued & CEPH_CAP_FILE_CACHE) &&
4292 !(had & CEPH_CAP_FILE_CACHE))
4293 in->cache_gen++;
4294
f91f0fd5
TL
4295 if ((issued & CEPH_CAP_FILE_SHARED) !=
4296 (had & CEPH_CAP_FILE_SHARED)) {
4297 if (issued & CEPH_CAP_FILE_SHARED)
4298 in->shared_gen++;
7c673cae
FG
4299 if (in->is_dir())
4300 clear_dir_complete_and_ordered(in, true);
4301 }
4302}
4303
4304void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
a8e16298
TL
4305 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4306 inodeno_t realm, int flags, const UserPerm& cap_perms)
7c673cae 4307{
11fdf7f2
TL
4308 if (!in->is_any_caps()) {
4309 ceph_assert(in->snaprealm == 0);
4310 in->snaprealm = get_snap_realm(realm);
4311 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4312 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4313 } else {
4314 ceph_assert(in->snaprealm);
4315 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4316 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4317 in->snaprealm_item.remove_myself();
4318 auto oldrealm = in->snaprealm;
4319 in->snaprealm = get_snap_realm(realm);
4320 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4321 put_snap_realm(oldrealm);
4322 }
4323 }
4324
7c673cae 4325 mds_rank_t mds = mds_session->mds_num;
11fdf7f2
TL
4326 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4327 Cap &cap = capem.first->second;
4328 if (!capem.second) {
4329 if (cap.gen < mds_session->cap_gen)
4330 cap.issued = cap.implemented = CEPH_CAP_PIN;
7c673cae
FG
4331
4332 /*
4333 * auth mds of the inode changed. we received the cap export
4334 * message, but still haven't received the cap import message.
4335 * handle_cap_export() updated the new auth MDS' cap.
4336 *
4337 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4338 * a message that was send before the cap import message. So
4339 * don't remove caps.
4340 */
11fdf7f2 4341 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
92f5a8d4
TL
4342 if (&cap != in->auth_cap)
4343 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4344
11fdf7f2
TL
4345 ceph_assert(cap.cap_id == cap_id);
4346 seq = cap.seq;
4347 mseq = cap.mseq;
4348 issued |= cap.issued;
7c673cae
FG
4349 flags |= CEPH_CAP_FLAG_AUTH;
4350 }
f67539c2
TL
4351 } else {
4352 inc_pinned_icaps();
7c673cae
FG
4353 }
4354
11fdf7f2 4355 check_cap_issue(in, issued);
7c673cae
FG
4356
4357 if (flags & CEPH_CAP_FLAG_AUTH) {
11fdf7f2 4358 if (in->auth_cap != &cap &&
7c673cae
FG
4359 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4360 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
11fdf7f2 4361 ldout(cct, 10) << __func__ << " changing auth cap: "
7c673cae
FG
4362 << "add myself to new auth MDS' flushing caps list" << dendl;
4363 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4364 }
11fdf7f2 4365 in->auth_cap = &cap;
7c673cae
FG
4366 }
4367 }
4368
11fdf7f2
TL
4369 unsigned old_caps = cap.issued;
4370 cap.cap_id = cap_id;
4371 cap.issued = issued;
4372 cap.implemented |= issued;
4373 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4374 cap.wanted = wanted;
a8e16298 4375 else
11fdf7f2
TL
4376 cap.wanted |= wanted;
4377 cap.seq = seq;
4378 cap.issue_seq = seq;
4379 cap.mseq = mseq;
4380 cap.gen = mds_session->cap_gen;
4381 cap.latest_perms = cap_perms;
4382 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4383 << " from mds." << mds
4384 << " on " << *in
4385 << dendl;
4386
4387 if ((issued & ~old_caps) && in->auth_cap == &cap) {
7c673cae 4388 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
4389 for (auto &p : in->caps) {
4390 if (&p.second == &cap)
7c673cae 4391 continue;
11fdf7f2 4392 if (p.second.implemented & ~p.second.issued & issued) {
7c673cae
FG
4393 check_caps(in, CHECK_CAPS_NODELAY);
4394 break;
4395 }
4396 }
4397 }
4398
4399 if (issued & ~old_caps)
4400 signal_cond_list(in->waitfor_caps);
4401}
4402
4403void Client::remove_cap(Cap *cap, bool queue_release)
4404{
11fdf7f2 4405 auto &in = cap->inode;
7c673cae
FG
4406 MetaSession *session = cap->session;
4407 mds_rank_t mds = cap->session->mds_num;
4408
11fdf7f2 4409 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
7c673cae
FG
4410
4411 if (queue_release) {
4412 session->enqueue_cap_release(
11fdf7f2 4413 in.ino,
7c673cae
FG
4414 cap->cap_id,
4415 cap->issue_seq,
4416 cap->mseq,
4417 cap_epoch_barrier);
f67539c2
TL
4418 } else {
4419 dec_pinned_icaps();
7c673cae
FG
4420 }
4421
f67539c2 4422
11fdf7f2
TL
4423 if (in.auth_cap == cap) {
4424 if (in.flushing_cap_item.is_on_list()) {
7c673cae 4425 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
11fdf7f2 4426 in.flushing_cap_item.remove_myself();
7c673cae 4427 }
11fdf7f2 4428 in.auth_cap = NULL;
7c673cae 4429 }
11fdf7f2
TL
4430 size_t n = in.caps.erase(mds);
4431 ceph_assert(n == 1);
7c673cae
FG
4432 cap = nullptr;
4433
11fdf7f2
TL
4434 if (!in.is_any_caps()) {
4435 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4436 in.snaprealm_item.remove_myself();
4437 put_snap_realm(in.snaprealm);
4438 in.snaprealm = 0;
7c673cae
FG
4439 }
4440}
4441
4442void Client::remove_all_caps(Inode *in)
4443{
4444 while (!in->caps.empty())
11fdf7f2 4445 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4446}
4447
f6b5b4d7 4448void Client::remove_session_caps(MetaSession *s, int err)
7c673cae 4449{
11fdf7f2 4450 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
7c673cae
FG
4451
4452 while (s->caps.size()) {
4453 Cap *cap = *s->caps.begin();
11fdf7f2 4454 InodeRef in(&cap->inode);
eafe8130 4455 bool dirty_caps = false;
7c673cae 4456 if (in->auth_cap == cap) {
7c673cae
FG
4457 dirty_caps = in->dirty_caps | in->flushing_caps;
4458 in->wanted_max_size = 0;
4459 in->requested_max_size = 0;
f6b5b4d7
TL
4460 if (in->has_any_filelocks())
4461 in->flags |= I_ERROR_FILELOCK;
7c673cae 4462 }
f6b5b4d7 4463 auto caps = cap->implemented;
a8e16298
TL
4464 if (cap->wanted | cap->issued)
4465 in->flags |= I_CAP_DROPPED;
7c673cae 4466 remove_cap(cap, false);
eafe8130 4467 in->cap_snaps.clear();
7c673cae 4468 if (dirty_caps) {
11fdf7f2 4469 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
7c673cae
FG
4470 if (in->flushing_caps) {
4471 num_flushing_caps--;
4472 in->flushing_cap_tids.clear();
4473 }
4474 in->flushing_caps = 0;
28e407b8 4475 in->mark_caps_clean();
11fdf7f2 4476 put_inode(in.get());
7c673cae 4477 }
f6b5b4d7
TL
4478 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4479 if (caps && !in->caps_issued_mask(caps, true)) {
f67539c2 4480 if (err == -CEPHFS_EBLOCKLISTED) {
f6b5b4d7
TL
4481 if (in->oset.dirty_or_tx) {
4482 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4483 in->set_async_err(err);
4484 }
4485 objectcacher->purge_set(&in->oset);
4486 } else {
4487 objectcacher->release_set(&in->oset);
4488 }
4489 _schedule_invalidate_callback(in.get(), 0, 0);
4490 }
4491
a8e16298 4492 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4493 }
4494 s->flushing_caps_tids.clear();
9f95a23c 4495 sync_cond.notify_all();
7c673cae
FG
4496}
4497
1d09f67e 4498std::pair<int, bool> Client::_do_remount(bool retry_on_error)
b32b8144 4499{
adb31ebb 4500 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
1d09f67e 4501 bool abort_on_failure = false;
91327a77 4502
b32b8144
FG
4503 errno = 0;
4504 int r = remount_cb(callback_handle);
91327a77
AA
4505 if (r == 0) {
4506 retries_on_invalidate = 0;
4507 } else {
b32b8144
FG
4508 int e = errno;
4509 client_t whoami = get_nodeid();
4510 if (r == -1) {
4511 lderr(cct) <<
4512 "failed to remount (to trim kernel dentries): "
4513 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4514 } else {
4515 lderr(cct) <<
4516 "failed to remount (to trim kernel dentries): "
4517 "return code = " << r << dendl;
4518 }
91327a77 4519 bool should_abort =
11fdf7f2
TL
4520 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4521 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4522 !(retry_on_error && (++retries_on_invalidate < max_retries));
f67539c2 4523 if (should_abort && !is_unmounting()) {
b32b8144 4524 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
1d09f67e 4525 abort_on_failure = true;
b32b8144
FG
4526 }
4527 }
1d09f67e 4528 return std::make_pair(r, abort_on_failure);
b32b8144
FG
4529}
4530
7c673cae
FG
4531class C_Client_Remount : public Context {
4532private:
4533 Client *client;
4534public:
4535 explicit C_Client_Remount(Client *c) : client(c) {}
4536 void finish(int r) override {
11fdf7f2 4537 ceph_assert(r == 0);
91327a77 4538 client->_do_remount(true);
7c673cae
FG
4539 }
4540};
4541
4542void Client::_invalidate_kernel_dcache()
4543{
f67539c2
TL
4544 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4545 if (!mref_reader.is_state_satisfied())
7c673cae 4546 return;
f67539c2 4547
94b18763
FG
4548 if (can_invalidate_dentries) {
4549 if (dentry_invalidate_cb && root->dir) {
4550 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4551 p != root->dir->dentries.end();
4552 ++p) {
4553 if (p->second->inode)
4554 _schedule_invalidate_dentry_callback(p->second, false);
4555 }
7c673cae
FG
4556 }
4557 } else if (remount_cb) {
4558 // Hacky:
4559 // when remounting a file system, linux kernel trims all unused dentries in the fs
4560 remount_finisher.queue(new C_Client_Remount(this));
4561 }
4562}
4563
91327a77
AA
4564void Client::_trim_negative_child_dentries(InodeRef& in)
4565{
4566 if (!in->is_dir())
4567 return;
4568
4569 Dir* dir = in->dir;
4570 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4571 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4572 Dentry *dn = p->second;
4573 ++p;
11fdf7f2 4574 ceph_assert(!dn->inode);
91327a77
AA
4575 if (dn->lru_is_expireable())
4576 unlink(dn, true, false); // keep dir, drop dentry
4577 }
4578 if (dir->dentries.empty()) {
4579 close_dir(dir);
4580 }
4581 }
4582
4583 if (in->flags & I_SNAPDIR_OPEN) {
4584 InodeRef snapdir = open_snapdir(in.get());
4585 _trim_negative_child_dentries(snapdir);
4586 }
4587}
4588
e306af50
TL
4589class C_Client_CacheRelease : public Context {
4590private:
4591 Client *client;
4592 vinodeno_t ino;
4593public:
4594 C_Client_CacheRelease(Client *c, Inode *in) :
4595 client(c) {
4596 if (client->use_faked_inos())
4597 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4598 else
4599 ino = in->vino();
4600 }
4601 void finish(int r) override {
4602 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4603 client->_async_inode_release(ino);
4604 }
4605};
4606
4607void Client::_async_inode_release(vinodeno_t ino)
4608{
f67539c2
TL
4609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4610 if (!mref_reader.is_state_satisfied())
e306af50 4611 return;
f67539c2 4612
e306af50
TL
4613 ldout(cct, 10) << __func__ << " " << ino << dendl;
4614 ino_release_cb(callback_handle, ino);
4615}
4616
4617void Client::_schedule_ino_release_callback(Inode *in) {
4618
4619 if (ino_release_cb)
4620 // we queue the invalidate, which calls the callback and decrements the ref
4621 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4622}
4623
28e407b8 4624void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4625{
4626 mds_rank_t mds = s->mds_num;
28e407b8 4627 size_t caps_size = s->caps.size();
11fdf7f2 4628 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4629 << " caps " << caps_size << dendl;
4630
28e407b8
AA
4631 uint64_t trimmed = 0;
4632 auto p = s->caps.begin();
4633 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4634 * looking at from getting deleted during traversal. */
7c673cae
FG
4635 while ((caps_size - trimmed) > max && !p.end()) {
4636 Cap *cap = *p;
11fdf7f2 4637 InodeRef in(&cap->inode);
7c673cae
FG
4638
4639 // Increment p early because it will be invalidated if cap
4640 // is deleted inside remove_cap
4641 ++p;
4642
4643 if (in->caps.size() > 1 && cap != in->auth_cap) {
4644 int mine = cap->issued | cap->implemented;
4645 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4646 // disposable non-auth cap
b32b8144 4647 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4648 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4649 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4650 trimmed++;
4651 }
4652 } else {
4653 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4654 _trim_negative_child_dentries(in);
7c673cae 4655 bool all = true;
11fdf7f2
TL
4656 auto q = in->dentries.begin();
4657 while (q != in->dentries.end()) {
4658 Dentry *dn = *q;
4659 ++q;
7c673cae
FG
4660 if (dn->lru_is_expireable()) {
4661 if (can_invalidate_dentries &&
b3b6e05e 4662 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
7c673cae
FG
4663 // Only issue one of these per DN for inodes in root: handle
4664 // others more efficiently by calling for root-child DNs at
4665 // the end of this function.
4666 _schedule_invalidate_dentry_callback(dn, true);
4667 }
28e407b8
AA
4668 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4669 to_trim.insert(dn);
7c673cae
FG
4670 } else {
4671 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4672 all = false;
4673 }
4674 }
b3b6e05e 4675 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
f91f0fd5
TL
4676 _schedule_ino_release_callback(in.get());
4677 }
b3b6e05e 4678 if (all && in->ino != CEPH_INO_ROOT) {
7c673cae
FG
4679 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4680 trimmed++;
4681 }
4682 }
4683 }
28e407b8
AA
4684 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4685 for (const auto &dn : to_trim) {
4686 trim_dentry(dn);
4687 }
4688 to_trim.clear();
7c673cae 4689
b32b8144 4690 caps_size = s->caps.size();
11fdf7f2 4691 if (caps_size > (size_t)max)
7c673cae
FG
4692 _invalidate_kernel_dcache();
4693}
4694
4695void Client::force_session_readonly(MetaSession *s)
4696{
4697 s->readonly = true;
4698 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4699 auto &in = (*p)->inode;
4700 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4701 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4702 }
4703}
4704
7c673cae
FG
4705int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4706{
4707 MetaSession *session = in->auth_cap->session;
4708
4709 int flushing = in->dirty_caps;
11fdf7f2 4710 ceph_assert(flushing);
7c673cae
FG
4711
4712 ceph_tid_t flush_tid = ++last_flush_tid;
4713 in->flushing_cap_tids[flush_tid] = flushing;
4714
4715 if (!in->flushing_caps) {
11fdf7f2 4716 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4717 num_flushing_caps++;
4718 } else {
11fdf7f2 4719 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4720 }
4721
4722 in->flushing_caps |= flushing;
28e407b8 4723 in->mark_caps_clean();
7c673cae
FG
4724
4725 if (!in->flushing_cap_item.is_on_list())
4726 session->flushing_caps.push_back(&in->flushing_cap_item);
4727 session->flushing_caps_tids.insert(flush_tid);
4728
4729 *ptid = flush_tid;
4730 return flushing;
4731}
4732
4733void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4734{
4735 for (auto &p : in->cap_snaps) {
4736 CapSnap &capsnap = p.second;
4737 if (capsnap.flush_tid > 0) {
4738 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4739 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4740 }
4741 }
4742 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4743 it != in->flushing_cap_tids.end();
4744 ++it) {
4745 old_s->flushing_caps_tids.erase(it->first);
4746 new_s->flushing_caps_tids.insert(it->first);
4747 }
4748 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4749}
4750
4751/*
20effc67
TL
4752 * Flush all the dirty caps back to the MDS. Because the callers
4753 * generally wait on the result of this function (syncfs and umount
4754 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
7c673cae
FG
4755 */
4756void Client::flush_caps_sync()
4757{
4758 ldout(cct, 10) << __func__ << dendl;
20effc67
TL
4759 for (auto &q : mds_sessions) {
4760 auto s = q.second;
4761 xlist<Inode*>::iterator p = s->dirty_list.begin();
4762 while (!p.end()) {
4763 unsigned flags = CHECK_CAPS_NODELAY;
4764 Inode *in = *p;
7c673cae 4765
20effc67
TL
4766 ++p;
4767 if (p.end())
4768 flags |= CHECK_CAPS_SYNCHRONOUS;
4769 check_caps(in, flags);
4770 }
7c673cae
FG
4771 }
4772}
4773
7c673cae
FG
4774void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4775{
4776 while (in->flushing_caps) {
4777 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
11fdf7f2 4778 ceph_assert(it != in->flushing_cap_tids.end());
7c673cae
FG
4779 if (it->first > want)
4780 break;
11fdf7f2 4781 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
7c673cae
FG
4782 << ccap_string(it->second) << " want " << want
4783 << " last " << it->first << dendl;
4784 wait_on_list(in->waitfor_caps);
4785 }
4786}
4787
4788void Client::wait_sync_caps(ceph_tid_t want)
4789{
4790 retry:
11fdf7f2 4791 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
7c673cae 4792 << num_flushing_caps << " total flushing)" << dendl;
11fdf7f2 4793 for (auto &p : mds_sessions) {
20effc67 4794 auto s = p.second;
7c673cae
FG
4795 if (s->flushing_caps_tids.empty())
4796 continue;
4797 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4798 if (oldest_tid <= want) {
11fdf7f2 4799 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
7c673cae 4800 << " (want " << want << ")" << dendl;
9f95a23c
TL
4801 std::unique_lock l{client_lock, std::adopt_lock};
4802 sync_cond.wait(l);
4803 l.release();
7c673cae
FG
4804 goto retry;
4805 }
4806 }
4807}
4808
eafe8130
TL
4809void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4810{
4811 in->flags &= ~I_KICK_FLUSH;
4812
4813 Cap *cap = in->auth_cap;
4814 ceph_assert(cap->session == session);
4815
4816 ceph_tid_t last_snap_flush = 0;
4817 for (auto p = in->flushing_cap_tids.rbegin();
4818 p != in->flushing_cap_tids.rend();
4819 ++p) {
4820 if (!p->second) {
4821 last_snap_flush = p->first;
4822 break;
4823 }
4824 }
4825
4826 int wanted = in->caps_wanted();
4827 int used = get_caps_used(in) | in->caps_dirty();
4828 auto it = in->cap_snaps.begin();
4829 for (auto& p : in->flushing_cap_tids) {
4830 if (p.second) {
4831 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4832 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4833 p.second, p.first);
4834 } else {
4835 ceph_assert(it != in->cap_snaps.end());
4836 ceph_assert(it->second.flush_tid == p.first);
4837 send_flush_snap(in, session, it->first, it->second);
4838 ++it;
4839 }
4840 }
4841}
4842
7c673cae
FG
4843void Client::kick_flushing_caps(MetaSession *session)
4844{
4845 mds_rank_t mds = session->mds_num;
11fdf7f2 4846 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4847
4848 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4849 Inode *in = *p;
eafe8130
TL
4850 if (in->flags & I_KICK_FLUSH) {
4851 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4852 kick_flushing_caps(in, session);
4853 }
7c673cae 4854 }
7c673cae
FG
4855}
4856
4857void Client::early_kick_flushing_caps(MetaSession *session)
4858{
7c673cae
FG
4859 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4860 Inode *in = *p;
11fdf7f2
TL
4861 Cap *cap = in->auth_cap;
4862 ceph_assert(cap);
7c673cae
FG
4863
4864 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4865 // stage. This guarantees that MDS processes the cap flush message before issuing
4866 // the flushing caps to other client.
eafe8130
TL
4867 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4868 in->flags |= I_KICK_FLUSH;
7c673cae 4869 continue;
eafe8130 4870 }
7c673cae
FG
4871
4872 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4873 << " to mds." << session->mds_num << dendl;
11fdf7f2
TL
4874 // send_reconnect() also will reset these sequence numbers. make sure
4875 // sequence numbers in cap flush message match later reconnect message.
4876 cap->seq = 0;
4877 cap->issue_seq = 0;
4878 cap->mseq = 0;
4879 cap->issued = cap->implemented;
4880
eafe8130 4881 kick_flushing_caps(in, session);
7c673cae
FG
4882 }
4883}
4884
7c673cae
FG
4885void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4886{
4887 list<SnapRealm*> q;
4888 q.push_back(realm);
4889
4890 while (!q.empty()) {
4891 realm = q.front();
4892 q.pop_front();
4893
11fdf7f2 4894 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4895 realm->invalidate_cache();
4896
4897 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4898 p != realm->pchildren.end();
4899 ++p)
4900 q.push_back(*p);
4901 }
4902}
4903
4904SnapRealm *Client::get_snap_realm(inodeno_t r)
4905{
4906 SnapRealm *realm = snap_realms[r];
2a845540
TL
4907
4908 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
4909 << (realm ? realm->nref : 0) << dendl;
4910 if (!realm) {
7c673cae 4911 snap_realms[r] = realm = new SnapRealm(r);
2a845540
TL
4912
4913 // Do not release the global snaprealm until unmounting.
4914 if (r == CEPH_INO_GLOBAL_SNAPREALM)
4915 realm->nref++;
4916 }
4917
7c673cae 4918 realm->nref++;
2a845540
TL
4919 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
4920 << realm->nref << dendl;
7c673cae
FG
4921 return realm;
4922}
4923
4924SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4925{
4926 if (snap_realms.count(r) == 0) {
11fdf7f2 4927 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4928 return NULL;
4929 }
4930 SnapRealm *realm = snap_realms[r];
11fdf7f2 4931 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4932 realm->nref++;
4933 return realm;
4934}
4935
4936void Client::put_snap_realm(SnapRealm *realm)
4937{
11fdf7f2 4938 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
7c673cae
FG
4939 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4940 if (--realm->nref == 0) {
4941 snap_realms.erase(realm->ino);
4942 if (realm->pparent) {
4943 realm->pparent->pchildren.erase(realm);
4944 put_snap_realm(realm->pparent);
4945 }
4946 delete realm;
4947 }
4948}
4949
4950bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4951{
4952 if (realm->parent != parent) {
11fdf7f2 4953 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4954 << " " << realm->parent << " -> " << parent << dendl;
4955 realm->parent = parent;
4956 if (realm->pparent) {
4957 realm->pparent->pchildren.erase(realm);
4958 put_snap_realm(realm->pparent);
4959 }
4960 realm->pparent = get_snap_realm(parent);
4961 realm->pparent->pchildren.insert(realm);
4962 return true;
4963 }
4964 return false;
4965}
4966
4967static bool has_new_snaps(const SnapContext& old_snapc,
4968 const SnapContext& new_snapc)
4969{
4970 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4971}
4972
4973
11fdf7f2 4974void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4975{
4976 SnapRealm *first_realm = NULL;
11fdf7f2 4977 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4978
4979 map<SnapRealm*, SnapContext> dirty_realms;
4980
11fdf7f2 4981 auto p = bl.cbegin();
7c673cae
FG
4982 while (!p.end()) {
4983 SnapRealmInfo info;
11fdf7f2 4984 decode(info, p);
7c673cae
FG
4985 SnapRealm *realm = get_snap_realm(info.ino());
4986
4987 bool invalidate = false;
4988
4989 if (info.seq() > realm->seq) {
11fdf7f2 4990 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4991 << dendl;
4992
4993 if (flush) {
4994 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4995 // flush me + children
4996 list<SnapRealm*> q;
4997 q.push_back(realm);
4998 while (!q.empty()) {
4999 SnapRealm *realm = q.front();
5000 q.pop_front();
5001
5002 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5003 p != realm->pchildren.end();
5004 ++p)
5005 q.push_back(*p);
5006
5007 if (dirty_realms.count(realm) == 0) {
5008 realm->nref++;
5009 dirty_realms[realm] = realm->get_snap_context();
5010 }
5011 }
5012 }
5013
5014 // update
5015 realm->seq = info.seq();
5016 realm->created = info.created();
5017 realm->parent_since = info.parent_since();
5018 realm->prior_parent_snaps = info.prior_parent_snaps;
5019 realm->my_snaps = info.my_snaps;
5020 invalidate = true;
5021 }
5022
5023 // _always_ verify parent
5024 if (adjust_realm_parent(realm, info.parent()))
5025 invalidate = true;
5026
5027 if (invalidate) {
5028 invalidate_snaprealm_and_children(realm);
11fdf7f2 5029 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
5030 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5031 } else {
11fdf7f2 5032 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
5033 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5034 }
f67539c2 5035
7c673cae
FG
5036 if (!first_realm)
5037 first_realm = realm;
5038 else
5039 put_snap_realm(realm);
5040 }
5041
f67539c2 5042 for (auto &[realm, snapc] : dirty_realms) {
7c673cae 5043 // if there are new snaps ?
f67539c2 5044 if (has_new_snaps(snapc, realm->get_snap_context())) {
7c673cae 5045 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
f67539c2
TL
5046 for (auto&& in : realm->inodes_with_caps) {
5047 queue_cap_snap(in, snapc);
7c673cae
FG
5048 }
5049 } else {
5050 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5051 }
5052 put_snap_realm(realm);
5053 }
5054
5055 if (realm_ret)
5056 *realm_ret = first_realm;
5057 else
5058 put_snap_realm(first_realm);
5059}
5060
11fdf7f2 5061void Client::handle_snap(const MConstRef<MClientSnap>& m)
7c673cae 5062{
11fdf7f2 5063 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 5064 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5065
5066 std::scoped_lock cl(client_lock);
20effc67 5067 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5068 if (!session) {
7c673cae
FG
5069 return;
5070 }
5071
20effc67 5072 got_mds_push(session.get());
7c673cae
FG
5073
5074 map<Inode*, SnapContext> to_move;
5075 SnapRealm *realm = 0;
5076
5077 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
11fdf7f2 5078 ceph_assert(m->head.split);
7c673cae 5079 SnapRealmInfo info;
11fdf7f2
TL
5080 auto p = m->bl.cbegin();
5081 decode(info, p);
5082 ceph_assert(info.ino() == m->head.split);
7c673cae
FG
5083
5084 // flush, then move, ino's.
5085 realm = get_snap_realm(info.ino());
5086 ldout(cct, 10) << " splitting off " << *realm << dendl;
11fdf7f2
TL
5087 for (auto& ino : m->split_inos) {
5088 vinodeno_t vino(ino, CEPH_NOSNAP);
7c673cae
FG
5089 if (inode_map.count(vino)) {
5090 Inode *in = inode_map[vino];
5091 if (!in->snaprealm || in->snaprealm == realm)
5092 continue;
5093 if (in->snaprealm->created > info.created()) {
5094 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5095 << *in->snaprealm << dendl;
5096 continue;
5097 }
5098 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5099
5100
5101 in->snaprealm_item.remove_myself();
5102 to_move[in] = in->snaprealm->get_snap_context();
5103 put_snap_realm(in->snaprealm);
5104 }
5105 }
5106
5107 // move child snaprealms, too
11fdf7f2
TL
5108 for (auto& child_realm : m->split_realms) {
5109 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5110 SnapRealm *child = get_snap_realm_maybe(child_realm);
7c673cae
FG
5111 if (!child)
5112 continue;
5113 adjust_realm_parent(child, realm->ino);
5114 put_snap_realm(child);
5115 }
5116 }
5117
5118 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5119
5120 if (realm) {
5121 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5122 Inode *in = p->first;
5123 in->snaprealm = realm;
5124 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5125 realm->nref++;
5126 // queue for snap writeback
5127 if (has_new_snaps(p->second, realm->get_snap_context()))
5128 queue_cap_snap(in, p->second);
5129 }
5130 put_snap_realm(realm);
5131 }
7c673cae
FG
5132}
5133
11fdf7f2 5134void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
5135{
5136 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5137
5138 std::scoped_lock cl(client_lock);
20effc67 5139 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5140 if (!session) {
7c673cae
FG
5141 return;
5142 }
5143
20effc67 5144 got_mds_push(session.get());
7c673cae 5145
11fdf7f2 5146 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
5147
5148 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5149 if (inode_map.count(vino)) {
5150 Inode *in = NULL;
5151 in = inode_map[vino];
5152
5153 if (in) {
5154 in->quota = m->quota;
5155 in->rstat = m->rstat;
5156 }
5157 }
7c673cae
FG
5158}
5159
11fdf7f2 5160void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
5161{
5162 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5163
5164 std::scoped_lock cl(client_lock);
20effc67 5165 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5166 if (!session) {
7c673cae
FG
5167 return;
5168 }
5169
5170 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5171 // Pause RADOS operations until we see the required epoch
5172 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5173 }
5174
5175 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5176 // Record the barrier so that we will transmit it to MDS when releasing
5177 set_cap_epoch_barrier(m->osd_epoch_barrier);
5178 }
5179
20effc67 5180 got_mds_push(session.get());
7c673cae 5181
11fdf7f2 5182 Inode *in;
7c673cae 5183 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
5184 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5185 in = it->second;
5186 } else {
7c673cae 5187 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 5188 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
5189 session->enqueue_cap_release(
5190 m->get_ino(),
5191 m->get_cap_id(),
5192 m->get_seq(),
5193 m->get_mseq(),
5194 cap_epoch_barrier);
5195 } else {
11fdf7f2 5196 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 5197 }
7c673cae
FG
5198
5199 // in case the mds is waiting on e.g. a revocation
5200 flush_cap_releases();
5201 return;
5202 }
5203
5204 switch (m->get_op()) {
20effc67
TL
5205 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5206 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5207 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
7c673cae
FG
5208 }
5209
11fdf7f2
TL
5210 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5211 Cap &cap = in->caps.at(mds);
7c673cae 5212
11fdf7f2 5213 switch (m->get_op()) {
20effc67 5214 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
11fdf7f2
TL
5215 case CEPH_CAP_OP_IMPORT:
5216 case CEPH_CAP_OP_REVOKE:
20effc67
TL
5217 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5218 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
11fdf7f2
TL
5219 }
5220 } else {
5221 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5222 return;
7c673cae
FG
5223 }
5224}
5225
11fdf7f2 5226void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5227{
5228 mds_rank_t mds = session->mds_num;
5229
11fdf7f2 5230 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5231 << " IMPORT from mds." << mds << dendl;
5232
5233 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5234 Cap *cap = NULL;
5235 UserPerm cap_perms;
11fdf7f2
TL
5236 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5237 cap = &it->second;
5238 cap_perms = cap->latest_perms;
7c673cae
FG
5239 }
5240
5241 // add/update it
5242 SnapRealm *realm = NULL;
5243 update_snap_trace(m->snapbl, &realm);
5244
1911f103
TL
5245 int issued = m->get_caps();
5246 int wanted = m->get_wanted();
7c673cae 5247 add_update_cap(in, session, m->get_cap_id(),
1911f103 5248 issued, wanted, m->get_seq(), m->get_mseq(),
a8e16298 5249 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
7c673cae
FG
5250
5251 if (cap && cap->cap_id == m->peer.cap_id) {
5252 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5253 }
5254
5255 if (realm)
5256 put_snap_realm(realm);
5257
eafe8130 5258 if (in->auth_cap && in->auth_cap->session == session) {
1911f103
TL
5259 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5260 in->requested_max_size > m->get_max_size()) {
5261 in->requested_max_size = 0;
5262 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5263 }
7c673cae 5264 // reflush any/all caps (if we are now the auth_cap)
eafe8130 5265 kick_flushing_caps(in, session);
7c673cae
FG
5266 }
5267}
5268
11fdf7f2 5269void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5270{
5271 mds_rank_t mds = session->mds_num;
5272
11fdf7f2 5273 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5274 << " EXPORT from mds." << mds << dendl;
5275
11fdf7f2
TL
5276 auto it = in->caps.find(mds);
5277 if (it != in->caps.end()) {
5278 Cap &cap = it->second;
5279 if (cap.cap_id == m->get_cap_id()) {
5280 if (m->peer.cap_id) {
5281 const auto peer_mds = mds_rank_t(m->peer.mds);
20effc67 5282 auto tsession = _get_or_open_mds_session(peer_mds);
11fdf7f2
TL
5283 auto it = in->caps.find(peer_mds);
5284 if (it != in->caps.end()) {
5285 Cap &tcap = it->second;
5286 if (tcap.cap_id == m->peer.cap_id &&
5287 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5288 tcap.cap_id = m->peer.cap_id;
5289 tcap.seq = m->peer.seq - 1;
5290 tcap.issue_seq = tcap.seq;
5291 tcap.issued |= cap.issued;
5292 tcap.implemented |= cap.issued;
5293 if (&cap == in->auth_cap)
5294 in->auth_cap = &tcap;
5295 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
20effc67 5296 adjust_session_flushing_caps(in, session, tsession.get());
11fdf7f2
TL
5297 }
5298 } else {
20effc67 5299 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
11fdf7f2
TL
5300 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5301 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5302 cap.latest_perms);
5303 }
7c673cae 5304 } else {
11fdf7f2
TL
5305 if (cap.wanted | cap.issued)
5306 in->flags |= I_CAP_DROPPED;
7c673cae 5307 }
7c673cae 5308
11fdf7f2
TL
5309 remove_cap(&cap, false);
5310 }
7c673cae 5311 }
7c673cae
FG
5312}
5313
11fdf7f2 5314void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5315{
5316 mds_rank_t mds = session->mds_num;
11fdf7f2 5317 ceph_assert(in->caps.count(mds));
7c673cae 5318
11fdf7f2 5319 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
5320 << " size " << in->size << " -> " << m->get_size()
5321 << dendl;
5322
1adf2230
AA
5323 int issued;
5324 in->caps_issued(&issued);
5325 issued |= in->caps_dirty();
5326 update_inode_file_size(in, issued, m->get_size(),
5327 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
5328}
5329
11fdf7f2 5330void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5331{
5332 ceph_tid_t flush_ack_tid = m->get_client_tid();
5333 int dirty = m->get_dirty();
5334 int cleaned = 0;
5335 int flushed = 0;
5336
11fdf7f2
TL
5337 auto it = in->flushing_cap_tids.begin();
5338 if (it->first < flush_ack_tid) {
5339 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5340 << " got unexpected flush ack tid " << flush_ack_tid
5341 << " expected is " << it->first << dendl;
5342 }
5343 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5344 if (!it->second) {
5345 // cap snap
5346 ++it;
5347 continue;
5348 }
7c673cae
FG
5349 if (it->first == flush_ack_tid)
5350 cleaned = it->second;
5351 if (it->first <= flush_ack_tid) {
5352 session->flushing_caps_tids.erase(it->first);
5353 in->flushing_cap_tids.erase(it++);
5354 ++flushed;
5355 continue;
5356 }
5357 cleaned &= ~it->second;
5358 if (!cleaned)
5359 break;
5360 ++it;
5361 }
5362
11fdf7f2 5363 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5364 << " cleaned " << ccap_string(cleaned) << " on " << *in
5365 << " with " << ccap_string(dirty) << dendl;
5366
5367 if (flushed) {
5368 signal_cond_list(in->waitfor_caps);
5369 if (session->flushing_caps_tids.empty() ||
5370 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5371 sync_cond.notify_all();
7c673cae
FG
5372 }
5373
5374 if (!dirty) {
5375 in->cap_dirtier_uid = -1;
5376 in->cap_dirtier_gid = -1;
5377 }
5378
5379 if (!cleaned) {
5380 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5381 } else {
5382 if (in->flushing_caps) {
5383 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5384 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5385 in->flushing_caps &= ~cleaned;
5386 if (in->flushing_caps == 0) {
5387 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5388 num_flushing_caps--;
eafe8130 5389 if (in->flushing_cap_tids.empty())
7c673cae
FG
5390 in->flushing_cap_item.remove_myself();
5391 }
5392 if (!in->caps_dirty())
5393 put_inode(in);
5394 }
5395 }
7c673cae
FG
5396}
5397
5398
11fdf7f2 5399void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae 5400{
eafe8130 5401 ceph_tid_t flush_ack_tid = m->get_client_tid();
7c673cae 5402 mds_rank_t mds = session->mds_num;
11fdf7f2 5403 ceph_assert(in->caps.count(mds));
7c673cae
FG
5404 snapid_t follows = m->get_snap_follows();
5405
11fdf7f2
TL
5406 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5407 auto& capsnap = it->second;
eafe8130
TL
5408 if (flush_ack_tid != capsnap.flush_tid) {
5409 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
7c673cae 5410 } else {
eafe8130 5411 InodeRef tmp_ref(in);
11fdf7f2 5412 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
7c673cae 5413 << " on " << *in << dendl;
7c673cae 5414 session->flushing_caps_tids.erase(capsnap.flush_tid);
eafe8130
TL
5415 in->flushing_cap_tids.erase(capsnap.flush_tid);
5416 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5417 in->flushing_cap_item.remove_myself();
11fdf7f2 5418 in->cap_snaps.erase(it);
eafe8130
TL
5419
5420 signal_cond_list(in->waitfor_caps);
5421 if (session->flushing_caps_tids.empty() ||
5422 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5423 sync_cond.notify_all();
7c673cae
FG
5424 }
5425 } else {
11fdf7f2 5426 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
7c673cae
FG
5427 << " on " << *in << dendl;
5428 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5429 }
7c673cae
FG
5430}
5431
5432class C_Client_DentryInvalidate : public Context {
5433private:
5434 Client *client;
5435 vinodeno_t dirino;
5436 vinodeno_t ino;
5437 string name;
5438public:
5439 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5440 client(c), name(dn->name) {
5441 if (client->use_faked_inos()) {
5442 dirino.ino = dn->dir->parent_inode->faked_ino;
5443 if (del)
5444 ino.ino = dn->inode->faked_ino;
5445 } else {
5446 dirino = dn->dir->parent_inode->vino();
5447 if (del)
5448 ino = dn->inode->vino();
5449 }
5450 if (!del)
5451 ino.ino = inodeno_t();
5452 }
5453 void finish(int r) override {
5454 // _async_dentry_invalidate is responsible for its own locking
9f95a23c 5455 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
5456 client->_async_dentry_invalidate(dirino, ino, name);
5457 }
5458};
5459
5460void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5461{
f67539c2
TL
5462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5463 if (!mref_reader.is_state_satisfied())
7c673cae 5464 return;
f67539c2 5465
11fdf7f2 5466 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae 5467 << " in dir " << dirino << dendl;
e306af50 5468 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
7c673cae
FG
5469}
5470
5471void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5472{
5473 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5474 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5475}
5476
5477void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5478{
b3b6e05e 5479 int ref = in->get_nref();
494da23a 5480 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
7c673cae
FG
5481
5482 if (in->dir && !in->dir->dentries.empty()) {
5483 for (auto p = in->dir->dentries.begin();
5484 p != in->dir->dentries.end(); ) {
5485 Dentry *dn = p->second;
5486 ++p;
5487 /* rmsnap removes whole subtree, need trim inodes recursively.
5488 * we don't need to invalidate dentries recursively. because
5489 * invalidating a directory dentry effectively invalidate
5490 * whole subtree */
5491 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5492 _try_to_trim_inode(dn->inode.get(), false);
5493
5494 if (dn->lru_is_expireable())
5495 unlink(dn, true, false); // keep dir, drop dentry
5496 }
5497 if (in->dir->dentries.empty()) {
5498 close_dir(in->dir);
5499 --ref;
5500 }
5501 }
5502
b3b6e05e 5503 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
7c673cae
FG
5504 InodeRef snapdir = open_snapdir(in);
5505 _try_to_trim_inode(snapdir.get(), false);
5506 --ref;
5507 }
5508
b3b6e05e 5509 if (ref > 1) {
11fdf7f2
TL
5510 auto q = in->dentries.begin();
5511 while (q != in->dentries.end()) {
5512 Dentry *dn = *q;
5513 ++q;
494da23a
TL
5514 if( in->ll_ref > 0 && sched_inval) {
5515 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5516 // so in->dentries doesn't always reflect the state of kernel's dcache.
5517 _schedule_invalidate_dentry_callback(dn, true);
5518 }
7c673cae
FG
5519 unlink(dn, true, true);
5520 }
5521 }
5522}
5523
11fdf7f2 5524void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5525{
5526 mds_rank_t mds = session->mds_num;
5527 int used = get_caps_used(in);
5528 int wanted = in->caps_wanted();
a4b75251 5529 int flags = 0;
7c673cae 5530
a8e16298
TL
5531 const unsigned new_caps = m->get_caps();
5532 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5533 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5534 << " mds." << mds << " seq " << m->get_seq()
5535 << " caps now " << ccap_string(new_caps)
a8e16298 5536 << " was " << ccap_string(cap->issued)
92f5a8d4 5537 << (was_stale ? " (stale)" : "") << dendl;
a8e16298
TL
5538
5539 if (was_stale)
5540 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5541 cap->seq = m->get_seq();
28e407b8 5542 cap->gen = session->cap_gen;
7c673cae 5543
11fdf7f2 5544 check_cap_issue(in, new_caps);
a8e16298 5545
7c673cae 5546 // update inode
1adf2230
AA
5547 int issued;
5548 in->caps_issued(&issued);
5549 issued |= in->caps_dirty();
7c673cae 5550
1adf2230
AA
5551 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5552 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5553 in->mode = m->head.mode;
5554 in->uid = m->head.uid;
5555 in->gid = m->head.gid;
5556 in->btime = m->btime;
5557 }
5558 bool deleted_inode = false;
1adf2230
AA
5559 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5560 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae 5561 in->nlink = m->head.nlink;
20effc67 5562 if (in->nlink == 0)
7c673cae
FG
5563 deleted_inode = true;
5564 }
1adf2230 5565 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5566 m->xattrbl.length() &&
5567 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5568 auto p = m->xattrbl.cbegin();
5569 decode(in->xattrs, p);
7c673cae
FG
5570 in->xattr_version = m->head.xattr_version;
5571 }
28e407b8
AA
5572
5573 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5574 in->dirstat.nfiles = m->get_nfiles();
5575 in->dirstat.nsubdirs = m->get_nsubdirs();
5576 }
5577
1adf2230
AA
5578 if (new_caps & CEPH_CAP_ANY_RD) {
5579 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5580 m->get_ctime(), m->get_mtime(), m->get_atime());
5581 }
5582
5583 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5584 in->layout = m->get_layout();
5585 update_inode_file_size(in, issued, m->get_size(),
5586 m->get_truncate_seq(), m->get_truncate_size());
5587 }
5588
5589 if (m->inline_version > in->inline_version) {
5590 in->inline_data = m->inline_data;
5591 in->inline_version = m->inline_version;
5592 }
5593
5594 /* always take a newer change attr */
5595 if (m->get_change_attr() > in->change_attr)
5596 in->change_attr = m->get_change_attr();
7c673cae
FG
5597
5598 // max_size
5599 if (cap == in->auth_cap &&
1adf2230
AA
5600 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5601 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5602 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5603 in->max_size = m->get_max_size();
5604 if (in->max_size > in->wanted_max_size) {
5605 in->wanted_max_size = 0;
5606 in->requested_max_size = 0;
5607 }
5608 }
5609
5610 bool check = false;
a8e16298
TL
5611 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5612 (wanted & ~(cap->wanted | new_caps))) {
5613 // If mds is importing cap, prior cap messages that update 'wanted'
5614 // may get dropped by mds (migrate seq mismatch).
5615 //
5616 // We don't send cap message to update 'wanted' if what we want are
5617 // already issued. If mds revokes caps, cap message that releases caps
5618 // also tells mds what we want. But if caps got revoked by mds forcedly
5619 // (session stale). We may haven't told mds what we want.
7c673cae 5620 check = true;
a8e16298 5621 }
7c673cae 5622
7c673cae
FG
5623
5624 // update caps
a8e16298 5625 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5626 if (revoked) {
5627 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5628 cap->issued = new_caps;
5629 cap->implemented |= new_caps;
5630
b32b8144
FG
5631 // recall delegations if we're losing caps necessary for them
5632 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5633 in->recall_deleg(false);
5634 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5635 in->recall_deleg(true);
5636
11fdf7f2
TL
5637 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5638 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5639 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5640 // waitin' for flush
11fdf7f2 5641 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
a4b75251
TL
5642 if (_release(in)) {
5643 check = true;
5644 flags = CHECK_CAPS_NODELAY;
5645 }
7c673cae
FG
5646 } else {
5647 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5648 check = true;
a4b75251 5649 flags = CHECK_CAPS_NODELAY;
7c673cae 5650 }
a8e16298
TL
5651 } else if (cap->issued == new_caps) {
5652 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5653 } else {
a8e16298 5654 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5655 cap->issued = new_caps;
5656 cap->implemented |= new_caps;
5657
5658 if (cap == in->auth_cap) {
5659 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5660 for (const auto &p : in->caps) {
5661 if (&p.second == cap)
7c673cae 5662 continue;
11fdf7f2 5663 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5664 check = true;
5665 break;
5666 }
5667 }
5668 }
5669 }
5670
5671 if (check)
a4b75251 5672 check_caps(in, flags);
7c673cae
FG
5673
5674 // wake up waiters
5675 if (new_caps)
5676 signal_cond_list(in->waitfor_caps);
5677
5678 // may drop inode's last ref
5679 if (deleted_inode)
5680 _try_to_trim_inode(in, true);
7c673cae
FG
5681}
5682
7c673cae
FG
5683int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5684{
b3b6e05e 5685 if (perms.uid() == 0) {
2a845540
TL
5686 // For directories, DACs are overridable.
5687 // For files, Read/write DACs are always overridable but executable DACs are
5688 // overridable when there is at least one exec bit set
5689 if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
b3b6e05e 5690 return -CEPHFS_EACCES;
7c673cae 5691 return 0;
b3b6e05e 5692 }
7c673cae
FG
5693
5694 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5695 int ret = _posix_acl_permission(in, perms, want);
f67539c2 5696 if (ret != -CEPHFS_EAGAIN)
7c673cae
FG
5697 return ret;
5698 }
5699
5700 // check permissions before doing anything else
5701 if (!in->check_mode(perms, want))
f67539c2 5702 return -CEPHFS_EACCES;
7c673cae
FG
5703 return 0;
5704}
5705
5706int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5707 const UserPerm& perms)
5708{
5709 int r = _getattr_for_perm(in, perms);
5710 if (r < 0)
5711 goto out;
5712
5713 r = 0;
5714 if (strncmp(name, "system.", 7) == 0) {
5715 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
f67539c2 5716 r = -CEPHFS_EPERM;
7c673cae
FG
5717 } else {
5718 r = inode_permission(in, perms, want);
5719 }
5720out:
1adf2230 5721 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5722 return r;
5723}
5724
20effc67 5725std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
7c673cae
FG
5726 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5727 return out;
5728}
5729
/**
 * Check whether @perms may apply the attribute changes selected by @mask
 * (values taken from @stx) to inode @in, mirroring POSIX
 * chown/chgrp/chmod/utimes permission rules.
 *
 * @param in    inode being modified
 * @param stx   requested attribute values; stx_mode may be adjusted in
 *              place (setgid bit stripped for non-root callers outside
 *              the target group)
 * @param mask  CEPH_SETATTR_* bits selecting which attributes change
 * @param perms caller credentials
 * @return 0 if permitted, negative CEPHFS_* error otherwise
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // Refresh mode/ownership (and xattrs when ACLs are enabled) first.
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // Truncate requires write permission on the file itself.
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // From here on, any failed check defaults to EPERM.
  r = -CEPHFS_EPERM;
  // chown: only root may change the owner; a non-root owner may only
  // "change" the uid to its current value (a no-op).
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: the owner may move the file to a group they belong to (or
  // keep the current group); root may do anything.
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
  	       	 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  // chmod: owner or root only.  Non-root callers not in the file's
  // (possibly just-changed) group lose the setgid bit, as the kernel
  // VFS does.
  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  // Timestamps: non-owners may only set mtime/atime to "now" (and even
  // then need write permission); explicit times or ctime/btime changes
  // require ownership or root.
  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5786
5787int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5788{
181888fb 5789 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5790 unsigned want = 0;
5791
5792 if ((flags & O_ACCMODE) == O_WRONLY)
5793 want = MAY_WRITE;
5794 else if ((flags & O_ACCMODE) == O_RDWR)
5795 want = MAY_READ | MAY_WRITE;
5796 else if ((flags & O_ACCMODE) == O_RDONLY)
5797 want = MAY_READ;
5798 if (flags & O_TRUNC)
5799 want |= MAY_WRITE;
5800
5801 int r = 0;
5802 switch (in->mode & S_IFMT) {
5803 case S_IFLNK:
f67539c2 5804 r = -CEPHFS_ELOOP;
7c673cae
FG
5805 goto out;
5806 case S_IFDIR:
5807 if (want & MAY_WRITE) {
f67539c2 5808 r = -CEPHFS_EISDIR;
7c673cae
FG
5809 goto out;
5810 }
5811 break;
5812 }
5813
5814 r = _getattr_for_perm(in, perms);
5815 if (r < 0)
5816 goto out;
5817
5818 r = inode_permission(in, perms, want);
5819out:
5820 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5821 return r;
5822}
5823
5824int Client::may_lookup(Inode *dir, const UserPerm& perms)
5825{
181888fb 5826 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5827 int r = _getattr_for_perm(dir, perms);
5828 if (r < 0)
5829 goto out;
5830
5831 r = inode_permission(dir, perms, MAY_EXEC);
5832out:
5833 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5834 return r;
5835}
5836
5837int Client::may_create(Inode *dir, const UserPerm& perms)
5838{
181888fb 5839 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5840 int r = _getattr_for_perm(dir, perms);
5841 if (r < 0)
5842 goto out;
5843
5844 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5845out:
5846 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5847 return r;
5848}
5849
5850int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5851{
181888fb 5852 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5853 int r = _getattr_for_perm(dir, perms);
5854 if (r < 0)
5855 goto out;
5856
5857 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5858 if (r < 0)
5859 goto out;
5860
f67539c2 5861 /* 'name == NULL' means rmsnap w/o permission checks */
7c673cae
FG
5862 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5863 InodeRef otherin;
5864 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5865 if (r < 0)
5866 goto out;
5867 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
f67539c2 5868 r = -CEPHFS_EPERM;
7c673cae
FG
5869 }
5870out:
5871 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5872 return r;
5873}
5874
f67539c2
TL
5875int Client::may_delete(const char *relpath, const UserPerm& perms) {
5876 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5877
5878 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5879 if (!mref_reader.is_state_satisfied())
5880 return -ENOTCONN;
5881
5882 filepath path(relpath);
5883 string name = path.last_dentry();
5884 path.pop_dentry();
5885 InodeRef dir;
5886
5887 std::scoped_lock lock(client_lock);
5888 int r = path_walk(path, &dir, perms);
5889 if (r < 0)
5890 return r;
5891 if (cct->_conf->client_permissions) {
5892 int r = may_delete(dir.get(), name.c_str(), perms);
5893 if (r < 0)
5894 return r;
5895 }
5896
5897 return 0;
5898}
5899
7c673cae
FG
/**
 * Check whether @perms may create a hard link to inode @in.
 *
 * Owner and root are always allowed.  Otherwise the checks below appear
 * to mirror the Linux "protected_hardlinks" restrictions (regular files
 * only, no setuid targets, no group-exec+setgid targets, and the caller
 * needs read+write access) — NOTE(review): semantics assumed from the
 * structure; confirm against the kernel sysctl documentation.
 *
 * @return 0 if permitted, -CEPHFS_EPERM / -CEPHFS_EACCES otherwise.
 */
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // Root and the file's owner may always link.
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  // Everything below denies with EPERM unless the final check passes.
  r = -CEPHFS_EPERM;
  // Only regular files may be linked by non-owners.
  if (!S_ISREG(in->mode))
    goto out;

  // Never allow non-owners to link setuid files.
  if (in->mode & S_ISUID)
    goto out;

  // Nor setgid files that are group-executable.
  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  // Otherwise require read+write access to the target.
  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5927
5928int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5929{
5930 int mask = CEPH_STAT_CAP_MODE;
5931 bool force = false;
5932 if (acl_type != NO_ACL) {
5933 mask |= CEPH_STAT_CAP_XATTR;
5934 force = in->xattr_version == 0;
5935 }
5936 return _getattr(in, mask, perms, force);
5937}
5938
5939vinodeno_t Client::_get_vino(Inode *in)
5940{
5941 /* The caller must hold the client lock */
5942 return vinodeno_t(in->ino, in->snapid);
5943}
5944
7c673cae
FG
5945/**
5946 * Resolve an MDS spec to a list of MDS daemon GIDs.
5947 *
5948 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5949 * It may be '*' in which case it matches all GIDs.
5950 *
5951 * If no error is returned, the `targets` vector will be populated with at least
5952 * one MDS.
5953 */
5954int Client::resolve_mds(
5955 const std::string &mds_spec,
5956 std::vector<mds_gid_t> *targets)
5957{
11fdf7f2
TL
5958 ceph_assert(fsmap);
5959 ceph_assert(targets != nullptr);
7c673cae
FG
5960
5961 mds_role_t role;
f67539c2
TL
5962 CachedStackStringStream css;
5963 int role_r = fsmap->parse_role(mds_spec, &role, *css);
7c673cae
FG
5964 if (role_r == 0) {
5965 // We got a role, resolve it to a GID
f67539c2
TL
5966 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5967 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5968 << role << "' aka " << info.human_name() << dendl;
5969 targets->push_back(info.global_id);
7c673cae
FG
5970 return 0;
5971 }
5972
5973 std::string strtol_err;
5974 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5975 if (strtol_err.empty()) {
5976 // It is a possible GID
5977 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5978 if (fsmap->gid_exists(mds_gid)) {
f67539c2
TL
5979 auto& info = fsmap->get_info_gid(mds_gid);
5980 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5981 << info.human_name() << dendl;
7c673cae 5982 targets->push_back(mds_gid);
f67539c2 5983 return 0;
7c673cae 5984 } else {
f67539c2 5985 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
7c673cae 5986 << dendl;
f67539c2
TL
5987 lderr(cct) << "FSMap: " << *fsmap << dendl;
5988 return -CEPHFS_ENOENT;
7c673cae
FG
5989 }
5990 } else if (mds_spec == "*") {
5991 // It is a wildcard: use all MDSs
f67539c2 5992 const auto& mds_info = fsmap->get_mds_info();
7c673cae 5993
f67539c2 5994 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
7c673cae 5995 if (mds_info.empty()) {
f67539c2
TL
5996 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5997 lderr(cct) << "FSMap: " << *fsmap << dendl;
5998 return -CEPHFS_ENOENT;
7c673cae
FG
5999 }
6000
f67539c2
TL
6001 for (const auto& [gid, info] : mds_info) {
6002 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
6003 targets->push_back(gid);
7c673cae 6004 }
f67539c2 6005 return 0;
7c673cae
FG
6006 } else {
6007 // It did not parse as an integer, it is not a wildcard, it must be a name
6008 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
6009 if (mds_gid == 0) {
f67539c2 6010 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
7c673cae 6011 lderr(cct) << "FSMap: " << *fsmap << dendl;
f67539c2 6012 return -CEPHFS_ENOENT;
7c673cae 6013 } else {
f67539c2
TL
6014 auto& info = fsmap->get_info_gid(mds_gid);
6015 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
6016 << "' to " << info.human_name() << dendl;
7c673cae
FG
6017 targets->push_back(mds_gid);
6018 }
f67539c2 6019 return 0;
7c673cae 6020 }
7c673cae
FG
6021}
6022
6023
6024/**
6025 * Authenticate with mon and establish global ID
6026 */
6027int Client::authenticate()
6028{
9f95a23c 6029 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
6030
6031 if (monclient->is_authenticated()) {
6032 return 0;
6033 }
6034
9f95a23c 6035 client_lock.unlock();
2a845540 6036 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
9f95a23c 6037 client_lock.lock();
7c673cae
FG
6038 if (r < 0) {
6039 return r;
6040 }
6041
6042 whoami = monclient->get_global_id();
6043 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6044
6045 return 0;
6046}
6047
/**
 * Fetch the latest FSMap (or the trimmed FSMapUser) from the monitors.
 *
 * Must be called with client_lock held; the lock is dropped around the
 * blocking get_version() call and while waiting for the subscription to
 * deliver the map.
 *
 * @param user true to fetch `fsmap.user` into fsmap_user, false to
 *             fetch the full `fsmap` into fsmap.
 * @return 0 on success, negative error otherwise.
 */
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // get_version() blocks; drop the client lock while it runs and
    // retry on transient EAGAIN-style failures.
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // Subscribe (one-shot) and wait until our cached fsmap.user catches
    // up with the version the monitors reported.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // Same dance for the full FSMap.
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
6092
6093/**
6094 *
6095 * @mds_spec one of ID, rank, GID, "*"
6096 *
6097 */
6098int Client::mds_command(
6099 const std::string &mds_spec,
6100 const vector<string>& cmd,
6101 const bufferlist& inbl,
6102 bufferlist *outbl,
6103 string *outs,
6104 Context *onfinish)
6105{
f67539c2
TL
6106 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6107 if (!iref_reader.is_state_satisfied())
6108 return -CEPHFS_ENOTCONN;
7c673cae 6109
f67539c2 6110 std::unique_lock cl(client_lock);
7c673cae
FG
6111
6112 int r;
6113 r = authenticate();
6114 if (r < 0) {
6115 return r;
6116 }
6117
6118 r = fetch_fsmap(false);
6119 if (r < 0) {
6120 return r;
6121 }
6122
6123 // Look up MDS target(s) of the command
6124 std::vector<mds_gid_t> targets;
6125 r = resolve_mds(mds_spec, &targets);
6126 if (r < 0) {
6127 return r;
6128 }
6129
6130 // If daemons are laggy, we won't send them commands. If all
6131 // are laggy then we fail.
6132 std::vector<mds_gid_t> non_laggy;
f67539c2 6133 for (const auto& gid : targets) {
7c673cae
FG
6134 const auto info = fsmap->get_info_gid(gid);
6135 if (!info.laggy()) {
6136 non_laggy.push_back(gid);
6137 }
6138 }
6139 if (non_laggy.size() == 0) {
6140 *outs = "All targeted MDS daemons are laggy";
f67539c2 6141 return -CEPHFS_ENOENT;
7c673cae
FG
6142 }
6143
6144 if (metadata.empty()) {
6145 // We are called on an unmounted client, so metadata
6146 // won't be initialized yet.
6147 populate_metadata("");
6148 }
6149
6150 // Send commands to targets
6151 C_GatherBuilder gather(cct, onfinish);
f67539c2 6152 for (const auto& target_gid : non_laggy) {
7c673cae
FG
6153 const auto info = fsmap->get_info_gid(target_gid);
6154
6155 // Open a connection to the target MDS
11fdf7f2 6156 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
7c673cae 6157
f67539c2
TL
6158 cl.unlock();
6159 {
6160 std::scoped_lock cmd_lock(command_lock);
6161 // Generate MDSCommandOp state
6162 auto &op = command_table.start_command();
7c673cae 6163
f67539c2
TL
6164 op.on_finish = gather.new_sub();
6165 op.cmd = cmd;
6166 op.outbl = outbl;
6167 op.outs = outs;
6168 op.inbl = inbl;
6169 op.mds_gid = target_gid;
6170 op.con = conn;
7c673cae 6171
f67539c2
TL
6172 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6173 << " tid=" << op.tid << cmd << dendl;
7c673cae 6174
f67539c2
TL
6175 // Construct and send MCommand
6176 MessageRef m = op.get_message(monclient->get_fsid());
6177 conn->send_message2(std::move(m));
6178 }
6179 cl.lock();
7c673cae
FG
6180 }
6181 gather.activate();
6182
6183 return 0;
6184}
6185
11fdf7f2 6186void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
6187{
6188 ceph_tid_t const tid = m->get_tid();
6189
6190 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6191
f67539c2 6192 std::scoped_lock cmd_lock(command_lock);
7c673cae
FG
6193 if (!command_table.exists(tid)) {
6194 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
6195 return;
6196 }
6197
6198 auto &op = command_table.get_command(tid);
6199 if (op.outbl) {
11fdf7f2 6200 *op.outbl = m->get_data();
7c673cae
FG
6201 }
6202 if (op.outs) {
6203 *op.outs = m->rs;
6204 }
6205
6206 if (op.on_finish) {
6207 op.on_finish->complete(m->r);
6208 }
6209
6210 command_table.erase(tid);
7c673cae
FG
6211}
6212
6213// -------------------
6214// MOUNT
6215
11fdf7f2 6216int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 6217{
7c673cae
FG
6218 int r = authenticate();
6219 if (r < 0) {
6220 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6221 return r;
6222 }
6223
11fdf7f2
TL
6224 std::string resolved_fs_name;
6225 if (fs_name.empty()) {
9f95a23c
TL
6226 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6227 if (resolved_fs_name.empty())
6228 // Try the backwards compatibility fs name option
6229 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
6230 } else {
6231 resolved_fs_name = fs_name;
6232 }
6233
7c673cae 6234 std::string want = "mdsmap";
11fdf7f2 6235 if (!resolved_fs_name.empty()) {
7c673cae
FG
6236 r = fetch_fsmap(true);
6237 if (r < 0)
6238 return r;
11fdf7f2
TL
6239 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6240 if (fscid == FS_CLUSTER_ID_NONE) {
f67539c2 6241 return -CEPHFS_ENOENT;
11fdf7f2 6242 }
7c673cae
FG
6243
6244 std::ostringstream oss;
11fdf7f2 6245 oss << want << "." << fscid;
7c673cae
FG
6246 want = oss.str();
6247 }
6248 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6249
6250 monclient->sub_want(want, 0, 0);
6251 monclient->renew_subs();
6252
11fdf7f2
TL
6253 return 0;
6254}
6255
/**
 * Mount the filesystem: subscribe to the MDSMap, optionally wait for an
 * available MDS cluster, then getattr-walk from the mount root up to "/"
 * to populate the root inode (and quota ancestry).
 *
 * @param mount_root  path to mount (empty means "/")
 * @param perms       credentials used for the initial getattrs
 * @param require_mds fail with CEPH_FUSE_NO_MDS_UP if the MDS cluster is
 *                    stuck unavailable instead of proceeding
 * @param fs_name     filesystem name (empty: config default)
 * @return 0 on success (also if already mounting/mounted), negative
 *         error otherwise.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Spin (sleeping on mdsmap updates) until the cluster is usable.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // Getattr the mount point, then each ancestor up to "/", so quota
  // information along the path is cached.  EACCES on an ancestor is
  // tolerated (quotas may just not work).
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6352
6353// UNMOUNT
6354
/**
 * Close every MDS session and wait for the peers to acknowledge,
 * honoring the client_shutdown_timeout config (0 = wait forever).
 * Sessions already REJECTED are simply dropped.  Called with
 * client_lock held (see the adopt_lock below).
 */
void Client::_close_sessions()
{
  // Drop sessions the MDS already rejected; there is nothing to close.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
	_close_mds_session(p.second.get());
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Adopt the already-held client_lock into a unique_lock so we can
    // wait on mount_cond; release() below hands ownership back without
    // unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait indefinitely.
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6393
522d829b
TL
6394void Client::flush_mdlog_sync(Inode *in)
6395{
6396 if (in->unsafe_ops.empty()) {
6397 return;
6398 }
6399
6400 std::set<mds_rank_t> anchor;
6401 for (auto &&p : in->unsafe_ops) {
6402 anchor.emplace(p->mds);
6403 }
6404 if (in->auth_cap) {
6405 anchor.emplace(in->auth_cap->session->mds_num);
6406 }
6407
6408 for (auto &rank : anchor) {
6409 auto session = &mds_sessions.at(rank);
20effc67 6410 flush_mdlog(session->get());
522d829b
TL
6411 }
6412}
6413
31f18b77
FG
6414void Client::flush_mdlog_sync()
6415{
522d829b 6416 if (mds_requests.empty())
31f18b77 6417 return;
11fdf7f2 6418 for (auto &p : mds_sessions) {
20effc67 6419 flush_mdlog(p.second.get());
31f18b77
FG
6420 }
6421}
6422
6423void Client::flush_mdlog(MetaSession *session)
6424{
6425 // Only send this to Luminous or newer MDS daemons, older daemons
6426 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6427 const uint64_t features = session->con->get_features();
6428 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
9f95a23c 6429 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
11fdf7f2 6430 session->con->send_message2(std::move(m));
31f18b77
FG
6431 }
6432}
6433
6434
11fdf7f2
TL
/**
 * Abort every in-flight MDS request with @err and force-close all MDS
 * sessions.  Used on abnormal teardown (abort/blocklist); nothing is
 * flushed.
 *
 * @param err negative error delivered to request waiters and session
 *            close paths.
 */
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // wake the thread blocked waiting for this request's reply
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto session = mds_sessions.begin()->second;
    _closed_mds_session(session.get(), err);
  }
}
6462
/**
 * Tear the client down.
 *
 * @param abort true for an abnormal teardown (abort_conn): sessions and
 *              in-flight writes are cancelled and dirty state dropped;
 *              false for a clean unmount, which flushes everything first.
 *
 * Ordering matters throughout: state transition -> drain requests ->
 * drop open handles -> flush/purge caches -> drain caps -> stop tick
 * thread -> close sessions -> release the global snap realm.
 */
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for every outstanding MDS request to complete or be aborted.
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // Same for low-level (libcephfs ll_*) handles the caller leaked.
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // And leaked open directories.
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      // Aborting/blocklisted: throw buffered data away; otherwise flush
      // it out through the object cacher.
      if (abort || blocklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // Drop dirty caps on the floor -- we cannot write them back.
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
	Inode *in = *p;
	++p;
	if (in->dirty_caps) {
	  ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	  in->mark_caps_clean();
	  put_inode(in);
	}
      }
    }
  } else {
    // Clean unmount: flush caps and wait for the MDS to ack them all.
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait (logging every 5s) until all inodes are released; a stall here
  // usually means caps have not been released yet.
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  // release the global snapshot realm
  SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
  if (global_realm) {
    ceph_assert(global_realm->nref == 1);
    put_snap_realm(global_realm);
  }

  mref_writer.update_state(CLIENT_UNMOUNTED);

  ldout(cct, 2) << "unmounted." << dendl;
}
6616
b32b8144
FG
6617void Client::unmount()
6618{
11fdf7f2
TL
6619 _unmount(false);
6620}
6621
6622void Client::abort_conn()
6623{
11fdf7f2 6624 _unmount(true);
b32b8144
FG
6625}
6626
7c673cae
FG
6627void Client::flush_cap_releases()
6628{
f67539c2
TL
6629 uint64_t nr_caps = 0;
6630
7c673cae 6631 // send any cap releases
11fdf7f2 6632 for (auto &p : mds_sessions) {
20effc67
TL
6633 auto session = p.second;
6634 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
11fdf7f2 6635 p.first)) {
20effc67 6636 nr_caps += session->release->caps.size();
7c673cae
FG
6637 if (cct->_conf->client_inject_release_failure) {
6638 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6639 } else {
20effc67 6640 session->con->send_message2(std::move(session->release));
7c673cae 6641 }
20effc67 6642 session->release.reset();
7c673cae
FG
6643 }
6644 }
f67539c2
TL
6645
6646 if (nr_caps > 0) {
6647 dec_pinned_icaps(nr_caps);
6648 }
7c673cae
FG
6649}
6650
f67539c2 6651void Client::renew_and_flush_cap_releases()
7c673cae 6652{
f67539c2
TL
6653 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6654
6655 if (!mount_aborted && mdsmap->get_epoch()) {
6656 // renew caps?
2a845540
TL
6657 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6658 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
f67539c2
TL
6659 renew_caps();
6660
6661 flush_cap_releases();
7c673cae 6662 }
f67539c2
TL
6663}
6664
// Periodic upkeep, driven by the upkeep thread (see start_tick_thread()).
// Runs with client_lock held.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  auto now = ceph::coarse_mono_clock::now();

  /*
   * If the mount() is not finished
   */
  // Abort the oldest pending metadata request once it has been waiting
  // longer than mount_timeout, and wake everything blocked on it.
  if (is_mounting() && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;

    if (req->created + mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps
  // Walk inodes whose cap check was deferred; the list is ordered by
  // hold_caps_until, so stop at the first entry that is still in the future.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // If we were blocklisted, optionally attempt an automatic reconnect
  // after 30 minutes (gated by the client_reconnect_stale config).
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6720
f67539c2
TL
// Spawn the upkeep thread that periodically runs tick() until
// tick_thread_stopped is set (by _unmount()), which also signals
// upkeep_cond to wake the thread promptly.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    // client_lock is held while ticking and released during the
    // condition-variable wait below
    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      // re-read both intervals every iteration so runtime config changes
      // take effect; client_debug_inject_tick_delay stretches the period
      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // allow a 10% early wakeup to still count as on-time
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        // woke up early: only wait the remaining time
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6751
6752void Client::collect_and_send_metrics() {
6753 ldout(cct, 20) << __func__ << dendl;
6754
6755 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6756
6757 // right now, we only track and send global metrics. its sufficient
6758 // to send these metrics to MDS rank0.
6759 collect_and_send_global_metrics();
6760}
6761
6762void Client::collect_and_send_global_metrics() {
6763 ldout(cct, 20) << __func__ << dendl;
6764 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6765
6766 if (!have_open_session((mds_rank_t)0)) {
6767 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6768 << dendl;
6769 return;
6770 }
6771 auto session = _get_or_open_mds_session((mds_rank_t)0);
6772 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6773 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6774 return;
6775 }
6776
6777 ClientMetricMessage metric;
6778 std::vector<ClientMetricMessage> message;
6779
6780 // read latency
33c7a0ef
TL
6781 if (_collect_and_send_global_metrics ||
6782 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
2a845540
TL
6783 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
6784 logger->tget(l_c_rd_avg),
6785 logger->get(l_c_rd_sqsum),
6786 nr_read_request));
33c7a0ef
TL
6787 message.push_back(metric);
6788 }
f67539c2
TL
6789
6790 // write latency
33c7a0ef
TL
6791 if (_collect_and_send_global_metrics ||
6792 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
2a845540
TL
6793 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
6794 logger->tget(l_c_wr_avg),
6795 logger->get(l_c_wr_sqsum),
6796 nr_write_request));
33c7a0ef
TL
6797 message.push_back(metric);
6798 }
f67539c2
TL
6799
6800 // metadata latency
33c7a0ef
TL
6801 if (_collect_and_send_global_metrics ||
6802 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
2a845540
TL
6803 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
6804 logger->tget(l_c_md_avg),
6805 logger->get(l_c_md_sqsum),
6806 nr_metadata_request));
33c7a0ef
TL
6807 message.push_back(metric);
6808 }
f67539c2
TL
6809
6810 // cap hit ratio -- nr_caps is unused right now
33c7a0ef
TL
6811 if (_collect_and_send_global_metrics ||
6812 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
6813 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6814 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6815 message.push_back(metric);
6816 }
f67539c2
TL
6817
6818 // dentry lease hit ratio
33c7a0ef
TL
6819 if (_collect_and_send_global_metrics ||
6820 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
6821 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6822 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6823 message.push_back(metric);
6824 }
f67539c2
TL
6825
6826 // opened files
33c7a0ef
TL
6827 if (_collect_and_send_global_metrics ||
6828 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
f67539c2
TL
6829 auto [opened_files, total_inodes] = get_opened_files_rates();
6830 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
33c7a0ef 6831 message.push_back(metric);
f67539c2 6832 }
f67539c2
TL
6833
6834 // pinned i_caps
33c7a0ef
TL
6835 if (_collect_and_send_global_metrics ||
6836 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
f67539c2
TL
6837 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6838 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
33c7a0ef 6839 message.push_back(metric);
f67539c2 6840 }
f67539c2
TL
6841
6842 // opened inodes
33c7a0ef
TL
6843 if (_collect_and_send_global_metrics ||
6844 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
f67539c2
TL
6845 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6846 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
33c7a0ef 6847 message.push_back(metric);
f67539c2 6848 }
f67539c2 6849
a4b75251 6850 // read io sizes
33c7a0ef
TL
6851 if (_collect_and_send_global_metrics ||
6852 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
6853 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6854 total_read_size));
6855 message.push_back(metric);
6856 }
a4b75251
TL
6857
6858 // write io sizes
33c7a0ef
TL
6859 if (_collect_and_send_global_metrics ||
6860 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
6861 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6862 total_write_size));
6863 message.push_back(metric);
6864 }
a4b75251 6865
f67539c2
TL
6866 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6867}
6868
7c673cae
FG
6869void Client::renew_caps()
6870{
6871 ldout(cct, 10) << "renew_caps()" << dendl;
2a845540 6872 last_cap_renew = ceph::coarse_mono_clock::now();
7c673cae 6873
11fdf7f2
TL
6874 for (auto &p : mds_sessions) {
6875 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6876 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
20effc67 6877 renew_caps(p.second.get());
7c673cae
FG
6878 }
6879}
6880
6881void Client::renew_caps(MetaSession *session)
6882{
6883 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6884 session->last_cap_renew_request = ceph_clock_now();
6885 uint64_t seq = ++session->cap_renew_seq;
9f95a23c 6886 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6887}
6888
6889
6890// ===============================================================
6891// high level (POSIXy) interface
6892
6893int Client::_do_lookup(Inode *dir, const string& name, int mask,
6894 InodeRef *target, const UserPerm& perms)
6895{
6896 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6897 MetaRequest *req = new MetaRequest(op);
6898 filepath path;
6899 dir->make_nosnap_relative_path(path);
6900 path.push_dentry(name);
6901 req->set_filepath(path);
6902 req->set_inode(dir);
6903 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6904 mask |= DEBUG_GETATTR_CAPS;
6905 req->head.args.getattr.mask = mask;
6906
11fdf7f2 6907 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6908
6909 int r = make_request(req, perms, target);
11fdf7f2 6910 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6911 return r;
6912}
6913
f67539c2
TL
6914bool Client::_dentry_valid(const Dentry *dn)
6915{
6916 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6917
6918 // is dn lease valid?
6919 utime_t now = ceph_clock_now();
6920 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6921 mds_sessions.count(dn->lease_mds)) {
20effc67
TL
6922 auto s = mds_sessions.at(dn->lease_mds);
6923 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
f67539c2
TL
6924 dlease_hit();
6925 return true;
6926 }
6927
20effc67 6928 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
f67539c2
TL
6929 << " vs lease_gen " << dn->lease_gen << dendl;
6930 }
6931
6932 dlease_miss();
6933 return false;
6934}
6935
// Resolve one path component 'dname' under 'dir'.  Tries hard to answer
// from the local cache (valid dentry lease, or dir FILE_SHARED caps with a
// complete directory) before falling back to one MDS lookup; after a
// successful MDS lookup it re-runs the cache path once so the dentry (and
// its alternate_name) can be returned.  Negative CEPHFS_* errno on failure.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  // set once we have asked the MDS, so the relookup pass cannot loop
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    // no cached parent dentry: ask any MDS for the parent (LOOKUPPARENT)
    if (dir->dentries.empty()) {
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	*target = std::move(tmptarget);
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// fall back to the directory itself (e.g. at the root)
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // the virtual ".snap" directory is synthesized locally
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
		   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    // cached dentry is only usable if the inode's caps cover 'mask'
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
	// touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	// make trim_caps() behave.
	dir->try_touch_cap(dn->lease_mds);
	goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// a null dentry in a complete directory proves the name is absent
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -CEPHFS_ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // cache path failed after an MDS lookup already succeeded: the target is
  // set, so return success without a cached dentry
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
7056
7057int Client::get_or_create(Inode *dir, const char* name,
7058 Dentry **pdn, bool expect_null)
7059{
7060 // lookup
11fdf7f2 7061 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
7062 dir->open_dir();
7063 if (dir->dir->dentries.count(name)) {
7064 Dentry *dn = dir->dir->dentries[name];
f67539c2
TL
7065 if (_dentry_valid(dn)) {
7066 if (expect_null)
7067 return -CEPHFS_EEXIST;
7c673cae
FG
7068 }
7069 *pdn = dn;
7070 } else {
7071 // otherwise link up a new one
7072 *pdn = link(dir->dir, name, NULL, NULL);
7073 }
7074
7075 // success
7076 return 0;
7077}
7078
f67539c2
TL
7079int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7080{
7081 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7082 if (!mref_reader.is_state_satisfied())
7083 return -CEPHFS_ENOTCONN;
7084
7085 ldout(cct, 10) << __func__ << ": " << path << dendl;
7086
7087 std::scoped_lock lock(client_lock);
7088
7089 return path_walk(path, wdr, perms, followsym);
7090}
7091
7c673cae 7092int Client::path_walk(const filepath& origpath, InodeRef *end,
b3b6e05e 7093 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
f67539c2
TL
7094{
7095 walk_dentry_result wdr;
b3b6e05e 7096 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
f67539c2
TL
7097 *end = std::move(wdr.in);
7098 return rc;
7099}
7100
b3b6e05e
TL
// Resolve 'origpath' component by component, starting from the root
// (absolute path), the cwd, or the supplied 'dirinode'.  Directory-level
// symlinks are always followed; a trailing symlink only when 'followsym'.
// On success fills *result with the final inode and its alternate_name.
// Returns 0 or a negative CEPHFS_* errno (ELOOP after MAXSYMLINKS).
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
		      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute symlink target restarts resolution from the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7189
7190
7191// namespace ops
7192
f67539c2 7193int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7c673cae 7194{
f67539c2
TL
7195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7196 if (!mref_reader.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN;
7198
7c673cae
FG
7199 tout(cct) << "link" << std::endl;
7200 tout(cct) << relexisting << std::endl;
7201 tout(cct) << relpath << std::endl;
7202
7203 filepath existing(relexisting);
7204
7205 InodeRef in, dir;
f67539c2
TL
7206
7207 std::scoped_lock lock(client_lock);
7c673cae
FG
7208 int r = path_walk(existing, &in, perm, true);
7209 if (r < 0)
7210 return r;
7211 if (std::string(relpath) == "/") {
f67539c2 7212 r = -CEPHFS_EEXIST;
7c673cae
FG
7213 return r;
7214 }
7215 filepath path(relpath);
7216 string name = path.last_dentry();
7217 path.pop_dentry();
7218
7219 r = path_walk(path, &dir, perm, true);
7220 if (r < 0)
7221 return r;
7222 if (cct->_conf->client_permissions) {
7223 if (S_ISDIR(in->mode)) {
f67539c2 7224 r = -CEPHFS_EPERM;
7c673cae
FG
7225 return r;
7226 }
7227 r = may_hardlink(in.get(), perm);
7228 if (r < 0)
7229 return r;
7230 r = may_create(dir.get(), perm);
7231 if (r < 0)
7232 return r;
7233 }
f67539c2 7234 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7235 return r;
7236}
7237
7238int Client::unlink(const char *relpath, const UserPerm& perm)
b3b6e05e
TL
7239{
7240 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7241}
7242
7243int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7c673cae 7244{
f67539c2 7245 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7246 if (!mref_reader.is_state_satisfied()) {
f67539c2 7247 return -CEPHFS_ENOTCONN;
b3b6e05e 7248 }
f67539c2 7249
11fdf7f2 7250 tout(cct) << __func__ << std::endl;
b3b6e05e 7251 tout(cct) << dirfd << std::endl;
7c673cae 7252 tout(cct) << relpath << std::endl;
b3b6e05e 7253 tout(cct) << flags << std::endl;
7c673cae 7254
b3b6e05e
TL
7255 if (std::string(relpath) == "/") {
7256 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7257 }
7c673cae
FG
7258
7259 filepath path(relpath);
7260 string name = path.last_dentry();
7261 path.pop_dentry();
7262 InodeRef dir;
f67539c2
TL
7263
7264 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7265
7266 InodeRef dirinode;
7267 int r = get_fd_inode(dirfd, &dirinode);
7268 if (r < 0) {
7269 return r;
7270 }
7271
7272 r = path_walk(path, &dir, perm, true, 0, dirinode);
7273 if (r < 0) {
7c673cae 7274 return r;
b3b6e05e 7275 }
7c673cae
FG
7276 if (cct->_conf->client_permissions) {
7277 r = may_delete(dir.get(), name.c_str(), perm);
b3b6e05e 7278 if (r < 0) {
7c673cae 7279 return r;
b3b6e05e 7280 }
7c673cae 7281 }
b3b6e05e
TL
7282 if (flags & AT_REMOVEDIR) {
7283 r = _rmdir(dir.get(), name.c_str(), perm);
7284 } else {
7285 r = _unlink(dir.get(), name.c_str(), perm);
7286 }
7287 return r;
7c673cae
FG
7288}
7289
f67539c2 7290int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7c673cae 7291{
f67539c2
TL
7292 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7293 if (!mref_reader.is_state_satisfied())
7294 return -CEPHFS_ENOTCONN;
7295
11fdf7f2 7296 tout(cct) << __func__ << std::endl;
7c673cae
FG
7297 tout(cct) << relfrom << std::endl;
7298 tout(cct) << relto << std::endl;
7299
7300 if (std::string(relfrom) == "/" || std::string(relto) == "/")
f67539c2 7301 return -CEPHFS_EBUSY;
7c673cae
FG
7302
7303 filepath from(relfrom);
7304 filepath to(relto);
7305 string fromname = from.last_dentry();
7306 from.pop_dentry();
7307 string toname = to.last_dentry();
7308 to.pop_dentry();
7309
7310 InodeRef fromdir, todir;
f67539c2
TL
7311
7312 std::scoped_lock lock(client_lock);
7c673cae
FG
7313 int r = path_walk(from, &fromdir, perm);
7314 if (r < 0)
7315 goto out;
7316 r = path_walk(to, &todir, perm);
7317 if (r < 0)
7318 goto out;
7319
7320 if (cct->_conf->client_permissions) {
7321 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7322 if (r < 0)
7323 return r;
7324 r = may_delete(todir.get(), toname.c_str(), perm);
f67539c2 7325 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
7326 return r;
7327 }
f67539c2 7328 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7329out:
7330 return r;
7331}
7332
7333// dirs
7334
f67539c2 7335int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
b3b6e05e
TL
7336{
7337 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7338}
7339
7340int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7341 std::string alternate_name)
7c673cae 7342{
f67539c2
TL
7343 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7344 if (!mref_reader.is_state_satisfied())
7345 return -CEPHFS_ENOTCONN;
7346
11fdf7f2 7347 tout(cct) << __func__ << std::endl;
b3b6e05e 7348 tout(cct) << dirfd << std::endl;
7c673cae
FG
7349 tout(cct) << relpath << std::endl;
7350 tout(cct) << mode << std::endl;
11fdf7f2 7351 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 7352
b3b6e05e 7353 if (std::string(relpath) == "/") {
f67539c2 7354 return -CEPHFS_EEXIST;
b3b6e05e 7355 }
7c673cae
FG
7356
7357 filepath path(relpath);
7358 string name = path.last_dentry();
7359 path.pop_dentry();
7360 InodeRef dir;
f67539c2
TL
7361
7362 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7363
7364 InodeRef dirinode;
7365 int r = get_fd_inode(dirfd, &dirinode);
7366 if (r < 0) {
7c673cae 7367 return r;
b3b6e05e
TL
7368 }
7369
7370 r = path_walk(path, &dir, perm, true, 0, dirinode);
7371 if (r < 0) {
7372 return r;
7373 }
7c673cae
FG
7374 if (cct->_conf->client_permissions) {
7375 r = may_create(dir.get(), perm);
b3b6e05e 7376 if (r < 0) {
7c673cae 7377 return r;
b3b6e05e 7378 }
7c673cae 7379 }
f67539c2 7380 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7c673cae
FG
7381}
7382
// mkdir -p: walk as far down 'relpath' as already exists, then create every
// remaining component.  Tolerates a concurrent creator by re-looking-up a
// component whose _mkdir() races to EEXIST (except for the final one, where
// EEXIST is reported to the caller).
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only ENOENT means "continue creating"; anything else (including 0,
  // i.e. the whole path already exists) is returned as-is
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      // someone else created this intermediate dir first; look it up
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7439
7440int Client::rmdir(const char *relpath, const UserPerm& perms)
7441{
b3b6e05e 7442 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7c673cae
FG
7443}
7444
7445int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
f67539c2
TL
7446{
7447 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7448 if (!mref_reader.is_state_satisfied())
7449 return -CEPHFS_ENOTCONN;
7450
11fdf7f2 7451 tout(cct) << __func__ << std::endl;
7c673cae
FG
7452 tout(cct) << relpath << std::endl;
7453 tout(cct) << mode << std::endl;
7454 tout(cct) << rdev << std::endl;
7455
7456 if (std::string(relpath) == "/")
f67539c2 7457 return -CEPHFS_EEXIST;
7c673cae
FG
7458
7459 filepath path(relpath);
7460 string name = path.last_dentry();
7461 path.pop_dentry();
7462 InodeRef dir;
f67539c2
TL
7463
7464 std::scoped_lock lock(client_lock);
7c673cae
FG
7465 int r = path_walk(path, &dir, perms);
7466 if (r < 0)
7467 return r;
7468 if (cct->_conf->client_permissions) {
7469 int r = may_create(dir.get(), perms);
7470 if (r < 0)
7471 return r;
7472 }
7473 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7474}
7475
7476// symlinks
7477
f67539c2 7478int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
b3b6e05e
TL
7479{
7480 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7481}
7482
7483int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7484 std::string alternate_name)
7c673cae 7485{
f67539c2 7486 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7487 if (!mref_reader.is_state_satisfied()) {
f67539c2 7488 return -CEPHFS_ENOTCONN;
b3b6e05e 7489 }
f67539c2 7490
11fdf7f2 7491 tout(cct) << __func__ << std::endl;
7c673cae 7492 tout(cct) << target << std::endl;
b3b6e05e 7493 tout(cct) << dirfd << std::endl;
7c673cae
FG
7494 tout(cct) << relpath << std::endl;
7495
b3b6e05e 7496 if (std::string(relpath) == "/") {
f67539c2 7497 return -CEPHFS_EEXIST;
b3b6e05e 7498 }
7c673cae
FG
7499
7500 filepath path(relpath);
7501 string name = path.last_dentry();
7502 path.pop_dentry();
7503 InodeRef dir;
f67539c2
TL
7504
7505 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7506
7507 InodeRef dirinode;
7508 int r = get_fd_inode(dirfd, &dirinode);
7509 if (r < 0) {
7c673cae 7510 return r;
b3b6e05e
TL
7511 }
7512 r = path_walk(path, &dir, perms, true, 0, dirinode);
7513 if (r < 0) {
7514 return r;
7515 }
7c673cae
FG
7516 if (cct->_conf->client_permissions) {
7517 int r = may_create(dir.get(), perms);
b3b6e05e 7518 if (r < 0) {
7c673cae 7519 return r;
b3b6e05e 7520 }
7c673cae 7521 }
f67539c2 7522 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7c673cae
FG
7523}
7524
7525int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7526{
b3b6e05e
TL
7527 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7528}
7529
7530int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
f67539c2 7531 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7532 if (!mref_reader.is_state_satisfied()) {
f67539c2 7533 return -CEPHFS_ENOTCONN;
b3b6e05e 7534 }
f67539c2 7535
11fdf7f2 7536 tout(cct) << __func__ << std::endl;
b3b6e05e 7537 tout(cct) << dirfd << std::endl;
7c673cae
FG
7538 tout(cct) << relpath << std::endl;
7539
b3b6e05e 7540 InodeRef dirinode;
f67539c2 7541 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7542 int r = get_fd_inode(dirfd, &dirinode);
7543 if (r < 0) {
7c673cae 7544 return r;
b3b6e05e
TL
7545 }
7546
7547 InodeRef in;
7548 filepath path(relpath);
7549 r = path_walk(path, &in, perms, false, 0, dirinode);
7550 if (r < 0) {
7551 return r;
7552 }
7c673cae
FG
7553
7554 return _readlink(in.get(), buf, size);
7555}
7556
7557int Client::_readlink(Inode *in, char *buf, size_t size)
7558{
7559 if (!in->is_symlink())
f67539c2 7560 return -CEPHFS_EINVAL;
7c673cae
FG
7561
7562 // copy into buf (at most size bytes)
7563 int r = in->symlink.length();
7564 if (r > (int)size)
7565 r = size;
7566 memcpy(buf, in->symlink.c_str(), r);
7567 return r;
7568}
7569
7570
7571// inode stuff
7572
7573int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7574{
94b18763 7575 bool yes = in->caps_issued_mask(mask, true);
7c673cae 7576
11fdf7f2 7577 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
7578 if (yes && !force)
7579 return 0;
7580
7581 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7582 filepath path;
7583 in->make_nosnap_relative_path(path);
7584 req->set_filepath(path);
7585 req->set_inode(in);
7586 req->head.args.getattr.mask = mask;
7587
7588 int res = make_request(req, perms);
11fdf7f2 7589 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
7590 return res;
7591}
7592
1d09f67e
TL
/**
 * Fetch a "virtual" xattr (an attribute computed by the MDS rather than
 * stored in the inode's xattr map) for @in.
 *
 * @param in inode to query
 * @param perms caller credentials
 * @param xattr_name vxattr name; must be 1..255 characters
 * @param size capacity of @value; 0 means "only report the length"
 *        (see getxattr(2))
 * @param value output buffer; may be unused when @size == 0
 * @param rank MDS rank to direct the request at
 * @return attribute length on success, -CEPHFS_ERANGE if @value is too
 *         small, -CEPHFS_ENODATA for a bad name, or another negative
 *         error from the request
 */
int Client::_getvxattr(
  Inode *in,
  const UserPerm& perms,
  const char *xattr_name,
  ssize_t size,
  void *value,
  mds_rank_t rank)
{
  // Reject null/empty/oversized names up front.
  if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
    return -CEPHFS_ENODATA;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->set_string2(xattr_name);

  // The vxattr value comes back in the reply's data payload.
  bufferlist bl;
  int res = make_request(req, perms, nullptr, nullptr, rank, &bl);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;

  if (res < 0) {
    return res;
  }

  // Payload is a single versioned, encoded string.
  std::string buf;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  decode(buf, p);
  DECODE_FINISH(p);

  ssize_t len = buf.length();

  res = len; // refer to man getxattr(2) for output buffer size == 0

  if (size > 0) {
    if (len > size) {
      res = -CEPHFS_ERANGE; // insufficient output buffer space
    } else {
      memcpy(value, buf.c_str(), len);
    }
  }
  return res;
}
7640
7c673cae
FG
/**
 * Core setattr implementation.
 *
 * For each attribute bit in @mask this decides whether the change can be
 * applied purely locally (we hold the relevant exclusive cap, so the dirty
 * state will be flushed to the MDS later) or must be sent synchronously to
 * the MDS.  Bits that were satisfied locally are cleared from @mask; any
 * bits still set at the end are packed into a CEPH_MDS_OP_SETATTR request.
 *
 * @param in inode being changed
 * @param stx new attribute values; only fields named by @mask are read
 * @param mask CEPH_SETATTR_* bits selecting which fields to change
 * @param perms caller credentials (also recorded as the "cap dirtier")
 * @param inp optional out-param for the post-request inode
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;   // accumulates values for a sync MDS request
  bool kill_sguid = false;            // must we strip setuid/setgid locally?
  int inode_drop = 0;                 // caps to drop if we do go to the MDS

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // Growing the file must not blow the quota.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // Dirty whichever exclusive cap we hold so the ctime gets flushed;
    // with no exclusive cap at all, fall through to a sync request.
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // With Ax we can clear setuid/setgid ourselves (done further below).
    kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
    inode_drop |= CEPH_CAP_AUTH_SHARED;
  }

  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      // Ax held: apply locally and mark the cap dirty.
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->uid != stx->stx_uid) {
      // Unknown or different current value: must go to the MDS.
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      // No-op change; drop the bit.
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // Only the permission bits change; preserve the file type bits.
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
	     kill_sguid && S_ISREG(in->mode) &&
	     (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
    /* Must squash the any setuid/setgid bits with an ownership change */
    in->mode &= ~(S_ISUID|S_ISGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // An extending (or no-op) truncate with Fx held — and no sguid bits
    // to strip server-side — can be done locally.
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
	!(mask & CEPH_SETATTR_KILL_SGUID) &&
	stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
	in->size = in->reported_size = stx->stx_size;
	in->cap_dirtier_uid = perms.uid();
	in->cap_dirtier_gid = perms.gid();
	in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
	mask &= ~(CEPH_SETATTR_SIZE);
	mask |= CEPH_SETATTR_MTIME;
      } else {
	// ignore it when size doesn't change
	mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_mtime) > in->mtime) {
      // Fw lets us move mtime forward locally.
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_atime) > in->atime) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      // atime changes also invalidate cached file data (Fc).
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  // Everything was satisfied locally: just bump the change attribute.
  if (!mask) {
    in->change_attr++;
    return 0;
  }

  // Some bits remain: send a synchronous SETATTR to the MDS.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7899
/* Note that we only care about attrs that setattr cares about */
/**
 * Translate a struct stat into the subset of struct ceph_statx fields
 * that the setattr paths consume (size, mode, uid, gid, mtime, atime).
 * Other statx fields are left untouched.
 */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
// Each platform exposes sub-second timestamps under a different name
// (macOS: *_timespec; Windows: seconds only; POSIX: st_mtim/st_atim).
#ifdef __APPLE__
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#elif __WIN32
  stx->stx_mtime.tv_sec = st->st_mtime;
  stx->stx_atime.tv_sec = st->st_atime;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7918
7919int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7920 const UserPerm& perms, InodeRef *inp)
7921{
7922 int ret = _do_setattr(in, stx, mask, perms, inp);
7923 if (ret < 0)
7924 return ret;
7925 if (mask & CEPH_SETATTR_MODE)
7926 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7927 return ret;
7928}
7929
7930int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7931 const UserPerm& perms)
7932{
7933 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7934 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7935 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7936 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7937 if (cct->_conf->client_permissions) {
7938 int r = may_setattr(in.get(), stx, mask, perms);
7939 if (r < 0)
7940 return r;
7941 }
7942 return __setattrx(in.get(), stx, mask, perms);
7943}
7944
7945int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7946 const UserPerm& perms)
7947{
7948 struct ceph_statx stx;
7949
7950 stat_to_statx(attr, &stx);
7951 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7952
7953 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7954 mask &= ~CEPH_SETATTR_UID;
7955 }
7956 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7957 mask &= ~CEPH_SETATTR_GID;
7958 }
7959
7c673cae
FG
7960 return _setattrx(in, &stx, mask, perms);
7961}
7962
7963int Client::setattr(const char *relpath, struct stat *attr, int mask,
7964 const UserPerm& perms)
7965{
f67539c2
TL
7966 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7967 if (!mref_reader.is_state_satisfied())
7968 return -CEPHFS_ENOTCONN;
7969
11fdf7f2 7970 tout(cct) << __func__ << std::endl;
7c673cae
FG
7971 tout(cct) << relpath << std::endl;
7972 tout(cct) << mask << std::endl;
7973
7974 filepath path(relpath);
7975 InodeRef in;
f67539c2
TL
7976
7977 std::scoped_lock lock(client_lock);
7c673cae
FG
7978 int r = path_walk(path, &in, perms);
7979 if (r < 0)
7980 return r;
7981 return _setattr(in, attr, mask, perms);
7982}
7983
7984int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7985 const UserPerm& perms, int flags)
7986{
f67539c2
TL
7987 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7988 if (!mref_reader.is_state_satisfied())
7989 return -CEPHFS_ENOTCONN;
7990
11fdf7f2 7991 tout(cct) << __func__ << std::endl;
7c673cae
FG
7992 tout(cct) << relpath << std::endl;
7993 tout(cct) << mask << std::endl;
7994
7995 filepath path(relpath);
7996 InodeRef in;
f67539c2
TL
7997
7998 std::scoped_lock lock(client_lock);
7c673cae
FG
7999 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
8000 if (r < 0)
8001 return r;
8002 return _setattrx(in, stx, mask, perms);
8003}
8004
8005int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
8006{
f67539c2
TL
8007 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8008 if (!mref_reader.is_state_satisfied())
8009 return -CEPHFS_ENOTCONN;
8010
11fdf7f2 8011 tout(cct) << __func__ << std::endl;
7c673cae
FG
8012 tout(cct) << fd << std::endl;
8013 tout(cct) << mask << std::endl;
8014
f67539c2 8015 std::scoped_lock lock(client_lock);
7c673cae
FG
8016 Fh *f = get_filehandle(fd);
8017 if (!f)
f67539c2 8018 return -CEPHFS_EBADF;
7c673cae
FG
8019#if defined(__linux__) && defined(O_PATH)
8020 if (f->flags & O_PATH)
f67539c2 8021 return -CEPHFS_EBADF;
7c673cae
FG
8022#endif
8023 return _setattr(f->inode, attr, mask, perms);
8024}
8025
8026int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
8027{
f67539c2
TL
8028 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8029 if (!mref_reader.is_state_satisfied())
8030 return -CEPHFS_ENOTCONN;
8031
11fdf7f2 8032 tout(cct) << __func__ << std::endl;
7c673cae
FG
8033 tout(cct) << fd << std::endl;
8034 tout(cct) << mask << std::endl;
8035
f67539c2 8036 std::scoped_lock lock(client_lock);
7c673cae
FG
8037 Fh *f = get_filehandle(fd);
8038 if (!f)
f67539c2 8039 return -CEPHFS_EBADF;
7c673cae
FG
8040#if defined(__linux__) && defined(O_PATH)
8041 if (f->flags & O_PATH)
f67539c2 8042 return -CEPHFS_EBADF;
7c673cae
FG
8043#endif
8044 return _setattrx(f->inode, stx, mask, perms);
8045}
8046
8047int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
8048 frag_info_t *dirstat, int mask)
8049{
f67539c2
TL
8050 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8051 if (!mref_reader.is_state_satisfied())
8052 return -CEPHFS_ENOTCONN;
8053
11fdf7f2 8054 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
8055 tout(cct) << "stat" << std::endl;
8056 tout(cct) << relpath << std::endl;
181888fb 8057
7c673cae
FG
8058 filepath path(relpath);
8059 InodeRef in;
f67539c2
TL
8060
8061 std::scoped_lock lock(client_lock);
7c673cae
FG
8062 int r = path_walk(path, &in, perms, true, mask);
8063 if (r < 0)
8064 return r;
8065 r = _getattr(in, mask, perms);
8066 if (r < 0) {
11fdf7f2 8067 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
8068 return r;
8069 }
8070 fill_stat(in, stbuf, dirstat);
11fdf7f2 8071 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
8072 return r;
8073}
8074
8075unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8076{
8077 unsigned mask = 0;
8078
2a845540
TL
8079 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8080 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
7c673cae
FG
8081 goto out;
8082
2a845540 8083 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
7c673cae
FG
8084 mask |= CEPH_CAP_PIN;
8085 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8086 mask |= CEPH_CAP_AUTH_SHARED;
8087 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8088 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 8089 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
8090 mask |= CEPH_CAP_FILE_SHARED;
8091 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8092 mask |= CEPH_CAP_XATTR_SHARED;
8093out:
8094 return mask;
8095}
8096
/**
 * statx(2) on @relpath relative to the cwd; delegates to statxat() with
 * CEPHFS_AT_FDCWD.
 */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
8103
8104int Client::lstat(const char *relpath, struct stat *stbuf,
8105 const UserPerm& perms, frag_info_t *dirstat, int mask)
8106{
f67539c2
TL
8107 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8108 if (!mref_reader.is_state_satisfied())
8109 return -CEPHFS_ENOTCONN;
8110
11fdf7f2 8111 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
11fdf7f2 8112 tout(cct) << __func__ << std::endl;
7c673cae 8113 tout(cct) << relpath << std::endl;
181888fb 8114
7c673cae
FG
8115 filepath path(relpath);
8116 InodeRef in;
f67539c2
TL
8117
8118 std::scoped_lock lock(client_lock);
7c673cae
FG
8119 // don't follow symlinks
8120 int r = path_walk(path, &in, perms, false, mask);
8121 if (r < 0)
8122 return r;
8123 r = _getattr(in, mask, perms);
8124 if (r < 0) {
11fdf7f2 8125 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
8126 return r;
8127 }
8128 fill_stat(in, stbuf, dirstat);
11fdf7f2 8129 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
8130 return r;
8131}
8132
/**
 * Populate a struct stat from the cached attributes of @in; optionally
 * copy out the directory fragment stats (@dirstat) and recursive stats
 * (@rstat).
 *
 * @return the caps currently issued on @in (callers use this to judge
 *         how fresh the returned attributes are)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapid doubles as the device id so snapshots stat differently.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directory link count is synthesized: CephFS tracks it as 0
    // (unlinked) or 1, and we expand 1 into the POSIX-style
    // 2 + number-of-subdirectories.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime is reported as the later of ctime and mtime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size: recursive bytes or entry count, per config.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    // 512-byte blocks, rounded up.
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8202
/**
 * Populate a struct ceph_statx from the cached attributes of @in.
 *
 * @mask is the cap mask that is actually valid (as produced by
 * statx_to_mask()); only fields covered by issued caps are filled in,
 * and stx_mask records which fields ended up valid.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    // Directory link count is synthesized from the subdir count, same
    // convention as fill_stat().
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory size: recursive bytes or entry count, per config.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report the later of ctime and mtime as the change time.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8285
/**
 * Promote @dn to the most-recently-used end of the dentry LRU so it
 * survives cache trimming longer.
 */
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8290
/**
 * chmod(2) on @relpath relative to the cwd (symlinks followed);
 * delegates to chmodat() with CEPHFS_AT_FDCWD.
 */
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8295
8296int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8297{
f67539c2
TL
8298 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8299 if (!mref_reader.is_state_satisfied())
8300 return -CEPHFS_ENOTCONN;
8301
11fdf7f2 8302 tout(cct) << __func__ << std::endl;
7c673cae
FG
8303 tout(cct) << fd << std::endl;
8304 tout(cct) << mode << std::endl;
181888fb 8305
f67539c2 8306 std::scoped_lock lock(client_lock);
7c673cae
FG
8307 Fh *f = get_filehandle(fd);
8308 if (!f)
f67539c2 8309 return -CEPHFS_EBADF;
7c673cae
FG
8310#if defined(__linux__) && defined(O_PATH)
8311 if (f->flags & O_PATH)
f67539c2 8312 return -CEPHFS_EBADF;
7c673cae
FG
8313#endif
8314 struct stat attr;
8315 attr.st_mode = mode;
8316 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8317}
8318
b3b6e05e
TL
8319int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8320 const UserPerm& perms) {
f67539c2 8321 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 8322 if (!mref_reader.is_state_satisfied()) {
f67539c2 8323 return -CEPHFS_ENOTCONN;
b3b6e05e 8324 }
f67539c2 8325
11fdf7f2 8326 tout(cct) << __func__ << std::endl;
b3b6e05e 8327 tout(cct) << dirfd << std::endl;
7c673cae
FG
8328 tout(cct) << relpath << std::endl;
8329 tout(cct) << mode << std::endl;
b3b6e05e 8330 tout(cct) << flags << std::endl;
181888fb 8331
7c673cae
FG
8332 filepath path(relpath);
8333 InodeRef in;
b3b6e05e 8334 InodeRef dirinode;
f67539c2
TL
8335
8336 std::scoped_lock lock(client_lock);
b3b6e05e
TL
8337 int r = get_fd_inode(dirfd, &dirinode);
8338 if (r < 0) {
8339 return r;
8340 }
8341
8342 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8343 if (r < 0) {
7c673cae 8344 return r;
b3b6e05e 8345 }
7c673cae
FG
8346 struct stat attr;
8347 attr.st_mode = mode;
8348 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8349}
8350
b3b6e05e
TL
/**
 * chmod that does not follow a trailing symlink; delegates to chmodat()
 * with AT_SYMLINK_NOFOLLOW.
 */
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8355
7c673cae
FG
/**
 * chown(2) on @relpath relative to the cwd (symlinks followed);
 * delegates to chownat() with CEPHFS_AT_FDCWD.
 */
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8361
8362int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8363{
f67539c2
TL
8364 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8365 if (!mref_reader.is_state_satisfied())
8366 return -CEPHFS_ENOTCONN;
8367
11fdf7f2 8368 tout(cct) << __func__ << std::endl;
7c673cae
FG
8369 tout(cct) << fd << std::endl;
8370 tout(cct) << new_uid << std::endl;
8371 tout(cct) << new_gid << std::endl;
181888fb 8372
f67539c2 8373 std::scoped_lock lock(client_lock);
7c673cae
FG
8374 Fh *f = get_filehandle(fd);
8375 if (!f)
f67539c2 8376 return -CEPHFS_EBADF;
7c673cae
FG
8377#if defined(__linux__) && defined(O_PATH)
8378 if (f->flags & O_PATH)
f67539c2 8379 return -CEPHFS_EBADF;
7c673cae
FG
8380#endif
8381 struct stat attr;
8382 attr.st_uid = new_uid;
8383 attr.st_gid = new_gid;
8384 int mask = 0;
8385 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8386 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8387 return _setattr(f->inode, &attr, mask, perms);
8388}
8389
/**
 * chown that does not follow a trailing symlink; delegates to chownat()
 * with AT_SYMLINK_NOFOLLOW.
 */
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8395
8396int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8397 int flags, const UserPerm& perms) {
f67539c2 8398 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 8399 if (!mref_reader.is_state_satisfied()) {
f67539c2 8400 return -CEPHFS_ENOTCONN;
b3b6e05e 8401 }
f67539c2 8402
11fdf7f2 8403 tout(cct) << __func__ << std::endl;
b3b6e05e 8404 tout(cct) << dirfd << std::endl;
7c673cae
FG
8405 tout(cct) << relpath << std::endl;
8406 tout(cct) << new_uid << std::endl;
8407 tout(cct) << new_gid << std::endl;
b3b6e05e 8408 tout(cct) << flags << std::endl;
181888fb 8409
7c673cae
FG
8410 filepath path(relpath);
8411 InodeRef in;
b3b6e05e 8412 InodeRef dirinode;
f67539c2
TL
8413
8414 std::scoped_lock lock(client_lock);
b3b6e05e
TL
8415 int r = get_fd_inode(dirfd, &dirinode);
8416 if (r < 0) {
7c673cae 8417 return r;
b3b6e05e
TL
8418 }
8419
8420 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8421 if (r < 0) {
8422 return r;
8423 }
7c673cae
FG
8424 struct stat attr;
8425 attr.st_uid = new_uid;
8426 attr.st_gid = new_gid;
b3b6e05e 8427 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
8428}
8429
11fdf7f2
TL
8430static void attr_set_atime_and_mtime(struct stat *attr,
8431 const utime_t &atime,
8432 const utime_t &mtime)
8433{
8434 stat_set_atime_sec(attr, atime.tv.tv_sec);
8435 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8436 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8437 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8438}
8439
8440// for [l]utime() invoke the timeval variant as the timespec
8441// variant are not yet implemented. for futime[s](), invoke
8442// the timespec variant.
7c673cae
FG
8443int Client::utime(const char *relpath, struct utimbuf *buf,
8444 const UserPerm& perms)
8445{
11fdf7f2
TL
8446 struct timeval tv[2];
8447 tv[0].tv_sec = buf->actime;
8448 tv[0].tv_usec = 0;
8449 tv[1].tv_sec = buf->modtime;
8450 tv[1].tv_usec = 0;
8451
8452 return utimes(relpath, tv, perms);
8453}
8454
8455int Client::lutime(const char *relpath, struct utimbuf *buf,
8456 const UserPerm& perms)
8457{
8458 struct timeval tv[2];
8459 tv[0].tv_sec = buf->actime;
8460 tv[0].tv_usec = 0;
8461 tv[1].tv_sec = buf->modtime;
8462 tv[1].tv_usec = 0;
8463
8464 return lutimes(relpath, tv, perms);
8465}
8466
8467int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8468{
8469 struct timespec ts[2];
8470 ts[0].tv_sec = buf->actime;
8471 ts[0].tv_nsec = 0;
8472 ts[1].tv_sec = buf->modtime;
8473 ts[1].tv_nsec = 0;
8474
8475 return futimens(fd, ts, perms);
8476}
8477
8478int Client::utimes(const char *relpath, struct timeval times[2],
8479 const UserPerm& perms)
8480{
f67539c2
TL
8481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8482 if (!mref_reader.is_state_satisfied())
8483 return -CEPHFS_ENOTCONN;
8484
11fdf7f2 8485 tout(cct) << __func__ << std::endl;
7c673cae 8486 tout(cct) << relpath << std::endl;
11fdf7f2
TL
8487 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8488 << std::endl;
8489 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8490 << std::endl;
181888fb 8491
7c673cae
FG
8492 filepath path(relpath);
8493 InodeRef in;
f67539c2
TL
8494
8495 std::scoped_lock lock(client_lock);
7c673cae
FG
8496 int r = path_walk(path, &in, perms);
8497 if (r < 0)
8498 return r;
8499 struct stat attr;
11fdf7f2
TL
8500 utime_t atime(times[0]);
8501 utime_t mtime(times[1]);
8502
8503 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
8504 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8505}
8506
11fdf7f2
TL
8507int Client::lutimes(const char *relpath, struct timeval times[2],
8508 const UserPerm& perms)
7c673cae 8509{
f67539c2
TL
8510 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8511 if (!mref_reader.is_state_satisfied())
8512 return -CEPHFS_ENOTCONN;
8513
11fdf7f2 8514 tout(cct) << __func__ << std::endl;
7c673cae 8515 tout(cct) << relpath << std::endl;
11fdf7f2
TL
8516 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8517 << std::endl;
8518 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8519 << std::endl;
181888fb 8520
7c673cae
FG
8521 filepath path(relpath);
8522 InodeRef in;
f67539c2
TL
8523
8524 std::scoped_lock lock(client_lock);
7c673cae
FG
8525 int r = path_walk(path, &in, perms, false);
8526 if (r < 0)
8527 return r;
8528 struct stat attr;
11fdf7f2
TL
8529 utime_t atime(times[0]);
8530 utime_t mtime(times[1]);
8531
8532 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
8533 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8534}
8535
11fdf7f2
TL
8536int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8537{
8538 struct timespec ts[2];
8539 ts[0].tv_sec = times[0].tv_sec;
8540 ts[0].tv_nsec = times[0].tv_usec * 1000;
8541 ts[1].tv_sec = times[1].tv_sec;
8542 ts[1].tv_nsec = times[1].tv_usec * 1000;
8543
8544 return futimens(fd, ts, perms);
8545}
8546
/*
 * Set atime/mtime (nanosecond resolution) on an open file descriptor.
 * Returns 0 on success or a negative CEPHFS_* error.
 */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  // Only usable while the client is mounted (or mounting).
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // An fd opened with O_PATH is rejected here with EBADF.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8575
b3b6e05e
TL
/*
 * utimensat(2)-style: set atime/mtime on a path resolved relative to
 * dirfd (or the CWD sentinel). 'flags' carries AT_* flags.
 * Returns 0 or a negative CEPHFS_* error.
 */
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
                      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  // Resolve dirfd to the directory inode the relative path is walked from.
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  // NOTE(review): 'flags' holds AT_* flags here, yet it is tested against
  // O_PATH (an open(2) flag) — presumably to mirror futimens(); confirm
  // this check is intentional.
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  // Follow a trailing symlink unless AT_SYMLINK_NOFOLLOW was given.
  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8619
7c673cae
FG
8620int Client::flock(int fd, int operation, uint64_t owner)
8621{
f67539c2
TL
8622 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8623 if (!mref_reader.is_state_satisfied())
8624 return -CEPHFS_ENOTCONN;
8625
11fdf7f2 8626 tout(cct) << __func__ << std::endl;
7c673cae
FG
8627 tout(cct) << fd << std::endl;
8628 tout(cct) << operation << std::endl;
8629 tout(cct) << owner << std::endl;
181888fb 8630
f67539c2 8631 std::scoped_lock lock(client_lock);
7c673cae
FG
8632 Fh *f = get_filehandle(fd);
8633 if (!f)
f67539c2 8634 return -CEPHFS_EBADF;
7c673cae
FG
8635
8636 return _flock(f, operation, owner);
8637}
8638
8639int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8640{
f67539c2
TL
8641 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8642 if (!mref_reader.is_state_satisfied())
8643 return -CEPHFS_ENOTCONN;
8644
11fdf7f2 8645 tout(cct) << __func__ << std::endl;
7c673cae 8646 tout(cct) << relpath << std::endl;
181888fb 8647
7c673cae
FG
8648 filepath path(relpath);
8649 InodeRef in;
f67539c2
TL
8650
8651 std::scoped_lock lock(client_lock);
7c673cae
FG
8652 int r = path_walk(path, &in, perms, true);
8653 if (r < 0)
8654 return r;
8655 if (cct->_conf->client_permissions) {
8656 int r = may_open(in.get(), O_RDONLY, perms);
8657 if (r < 0)
8658 return r;
8659 }
8660 r = _opendir(in.get(), dirpp, perms);
8661 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
f67539c2
TL
8662 if (r != -CEPHFS_ENOTDIR)
8663 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
8664 return r;
8665}
8666
b3b6e05e
TL
8667int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8668 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8669 if (!mref_reader.is_state_satisfied()) {
8670 return -CEPHFS_ENOTCONN;
8671 }
8672
8673 tout(cct) << __func__ << std::endl;
8674 tout(cct) << dirfd << std::endl;
8675
8676 InodeRef dirinode;
8677 std::scoped_lock locker(client_lock);
8678 int r = get_fd_inode(dirfd, &dirinode);
8679 if (r < 0) {
8680 return r;
8681 }
8682
8683 if (cct->_conf->client_permissions) {
8684 r = may_open(dirinode.get(), O_RDONLY, perms);
8685 if (r < 0) {
8686 return r;
8687 }
8688 }
8689 r = _opendir(dirinode.get(), dirpp, perms);
8690 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8691 if (r != -CEPHFS_ENOTDIR) {
8692 tout(cct) << (uintptr_t)*dirpp << std::endl;
8693 }
8694 return r;
8695}
8696
7c673cae
FG
8697int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8698{
8699 if (!in->is_dir())
f67539c2 8700 return -CEPHFS_ENOTDIR;
7c673cae
FG
8701 *dirpp = new dir_result_t(in, perms);
8702 opened_dirs.insert(*dirpp);
11fdf7f2 8703 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
8704 return 0;
8705}
8706
8707
// Close a directory handle previously returned by opendir()/fdopendir().
// Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
8718
// Tear down a dir_result_t: release its inode reference, drop any
// buffered dentries, unregister it from opened_dirs and free it.
// Called with client_lock held (see closedir()).
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
8731
8732void Client::rewinddir(dir_result_t *dirp)
8733{
11fdf7f2 8734 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb 8735
f67539c2
TL
8736 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8737 if (!mref_reader.is_state_satisfied())
181888fb
FG
8738 return;
8739
f67539c2 8740 std::scoped_lock lock(client_lock);
7c673cae
FG
8741 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8742 _readdir_drop_dirp_buffer(d);
8743 d->reset();
8744}
8745
8746loff_t Client::telldir(dir_result_t *dirp)
8747{
8748 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 8749 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
8750 return d->offset;
8751}
8752
/*
 * Seek a directory stream to 'offset' (a value previously returned by
 * telldir()). Invalidates the dentry buffer when the seek leaves the
 * currently buffered fragment or moves backwards within it.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0; // bump if we do a forward seek
  else
    dirp->ordered_count = 0; // disable filling readdir cache

  if (dirp->hash_order()) {
    // In hash order only a backward seek invalidates the buffer.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Drop the buffer when rewinding to the start, landing in a different
    // frag, or seeking backwards within the current frag.
    if (offset == 0 ||
        dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
        dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
8787
8788
8789//struct dirent {
8790// ino_t d_ino; /* inode number */
8791// off_t d_off; /* offset to the next dirent */
8792// unsigned short d_reclen; /* length of this record */
8793// unsigned char d_type; /* type of file */
8794// char d_name[256]; /* filename */
8795//};
/*
 * Populate a struct dirent from a name/type/ino triple. 'next_off' is
 * the offset of the *next* entry (stored in d_off where available).
 * Names longer than 255 bytes are truncated; d_name is always terminated.
 */
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  // NOTE(review): d_reclen is a fixed 1 rather than the real record
  // length — presumably no consumer relies on it; confirm before changing.
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
                 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
8811
/*
 * Advance the stream to the next directory fragment, or mark
 * end-of-directory if the current fragment is the rightmost one.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Position at entry 2 of the new frag (0/1 are "." and "..") and
    // re-resolve against the dirfragtree in case it changed.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
8837
/*
 * Re-resolve the stream's current fragment against the (possibly
 * updated) dirfragtree; if the frag moved, reposition at its start.
 * No-op for hash-ordered streams.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8854
// Discard the stream's buffered dentries; the next readdir will
// refetch the fragment from the MDS.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8860
8861int Client::_readdir_get_frag(dir_result_t *dirp)
8862{
11fdf7f2
TL
8863 ceph_assert(dirp);
8864 ceph_assert(dirp->inode);
7c673cae
FG
8865
8866 // get the current frag.
8867 frag_t fg;
8868 if (dirp->hash_order())
8869 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8870 else
8871 fg = frag_t(dirp->offset_high());
8872
11fdf7f2 8873 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7c673cae
FG
8874 << " offset " << hex << dirp->offset << dec << dendl;
8875
8876 int op = CEPH_MDS_OP_READDIR;
8877 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8878 op = CEPH_MDS_OP_LSSNAP;
8879
8880 InodeRef& diri = dirp->inode;
8881
8882 MetaRequest *req = new MetaRequest(op);
8883 filepath path;
8884 diri->make_nosnap_relative_path(path);
8885 req->set_filepath(path);
8886 req->set_inode(diri.get());
8887 req->head.args.readdir.frag = fg;
8888 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8889 if (dirp->last_name.length()) {
94b18763 8890 req->path2.set_path(dirp->last_name);
7c673cae
FG
8891 } else if (dirp->hash_order()) {
8892 req->head.args.readdir.offset_hash = dirp->offset_high();
8893 }
8894 req->dirp = dirp;
8895
8896 bufferlist dirbl;
8897 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8898
f67539c2 8899 if (res == -CEPHFS_EAGAIN) {
11fdf7f2 8900 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7c673cae
FG
8901 _readdir_rechoose_frag(dirp);
8902 return _readdir_get_frag(dirp);
8903 }
8904
8905 if (res == 0) {
11fdf7f2 8906 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7c673cae
FG
8907 << " size " << dirp->buffer.size() << dendl;
8908 } else {
11fdf7f2 8909 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7c673cae
FG
8910 dirp->set_end();
8911 }
8912
8913 return res;
8914}
8915
// Comparator for std::lower_bound over a Dir's readdir_cache: orders
// cached dentries by their frag-aware readdir offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8921
8922int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8923 int caps, bool getref)
8924{
f67539c2 8925 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11fdf7f2 8926 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7c673cae
FG
8927 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8928 << dendl;
8929 Dir *dir = dirp->inode->dir;
8930
8931 if (!dir) {
8932 ldout(cct, 10) << " dir is empty" << dendl;
8933 dirp->set_end();
8934 return 0;
8935 }
8936
8937 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8938 dir->readdir_cache.end(),
8939 dirp->offset, dentry_off_lt());
8940
8941 string dn_name;
8942 while (true) {
adb31ebb 8943 int mask = caps;
7c673cae 8944 if (!dirp->inode->is_complete_and_ordered())
f67539c2 8945 return -CEPHFS_EAGAIN;
7c673cae
FG
8946 if (pd == dir->readdir_cache.end())
8947 break;
8948 Dentry *dn = *pd;
8949 if (dn->inode == NULL) {
8950 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8951 ++pd;
8952 continue;
8953 }
8954 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8955 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8956 ++pd;
8957 continue;
8958 }
8959
92f5a8d4 8960 int idx = pd - dir->readdir_cache.begin();
adb31ebb
TL
8961 if (dn->inode->is_dir()) {
8962 mask |= CEPH_STAT_RSTAT;
8963 }
8964 int r = _getattr(dn->inode, mask, dirp->perms);
7c673cae
FG
8965 if (r < 0)
8966 return r;
92f5a8d4
TL
8967
8968 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8969 pd = dir->readdir_cache.begin() + idx;
8970 if (pd >= dir->readdir_cache.end() || *pd != dn)
f67539c2 8971 return -CEPHFS_EAGAIN;
7c673cae
FG
8972
8973 struct ceph_statx stx;
8974 struct dirent de;
8975 fill_statx(dn->inode, caps, &stx);
8976
8977 uint64_t next_off = dn->offset + 1;
eafe8130 8978 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7c673cae
FG
8979 ++pd;
8980 if (pd == dir->readdir_cache.end())
8981 next_off = dir_result_t::END;
8982
8983 Inode *in = NULL;
7c673cae
FG
8984 if (getref) {
8985 in = dn->inode.get();
8986 _ll_get(in);
8987 }
8988
8989 dn_name = dn->name; // fill in name while we have lock
8990
9f95a23c 8991 client_lock.unlock();
7c673cae 8992 r = cb(p, &de, &stx, next_off, in); // _next_ offset
9f95a23c 8993 client_lock.lock();
7c673cae
FG
8994 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8995 << " = " << r << dendl;
8996 if (r < 0) {
8997 return r;
8998 }
8999
9000 dirp->offset = next_off;
9001 if (dirp->at_end())
9002 dirp->next_offset = 2;
9003 else
9004 dirp->next_offset = dirp->offset_low();
9005 dirp->last_name = dn_name; // we successfully returned this one; update!
28e407b8 9006 dirp->release_count = 0; // last_name no longer match cache index
7c673cae
FG
9007 if (r > 0)
9008 return r;
9009 }
9010
11fdf7f2 9011 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7c673cae
FG
9012 dirp->set_end();
9013 return 0;
9014}
9015
/*
 * Core readdir loop: invoke cb(p, dirent, statx, next_off, inode) for
 * each entry of the stream. Synthesizes "." and ".." first, then serves
 * from the local dentry cache when the directory is complete+ordered
 * (falling back on -CEPHFS_EAGAIN), otherwise fetches fragments from
 * the MDS. If cb returns non-zero, iteration stops and the value is
 * propagated. Returns 0 at end-of-directory or a negative error.
 * client_lock is dropped around every cb invocation; when getref is set,
 * each delivered inode carries an extra ll reference.
 */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
                         unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
                 << dec << " at_end=" << dirp->at_end()
                 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Offset 0: synthesize the "." entry.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Offset 1: synthesize the ".." entry (the dir itself when unlinked).
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
           << " snapid " << dirp->inode->snapid << " (complete && ordered) "
           << dirp->inode->is_complete_and_ordered()
           << " issued " << ccap_string(dirp->inode->caps_issued())
           << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -CEPHFS_EAGAIN)
      return err;
  }

  // Slow path: iterate MDS-fetched fragment buffers.
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
        return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
                   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
                                    dirp->offset, dir_result_t::dentry_off_lt());
         it != dirp->buffer.end();
         ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
        int mask = caps;
        if(entry.inode->is_dir()){
          mask |= CEPH_STAT_RSTAT;
        }
        r = _getattr(entry.inode, mask, dirp->perms);
        if (r < 0)
          return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
        inode = entry.inode.get();
        _ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
                     << " = " << r << dendl;
      if (r < 0)
        return r;

      dirp->offset = next_off;
      if (r > 0)
        return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Reached the end: if nothing changed under us, mark the directory
    // complete (and ordered) so future readdirs can use the cache.
    if (diri->shared_gen == dirp->start_shared_gen &&
        diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
        ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
        if (diri->dir) {
          ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
          diri->dir->readdir_cache.resize(dirp->cache_index);
        }
        diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
        ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
        diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9213
9214
9215int Client::readdir_r(dir_result_t *d, struct dirent *de)
9216{
9217 return readdirplus_r(d, de, 0, 0, 0, NULL);
9218}
9219
9220/*
9221 * readdirplus_r
9222 *
9223 * returns
9224 * 1 if we got a dirent
9225 * 0 for end of directory
9226 * <0 on error
9227 */
9228
// Context for the single-entry callback used by readdir()/readdirplus_r():
// the caller's output slots plus a flag noting one entry was delivered.
struct single_readdir {
  struct dirent *de;       // where to copy the dirent
  struct ceph_statx *stx;  // optional statx output (may be NULL)
  Inode *inode;            // inode handed back by the callback
  bool full;               // set once a dirent has been filled
};
9235
9236static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9237 struct ceph_statx *stx, off_t off,
9238 Inode *in)
9239{
9240 single_readdir *c = static_cast<single_readdir *>(p);
9241
9242 if (c->full)
9243 return -1; // already filled this dirent
9244
9245 *c->de = *de;
9246 if (c->stx)
9247 *c->stx = *stx;
9248 c->inode = in;
9249 c->full = true;
9250 return 1;
9251}
9252
9253struct dirent *Client::readdir(dir_result_t *d)
9254{
9255 int ret;
f91f0fd5 9256 auto& de = d->de;
7c673cae
FG
9257 single_readdir sr;
9258 sr.de = &de;
9259 sr.stx = NULL;
9260 sr.inode = NULL;
9261 sr.full = false;
9262
9263 // our callback fills the dirent and sets sr.full=true on first
9264 // call, and returns -1 the second time around.
9265 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9266 if (ret < -1) {
9267 errno = -ret; // this sucks.
9268 return (dirent *) NULL;
9269 }
9270 if (sr.full) {
9271 return &de;
9272 }
9273 return (dirent *) NULL;
9274}
9275
9276int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9277 struct ceph_statx *stx, unsigned want,
9278 unsigned flags, Inode **out)
9279{
9280 single_readdir sr;
9281 sr.de = de;
9282 sr.stx = stx;
9283 sr.inode = NULL;
9284 sr.full = false;
9285
9286 // our callback fills the dirent and sets sr.full=true on first
9287 // call, and returns -1 the second time around.
9288 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9289 if (r < -1)
9290 return r;
9291 if (out)
9292 *out = sr.inode;
9293 if (sr.full)
9294 return 1;
9295 return 0;
9296}
9297
9298
/* getdents */
// Accumulator for _getdents(): packs entries into a caller-supplied
// byte buffer, either as whole struct dirents or as bare names.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy struct dirent; false: copy just d_name
};
9306
9307static int _readdir_getdent_cb(void *p, struct dirent *de,
9308 struct ceph_statx *stx, off_t off, Inode *in)
9309{
9310 struct getdents_result *c = static_cast<getdents_result *>(p);
9311
9312 int dlen;
9313 if (c->fullent)
9314 dlen = sizeof(*de);
9315 else
9316 dlen = strlen(de->d_name) + 1;
9317
9318 if (c->pos + dlen > c->buflen)
9319 return -1; // doesn't fit
9320
9321 if (c->fullent) {
9322 memcpy(c->buf + c->pos, de, sizeof(*de));
9323 } else {
9324 memcpy(c->buf + c->pos, de->d_name, dlen);
9325 }
9326 c->pos += dlen;
9327 return 0;
9328}
9329
9330int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9331{
9332 getdents_result gr;
9333 gr.buf = buf;
9334 gr.buflen = buflen;
9335 gr.fullent = fullent;
9336 gr.pos = 0;
9337
9338 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9339
9340 if (r < 0) { // some error
9341 if (r == -1) { // buffer ran out of space
9342 if (gr.pos) { // but we got some entries already!
9343 return gr.pos;
9344 } // or we need a larger buffer
f67539c2 9345 return -CEPHFS_ERANGE;
7c673cae
FG
9346 } else { // actual error, return it
9347 return r;
9348 }
9349 }
9350 return gr.pos;
9351}
9352
9353
/* getdir */
// Accumulator for getdir(): collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // caller's output list of names
  int num;                 // number of entries appended
};
9359
9360static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9361{
9362 getdir_result *r = static_cast<getdir_result *>(p);
9363
9364 r->contents->push_back(de->d_name);
9365 r->num++;
9366 return 0;
9367}
9368
9369int Client::getdir(const char *relpath, list<string>& contents,
9370 const UserPerm& perms)
9371{
9372 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
f67539c2
TL
9373 tout(cct) << "getdir" << std::endl;
9374 tout(cct) << relpath << std::endl;
7c673cae
FG
9375
9376 dir_result_t *d;
9377 int r = opendir(relpath, &d, perms);
9378 if (r < 0)
9379 return r;
9380
9381 getdir_result gr;
9382 gr.contents = &contents;
9383 gr.num = 0;
9384 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9385
9386 closedir(d);
9387
9388 if (r < 0)
9389 return r;
9390 return gr.num;
9391}
9392
9393
9394/****** file i/o **********/
f67539c2 9395
b3b6e05e 9396// common parts for open and openat. call with client_lock locked.
20effc67 9397int Client::create_and_open(int dirfd, const char *relpath, int flags,
b3b6e05e
TL
9398 const UserPerm& perms, mode_t mode, int stripe_unit,
9399 int stripe_count, int object_size, const char *data_pool,
9400 std::string alternate_name) {
9401 ceph_assert(ceph_mutex_is_locked(client_lock));
f91f0fd5 9402 int cflags = ceph_flags_sys2wire(flags);
f91f0fd5 9403 tout(cct) << cflags << std::endl;
7c673cae
FG
9404
9405 Fh *fh = NULL;
9406
9407#if defined(__linux__) && defined(O_PATH)
9408 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9409 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9410 * in kernel (fs/open.c). */
9411 if (flags & O_PATH)
9412 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9413#endif
9414
9415 filepath path(relpath);
9416 InodeRef in;
9417 bool created = false;
9418 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9419 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
f91f0fd5
TL
9420 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9421
b3b6e05e 9422 InodeRef dirinode = nullptr;
20effc67
TL
9423 int r = get_fd_inode(dirfd, &dirinode);
9424 if (r < 0) {
9425 return r;
b3b6e05e 9426 }
7c673cae 9427
20effc67 9428 r = path_walk(path, &in, perms, followsym, mask, dirinode);
7c673cae 9429 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 9430 return -CEPHFS_EEXIST;
7c673cae
FG
9431
9432#if defined(__linux__) && defined(O_PATH)
9433 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9434#else
b3b6e05e 9435 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
7c673cae 9436#endif
f67539c2 9437 return -CEPHFS_ELOOP;
7c673cae 9438
f67539c2 9439 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
7c673cae
FG
9440 filepath dirpath = path;
9441 string dname = dirpath.last_dentry();
9442 dirpath.pop_dentry();
9443 InodeRef dir;
9444 r = path_walk(dirpath, &dir, perms, true,
b3b6e05e
TL
9445 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9446 if (r < 0) {
7c673cae 9447 goto out;
b3b6e05e 9448 }
7c673cae
FG
9449 if (cct->_conf->client_permissions) {
9450 r = may_create(dir.get(), perms);
9451 if (r < 0)
b3b6e05e 9452 goto out;
7c673cae
FG
9453 }
9454 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
f67539c2
TL
9455 stripe_count, object_size, data_pool, &created, perms,
9456 std::move(alternate_name));
7c673cae
FG
9457 }
9458 if (r < 0)
9459 goto out;
9460
9461 if (!created) {
9462 // posix says we can only check permissions of existing files
9463 if (cct->_conf->client_permissions) {
9464 r = may_open(in.get(), flags, perms);
9465 if (r < 0)
b3b6e05e 9466 goto out;
7c673cae
FG
9467 }
9468 }
9469
9470 if (!fh)
9471 r = _open(in.get(), flags, mode, &fh, perms);
9472 if (r >= 0) {
9473 // allocate a integer file descriptor
11fdf7f2 9474 ceph_assert(fh);
7c673cae 9475 r = get_fd();
11fdf7f2 9476 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
9477 fd_map[r] = fh;
9478 }
9479
9480 out:
b3b6e05e
TL
9481 return r;
9482}
9483
9484int Client::open(const char *relpath, int flags, const UserPerm& perms,
9485 mode_t mode, int stripe_unit, int stripe_count,
9486 int object_size, const char *data_pool, std::string alternate_name)
9487{
9488 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9489 stripe_count, object_size, data_pool, alternate_name);
9490}
9491
b3b6e05e
TL
9492int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9493 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9494 const char *data_pool, std::string alternate_name) {
9495 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9496 if (!mref_reader.is_state_satisfied()) {
9497 return -CEPHFS_ENOTCONN;
9498 }
9499
9500 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9501 tout(cct) << dirfd << std::endl;
9502 tout(cct) << relpath << std::endl;
9503 tout(cct) << flags << std::endl;
9504 tout(cct) << mode << std::endl;
9505
9506 std::scoped_lock locker(client_lock);
9507 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9508 object_size, data_pool, alternate_name);
9509
7c673cae 9510 tout(cct) << r << std::endl;
b3b6e05e 9511 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
7c673cae
FG
9512 return r;
9513}
9514
7c673cae
FG
9515int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9516 const UserPerm& perms)
9517{
11fdf7f2 9518 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 9519
f67539c2
TL
9520 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9521 if (!mref_reader.is_state_satisfied())
9522 return -CEPHFS_ENOTCONN;
181888fb 9523
f67539c2 9524 std::scoped_lock lock(client_lock);
7c673cae
FG
9525 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9526 filepath path(ino);
9527 req->set_filepath(path);
9528
9529 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9530 char f[30];
9531 sprintf(f, "%u", h);
9532 filepath path2(dirino);
9533 path2.push_dentry(string(f));
9534 req->set_filepath2(path2);
9535
9536 int r = make_request(req, perms, NULL, NULL,
9537 rand() % mdsmap->get_num_in_mds());
11fdf7f2 9538 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
9539 return r;
9540}
9541
9542
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on the
 * resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
// Resolve a vinodeno_t (ino + snapid) into the local inode cache via the
// MDS LOOKUPINO op. On success, if `inode` is non-NULL, *inode receives
// the cached Inode* with an ll reference taken (_ll_get), so the caller
// owns a ref it must later drop. Caller must hold client_lock.
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Reserved inode numbers can never be resolved by the MDS.
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  // Any in-MDS can service a LOOKUPINO; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // A successful reply must have inserted the inode into inode_map.
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);  // hand the caller a referenced pointer
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
9583
1adf2230
AA
9584int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9585{
f67539c2
TL
9586 vinodeno_t vino(ino, CEPH_NOSNAP);
9587 std::scoped_lock lock(client_lock);
9588 return _lookup_vino(vino, perms, inode);
1adf2230 9589}
7c673cae
FG
9590
9591/**
9592 * Find the parent inode of `ino` and insert it into
9593 * our cache. Conditionally also set `parent` to a referenced
9594 * Inode* if caller provides non-NULL value.
9595 */
1adf2230 9596int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 9597{
11fdf7f2 9598 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9599
7c673cae
FG
9600 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9601 filepath path(ino->ino);
9602 req->set_filepath(path);
9603
9604 InodeRef target;
9605 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9606 // Give caller a reference to the parent ino if they provided a pointer.
9607 if (parent != NULL) {
9608 if (r == 0) {
9609 *parent = target.get();
9610 _ll_get(*parent);
11fdf7f2 9611 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
9612 } else {
9613 *parent = NULL;
9614 }
9615 }
11fdf7f2 9616 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9617 return r;
9618}
9619
7c673cae
FG
9620/**
9621 * Populate the parent dentry for `ino`, provided it is
9622 * a child of `parent`.
9623 */
1adf2230 9624int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 9625{
11fdf7f2
TL
9626 ceph_assert(parent->is_dir());
9627 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9628
f67539c2
TL
9629 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9630 if (!mref_reader.is_state_satisfied())
9631 return -CEPHFS_ENOTCONN;
181888fb 9632
7c673cae
FG
9633 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9634 req->set_filepath2(filepath(parent->ino));
9635 req->set_filepath(filepath(ino->ino));
9636 req->set_inode(ino);
9637
9638 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 9639 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9640 return r;
9641}
9642
1adf2230
AA
9643int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9644{
f67539c2 9645 std::scoped_lock lock(client_lock);
1adf2230
AA
9646 return _lookup_name(ino, parent, perms);
9647}
7c673cae 9648
11fdf7f2 9649Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 9650{
11fdf7f2 9651 ceph_assert(in);
f6b5b4d7 9652 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
7c673cae 9653
11fdf7f2 9654 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
9655
9656 if (in->snapid != CEPH_NOSNAP) {
9657 in->snap_cap_refs++;
9658 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9659 << ccap_string(in->caps_issued()) << dendl;
9660 }
9661
11fdf7f2 9662 const auto& conf = cct->_conf;
7c673cae
FG
9663 f->readahead.set_trigger_requests(1);
9664 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9665 uint64_t max_readahead = Readahead::NO_LIMIT;
9666 if (conf->client_readahead_max_bytes) {
11fdf7f2 9667 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
9668 }
9669 if (conf->client_readahead_max_periods) {
11fdf7f2 9670 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
9671 }
9672 f->readahead.set_max_readahead_size(max_readahead);
9673 vector<uint64_t> alignments;
9674 alignments.push_back(in->layout.get_period());
9675 alignments.push_back(in->layout.stripe_unit);
9676 f->readahead.set_alignments(alignments);
9677
9678 return f;
9679}
9680
// Tear down a file handle: drop the inode open ref (or snap ref for
// snapshot inodes), release delegations and file locks, and surface any
// asynchronous error (e.g. from background flushes) as the return value.
// Caller must hold client_lock.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // return any delegation held through this handle
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot handles only carry a snap_cap_refs count (see _create_fh)
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  // drop our reference; the Fh is deleted once the last ref is gone
  _put_fh(f);

  return err;
}
9715
9716void Client::_put_fh(Fh *f)
9717{
9718 int left = f->put();
9719 if (!left) {
9720 delete f;
9721 }
9722}
9723
// Core open path: obtain (or verify) the capabilities needed for the
// requested open mode, sending an MDS OPEN request when the needed caps
// are not already issued, and on success create an Fh via _create_fh().
// Write-ish opens of snapshot inodes are rejected with -CEPHFS_EROFS.
// Caller must hold client_lock.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // already hold all the caps this mode needs (and no truncate):
    // no MDS round trip required.
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    // need an explicit MDS OPEN to get the caps (and/or perform O_TRUNC)
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;  // creation handled elsewhere
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary stack Fh just to drive get_caps(); never registered
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
      } else {
	// we only needed to wait for the caps, not hold them
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above on failure
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
9804
9805int Client::_renew_caps(Inode *in)
9806{
9807 int wanted = in->caps_file_wanted();
9808 if (in->is_any_caps() &&
9809 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9810 check_caps(in, CHECK_CAPS_NODELAY);
9811 return 0;
9812 }
9813
9814 int flags = 0;
9815 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9816 flags = O_RDWR;
9817 else if (wanted & CEPH_CAP_FILE_RD)
9818 flags = O_RDONLY;
9819 else if (wanted & CEPH_CAP_FILE_WR)
9820 flags = O_WRONLY;
9821
9822 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9823 filepath path;
9824 in->make_nosnap_relative_path(path);
9825 req->set_filepath(path);
9826 req->head.args.open.flags = flags;
9827 req->head.args.open.pool = -1;
9828 if (cct->_conf->client_debug_getattr_caps)
9829 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9830 else
9831 req->head.args.open.mask = 0;
9832 req->set_inode(in);
9833
9834 // duplicate in case Cap goes away; not sure if that race is a concern?
9835 const UserPerm *pperm = in->get_best_perms();
9836 UserPerm perms;
9837 if (pperm != NULL)
9838 perms = *pperm;
9839 int ret = make_request(req, perms);
9840 return ret;
9841}
9842
b3b6e05e 9843int Client::_close(int fd)
7c673cae
FG
9844{
9845 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
7c673cae
FG
9846 tout(cct) << "close" << std::endl;
9847 tout(cct) << fd << std::endl;
9848
9849 Fh *fh = get_filehandle(fd);
9850 if (!fh)
f67539c2 9851 return -CEPHFS_EBADF;
7c673cae
FG
9852 int err = _release_fh(fh);
9853 fd_map.erase(fd);
9854 put_fd(fd);
9855 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9856 return err;
9857}
9858
b3b6e05e
TL
9859int Client::close(int fd) {
9860 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9861 if (!mref_reader.is_state_satisfied())
9862 return -CEPHFS_ENOTCONN;
9863
9864 std::scoped_lock lock(client_lock);
9865 return _close(fd);
9866}
7c673cae
FG
9867
9868// ------------
9869// read, write
9870
9871loff_t Client::lseek(int fd, loff_t offset, int whence)
9872{
f67539c2
TL
9873 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9874 if (!mref_reader.is_state_satisfied())
9875 return -CEPHFS_ENOTCONN;
9876
7c673cae
FG
9877 tout(cct) << "lseek" << std::endl;
9878 tout(cct) << fd << std::endl;
9879 tout(cct) << offset << std::endl;
9880 tout(cct) << whence << std::endl;
9881
f67539c2 9882 std::scoped_lock lock(client_lock);
7c673cae
FG
9883 Fh *f = get_filehandle(fd);
9884 if (!f)
f67539c2 9885 return -CEPHFS_EBADF;
7c673cae
FG
9886#if defined(__linux__) && defined(O_PATH)
9887 if (f->flags & O_PATH)
f67539c2 9888 return -CEPHFS_EBADF;
7c673cae
FG
9889#endif
9890 return _lseek(f, offset, whence);
9891}
9892
// Reposition the handle's file offset. For whence values that depend on
// the current file size (SEEK_END, SEEK_DATA, SEEK_HOLE) the size is
// refreshed from the MDS first via _getattr. SEEK_DATA/SEEK_HOLE are
// only compiled in where the platform defines them; this client treats
// files as fully-data (no holes), so SEEK_DATA returns the offset itself
// and SEEK_HOLE returns the file size. Caller must hold client_lock.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // First pass: decide whether this whence needs an up-to-date size.
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  // Second pass: compute the new position.
  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // offset past EOF has no data to find
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    // the only "hole" is the implicit one at EOF
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -CEPHFS_EINVAL;
  }

  // A computed position before the start of the file is invalid.
  if (pos < 0) {
    return -CEPHFS_EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9966
9967
// Acquire exclusive use of the handle's file position (f->pos), used by
// reads/writes at offset -1 and O_APPEND. If the position is already
// locked (or others are queued), park on a condition variable in FIFO
// order; each waiter proceeds only when it is at the front of the queue.
// Caller must hold client_lock; pair with unlock_fh_pos().
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Adopt the already-held client_lock so cond.wait() can release and
    // re-acquire it; release() afterwards so ownership stays with caller.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // proceed only when unlocked AND we are the oldest waiter
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9988
9989void Client::unlock_fh_pos(Fh *f)
9990{
f67539c2
TL
9991 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9992
11fdf7f2 9993 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae 9994 f->pos_locked = false;
f67539c2
TL
9995 if (!f->pos_waiters.empty()) {
9996 // only wake up the oldest waiter
9997 auto cond = f->pos_waiters.front();
9998 cond->notify_one();
9999 }
7c673cae
FG
10000}
10001
// Migrate an inode's inline data out to its first RADOS object.
// Two mutations are issued: first an idempotent object create, then a
// guarded write (cmpxattr on "inline_version" ensures we never clobber a
// newer uninline) that stores the data and records the inline version in
// an xattr. `onfinish` fires when the second mutation completes; if the
// inode has no inline data it is completed immediately with 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object name of the first stripe object: <ino in hex>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // Step 1: make sure the object exists (create(false) == non-exclusive).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Step 2: guarded write of the inline payload.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
10046
10047//
10048
10049// blocking osd interface
10050
// Public read(): read up to `size` bytes at `offset` (or at the handle's
// current position when offset < 0, handled by _read) into `buf`.
// Returns bytes read or a negative CEPHFS_* error.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::unique_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out to the caller's buffer without holding client_lock
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
10082
10083int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10084{
10085 if (iovcnt < 0)
f67539c2 10086 return -CEPHFS_EINVAL;
7c673cae
FG
10087 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10088}
10089
// Core read path. Resolves offset -1 to the (locked) handle position,
// acquires FILE_RD caps, serves inline data directly when possible
// (kicking off an uninline when the data must come from RADOS), and
// otherwise dispatches to the cached (_read_async) or sync (_read_sync)
// path. Retries after a size re-check on short sync reads. Returns bytes
// read or a negative CEPHFS_* error. Caller must hold client_lock.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;                       // true if we must advance f->pos
  std::unique_ptr<C_SaferCond> onuninline;    // set when an uninline was started
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // read at the handle's current position; hold the pos lock until done
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it from the MDS
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the object cache
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve from inline data without CACHE; push it to RADOS
      // first and fall through to the normal read paths below
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read straight from the inline blob, zero-filling
      // between inline length and EOF
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        rc = endoff - offset;
      } else {
        rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // cached path via the object cacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    // sync path straight to the OSDs
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // short read: the file may have grown; re-check size and retry
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;

  ++nr_read_request;
  update_io_stat_read(lat);

done:
  // done!
  if (onuninline) {
    // wait for the uninline started above; drop client_lock while blocked
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      // data now lives in RADOS; clear the inline copy and dirty caps
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
10242
// Completion for a background readahead: pin the Fh and count the
// readahead as pending for the lifetime of this context.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
10248
// Undo the pending count and Fh pin taken in the constructor.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10253
// Runs when the readahead I/O completes: release the cap refs taken when
// the readahead was issued (see _read_async) and account the bytes read.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10261
// Read through the object cacher, blocking (with client_lock dropped and
// a FILE_CACHE cap ref held) until the data is available, then optionally
// kick off an asynchronous readahead of the window computed by the
// handle's Readahead state. Caller must hold client_lock.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: hold a CACHE cap ref across the blocking wait so the
    // cached data can't be invalidated under us
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // fire-and-forget read into the cache; C_Readahead::finish drops
      // the cap refs taken below
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
10319
// Synchronous read path: issue reads straight to the OSDs via the Filer,
// looping until the request is satisfied. Short reads inside the known
// file size are zero-filled; a short read that may indicate EOF sets
// *checkeof so the caller (_read) can re-verify the size and retry.
// Caller must hold client_lock; it is dropped while waiting on each I/O.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: runs with client_lock *released*; mutates read/pos/left/bl
  // captured by reference from this frame.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return 0;
      }

      // possibly a racing truncate/append: let the caller re-check size
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // drop the lock only for the blocking wait/copy
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10392
10393int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10394{
f67539c2
TL
10395 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10396 if (!mref_reader.is_state_satisfied())
10397 return -CEPHFS_ENOTCONN;
10398
7c673cae
FG
10399 tout(cct) << "write" << std::endl;
10400 tout(cct) << fd << std::endl;
10401 tout(cct) << size << std::endl;
10402 tout(cct) << offset << std::endl;
10403
f67539c2 10404 std::scoped_lock lock(client_lock);
7c673cae
FG
10405 Fh *fh = get_filehandle(fd);
10406 if (!fh)
f67539c2 10407 return -CEPHFS_EBADF;
7c673cae
FG
10408#if defined(__linux__) && defined(O_PATH)
10409 if (fh->flags & O_PATH)
f67539c2 10410 return -CEPHFS_EBADF;
7c673cae 10411#endif
11fdf7f2
TL
10412 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10413 size = std::min(size, (loff_t)INT_MAX);
10414 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
10415 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10416 return r;
10417}
10418
10419int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10420{
10421 if (iovcnt < 0)
f67539c2 10422 return -CEPHFS_EINVAL;
7c673cae
FG
10423 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10424}
10425
// Shared vectored-I/O implementation. Sums the iovec lengths, optionally
// clamps the total to INT_MAX (for the int-returning public entries),
// and dispatches to _write (gather) or _read (scatter). On the read path
// the bufferlist is copied out into the iovecs with client_lock dropped.
// Caller must hold client_lock.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset,
				       bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // gather write: _write consumes the iovecs directly
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    // scatter read: read into a bufferlist, then copy out per-iovec
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // copy-out does not touch client state; do it unlocked
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
10476
11fdf7f2
TL
10477int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10478{
f67539c2
TL
10479 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10480 if (!mref_reader.is_state_satisfied())
10481 return -CEPHFS_ENOTCONN;
10482
11fdf7f2
TL
10483 tout(cct) << fd << std::endl;
10484 tout(cct) << offset << std::endl;
10485
20effc67 10486 std::scoped_lock cl(client_lock);
11fdf7f2
TL
10487 Fh *fh = get_filehandle(fd);
10488 if (!fh)
f67539c2 10489 return -CEPHFS_EBADF;
20effc67 10490 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
11fdf7f2
TL
10491}
10492
/**
 * Core write path. Called with client_lock held.
 *
 * Data may come either from a flat buffer (buf != NULL) or from an iovec
 * array (buf == NULL, iov != NULL); either way it is copied into a fresh
 * bufferlist because the write may be resubmitted asynchronously.
 *
 * Handles: max-filesize and pool-full checks, O_APPEND positioning, quota
 * enforcement, inline-data reads/uninlining, SUID/SGID clearing, and the
 * choice between the buffered (object cacher) and synchronous write paths.
 *
 * NOTE(review): cap-reference pairing (get_caps/put_cap_ref) and the
 * goto success/done flow are order-sensitive; do not reorder statements.
 *
 * @param f      writable file handle
 * @param offset file offset; negative means "use/advance the fd position"
 * @param size   number of bytes to write
 * @param buf    flat source buffer, or NULL to use iov
 * @param iov    scatter list used when buf is NULL
 * @param iovcnt number of iov entries
 * @return bytes written on success, negative CEPHFS_* error code otherwise
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;
  Inode *in = f->inode.get();

  // Refuse writes past the configured max file size, unless the file is
  // already that large.
  if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
       (uint64_t)(offset+size) > in->size ) { //exceeds filesize
      return -CEPHFS_EFBIG;
  }
  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // Remember the post-write position to install on success.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // Make sure we know whether the file holds inline data before writing.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // Acquire write caps (and AUTH_SHARED so we can inspect the mode bits).
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the buffer cache path below.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline (or we lack buffer caps): kick off
      // uninlining; completion is awaited at 'done'.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Apply the write directly to the cached inline data.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // Drop client_lock while blocked on the OSD write completion.
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  update_write_io_size(size);
  // time
  lat = ceph_clock_now();
  lat -= start;

  ++nr_write_request;
  update_io_stat_write(lat);

  // Install the advanced fd position recorded for the offset<0 case.
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  // If uninlining was started above, wait for it and fold in its result.
  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means someone else already uninlined the data.
    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
10730
10731int Client::_flush(Fh *f)
10732{
10733 Inode *in = f->inode.get();
10734 int err = f->take_async_err();
10735 if (err != 0) {
10736 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10737 << cpp_strerror(err) << dendl;
10738 } else {
10739 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10740 }
10741
10742 return err;
10743}
10744
10745int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10746{
10747 struct ceph_statx stx;
10748 stx.stx_size = length;
10749 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10750}
10751
10752int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10753{
f67539c2
TL
10754 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10755 if (!mref_reader.is_state_satisfied())
10756 return -CEPHFS_ENOTCONN;
10757
11fdf7f2 10758 tout(cct) << __func__ << std::endl;
7c673cae
FG
10759 tout(cct) << fd << std::endl;
10760 tout(cct) << length << std::endl;
10761
f67539c2 10762 std::scoped_lock lock(client_lock);
7c673cae
FG
10763 Fh *f = get_filehandle(fd);
10764 if (!f)
f67539c2 10765 return -CEPHFS_EBADF;
7c673cae
FG
10766#if defined(__linux__) && defined(O_PATH)
10767 if (f->flags & O_PATH)
f67539c2 10768 return -CEPHFS_EBADF;
7c673cae 10769#endif
adb31ebb 10770 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 10771 return -CEPHFS_EBADF;
7c673cae
FG
10772 struct stat attr;
10773 attr.st_size = length;
10774 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10775}
10776
/**
 * fsync(2)/fdatasync(2) entry point for an open fd.
 *
 * Flushes data (and metadata unless @a syncdataonly) via _fsync(), then
 * folds in any asynchronous write error recorded on the handle so the
 * caller sees failures from earlier background writeback.
 *
 * @param fd            file descriptor to sync
 * @param syncdataonly  true => fdatasync semantics (data only)
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::fsync(int fd, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles have nothing to sync.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
10813
/**
 * Flush an inode's dirty data (and, unless @a syncdataonly, its dirty
 * caps/metadata and unsafe MDS requests) and wait for everything to be
 * durable. Called with client_lock held; drops it while waiting on the
 * object cacher flush.
 *
 * @param in            inode to sync
 * @param syncdataonly  true => skip metadata (cap flush / unsafe ops) waits
 * @return 0 on success, negative error code if the data flush failed
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Buffered mode: ask the object cacher to flush and signal completion.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty caps to the MDS and remember the tid to wait on below.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync(in);

    // Waiting on the newest unsafe request suffices: they complete in order.
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  // Record fsync latency.
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10882
10883int Client::_fsync(Fh *f, bool syncdataonly)
10884{
1adf2230 10885 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
10886 return _fsync(f->inode.get(), syncdataonly);
10887}
10888
10889int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10890{
f67539c2
TL
10891 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10892 if (!mref_reader.is_state_satisfied())
10893 return -CEPHFS_ENOTCONN;
10894
7c673cae
FG
10895 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10896 tout(cct) << fd << std::endl;
10897
f67539c2 10898 std::scoped_lock lock(client_lock);
7c673cae
FG
10899 Fh *f = get_filehandle(fd);
10900 if (!f)
f67539c2 10901 return -CEPHFS_EBADF;
7c673cae
FG
10902 int r = _getattr(f->inode, mask, perms);
10903 if (r < 0)
10904 return r;
10905 fill_stat(f->inode, stbuf, NULL);
1adf2230 10906 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
10907 return r;
10908}
10909
10910int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10911 unsigned int want, unsigned int flags)
10912{
f67539c2
TL
10913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10914 if (!mref_reader.is_state_satisfied())
10915 return -CEPHFS_ENOTCONN;
10916
7c673cae
FG
10917 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10918 tout(cct) << fd << std::endl;
10919
f67539c2 10920 std::scoped_lock lock(client_lock);
7c673cae
FG
10921 Fh *f = get_filehandle(fd);
10922 if (!f)
f67539c2 10923 return -CEPHFS_EBADF;
7c673cae
FG
10924
10925 unsigned mask = statx_to_mask(flags, want);
10926
10927 int r = 0;
b3b6e05e 10928 if (mask) {
7c673cae
FG
10929 r = _getattr(f->inode, mask, perms);
10930 if (r < 0) {
10931 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10932 return r;
10933 }
10934 }
10935
10936 fill_statx(f->inode, mask, stx);
10937 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10938 return r;
10939}
10940
b3b6e05e
TL
10941int Client::statxat(int dirfd, const char *relpath,
10942 struct ceph_statx *stx, const UserPerm& perms,
10943 unsigned int want, unsigned int flags) {
10944 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10945 if (!mref_reader.is_state_satisfied()) {
10946 return -CEPHFS_ENOTCONN;
10947 }
10948
10949 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10950 tout(cct) << dirfd << std::endl;
10951 tout(cct) << relpath << std::endl;
10952
10953 unsigned mask = statx_to_mask(flags, want);
10954
10955 InodeRef dirinode;
10956 std::scoped_lock lock(client_lock);
10957 int r = get_fd_inode(dirfd, &dirinode);
10958 if (r < 0) {
10959 return r;
10960 }
10961
10962 InodeRef in;
10963 filepath path(relpath);
10964 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10965 if (r < 0) {
10966 return r;
10967 }
10968 r = _getattr(in, mask, perms);
10969 if (r < 0) {
10970 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10971 return r;
10972 }
10973
10974 fill_statx(in, mask, stx);
10975 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
10976 return r;
10977}
10978
7c673cae
FG
10979// not written yet, but i want to link!
10980
10981int Client::chdir(const char *relpath, std::string &new_cwd,
10982 const UserPerm& perms)
10983{
f67539c2
TL
10984 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10985 if (!mref_reader.is_state_satisfied())
10986 return -CEPHFS_ENOTCONN;
10987
7c673cae
FG
10988 tout(cct) << "chdir" << std::endl;
10989 tout(cct) << relpath << std::endl;
181888fb 10990
7c673cae
FG
10991 filepath path(relpath);
10992 InodeRef in;
f67539c2
TL
10993
10994 std::scoped_lock lock(client_lock);
7c673cae
FG
10995 int r = path_walk(path, &in, perms);
10996 if (r < 0)
10997 return r;
92f5a8d4
TL
10998
10999 if (!(in.get()->is_dir()))
f67539c2 11000 return -CEPHFS_ENOTDIR;
92f5a8d4 11001
7c673cae
FG
11002 if (cwd != in)
11003 cwd.swap(in);
11004 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11005
b5b8bbf5 11006 _getcwd(new_cwd, perms);
7c673cae
FG
11007 return 0;
11008}
11009
/**
 * Build the absolute path of the current working directory into @a dir by
 * walking dentries from cwd up to the mount root. If a parent link is
 * missing locally, issue a LOOKUPNAME to the MDS and restart the walk.
 * Leaves @a dir untouched if the cwd (or an ancestor) has been unlinked.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
11049
b5b8bbf5
FG
11050void Client::getcwd(string& dir, const UserPerm& perms)
11051{
f67539c2
TL
11052 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11053 if (!mref_reader.is_state_satisfied())
11054 return;
11055
11056 std::scoped_lock l(client_lock);
11057
11058 _getcwd(dir, perms);
b5b8bbf5
FG
11059}
11060
7c673cae
FG
/**
 * statfs(2)/statvfs(3) implementation.
 *
 * Fetches pool statistics from RADOS (dropping client_lock while waiting),
 * then reports either cluster-wide usage or, when the mount root carries a
 * byte quota and client_quota_df is set, the quota as the filesystem size.
 *
 * @param path   unused here; statistics are for the mounted filesystem
 * @param stbuf  output statvfs buffer (4 MiB block size)
 * @param perms  caller credentials (used for quota-root getattr)
 * @return 0 on success, negative error from the OSD stats call on failure
 */
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool, ask for that pool's stats; otherwise get
  // cluster-wide stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  // Drop the lock while blocked on the OSD stats reply.
  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);

  // get_quota_root should always give us something if client quotas are
  // enabled
  ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
11164
/**
 * Issue a file-lock operation (GETFILELOCK / SETFILELOCK) to the MDS and,
 * on success, mirror the result into the local lock-state bookkeeping.
 *
 * @param lock_type CEPH_LOCK_FCNTL (POSIX) or CEPH_LOCK_FLOCK (BSD)
 * @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @param sleep     nonzero => blocking lock request (interruptible if a
 *                  switch_interrupt_cb is registered)
 * @param fl        POSIX flock description; updated in place for GET
 * @param owner     lock owner token (top bit forced on, see below)
 * @param removing  true when called from _release_filelocks: skip recording
 *                  the lock on the Fh, since the handle is going away
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Locking is disabled on this inode after an error (e.g. lost session).
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  // Map the POSIX lock type onto the Ceph lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only a blocking SETFILELOCK that actually acquires may sleep.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or unlocked) lock returned by the MDS back
      // into the caller's struct flock.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the applied change into the inode's lock state,
      // lazily allocating the per-rule state object.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// Also record on the Fh so the lock can be dropped at close time.
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11278
/**
 * Interrupt an in-flight blocking file-lock request.
 *
 * Marks the original request aborted (so it will not be re-sent) and, if it
 * was already sent to an MDS, issues a matching *_INTR unlock so the MDS
 * cancels the wait.
 *
 * @return 0 if nothing needed to be sent, otherwise the result of the
 *         interrupt request.
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Choose the interrupt rule matching the original lock rule.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // The interrupt is itself a SETFILELOCK carrying the original arguments
  // with the rule swapped to *_INTR and the type forced to UNLOCK.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Act with the credentials of the original requester.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11311
11312void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11313{
11314 if (!in->fcntl_locks && !in->flock_locks)
11315 return;
11316
11317 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 11318 encode(nr_fcntl_locks, bl);
7c673cae 11319 if (nr_fcntl_locks) {
11fdf7f2 11320 auto &lock_state = in->fcntl_locks;
20effc67 11321 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11322 p != lock_state->held_locks.end();
11323 ++p)
11fdf7f2 11324 encode(p->second, bl);
7c673cae
FG
11325 }
11326
11327 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 11328 encode(nr_flock_locks, bl);
7c673cae 11329 if (nr_flock_locks) {
11fdf7f2 11330 auto &lock_state = in->flock_locks;
20effc67 11331 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11332 p != lock_state->held_locks.end();
11333 ++p)
11fdf7f2 11334 encode(p->second, bl);
7c673cae
FG
11335 }
11336
11fdf7f2 11337 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
11338 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11339}
11340
/**
 * Drop all locks held through a file handle when it is being closed.
 *
 * Normally each held lock is released at the MDS via an UNLOCK
 * SETFILELOCK (with removing=true so the lock is not re-recorded on the
 * dying Fh). If the inode is flagged I_ERROR_FILELOCK, the MDS-side state
 * is already gone, so locks are only removed from local state; the error
 * flag is cleared once no locks remain anywhere on the inode.
 */
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	// MDS state is unusable; just drop the local record.
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	// MDS state is unusable; just drop the local record.
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Once every lock on the inode is gone, normal locking may resume.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  // Send one UNLOCK per previously held lock, covering its exact range.
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
11399
11400void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11401 ceph_lock_state_t *lock_state)
11402{
11403 int lock_cmd;
11404 if (F_RDLCK == fl->l_type)
11405 lock_cmd = CEPH_LOCK_SHARED;
11406 else if (F_WRLCK == fl->l_type)
11407 lock_cmd = CEPH_LOCK_EXCL;
11408 else
11409 lock_cmd = CEPH_LOCK_UNLOCK;;
11410
11411 ceph_filelock filelock;
11412 filelock.start = fl->l_start;
11413 filelock.length = fl->l_len;
11414 filelock.client = 0;
11415 // see comment in _do_filelock()
11416 filelock.owner = owner | (1ULL << 63);
11417 filelock.pid = fl->l_pid;
11418 filelock.type = lock_cmd;
11419
11420 if (filelock.type == CEPH_LOCK_UNLOCK) {
11421 list<ceph_filelock> activated_locks;
11422 lock_state->remove_lock(filelock, activated_locks);
11423 } else {
11424 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 11425 ceph_assert(r);
7c673cae
FG
11426 }
11427}
11428
11429int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11430{
11431 Inode *in = fh->inode.get();
11432 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11433 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11434 return ret;
11435}
11436
11437int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11438{
11439 Inode *in = fh->inode.get();
11440 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11441 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11442 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11443 return ret;
11444}
11445
11446int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11447{
11448 Inode *in = fh->inode.get();
11449 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11450
11451 int sleep = !(cmd & LOCK_NB);
11452 cmd &= ~LOCK_NB;
11453
11454 int type;
11455 switch (cmd) {
11456 case LOCK_SH:
11457 type = F_RDLCK;
11458 break;
11459 case LOCK_EX:
11460 type = F_WRLCK;
11461 break;
11462 case LOCK_UN:
11463 type = F_UNLCK;
11464 break;
11465 default:
f67539c2 11466 return -CEPHFS_EINVAL;
7c673cae
FG
11467 }
11468
11469 struct flock fl;
11470 memset(&fl, 0, sizeof(fl));
11471 fl.l_type = type;
11472 fl.l_whence = SEEK_SET;
11473
11474 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11475 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11476 return ret;
11477}
11478
f67539c2
TL
11479int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11480 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11481 if (!mref_reader.is_state_satisfied()) {
11482 return -CEPHFS_ENOTCONN;
11483 }
11484
20effc67 11485 std::scoped_lock lock(client_lock);
f67539c2
TL
11486 InodeRef in;
11487 int r = Client::path_walk(path, &in, perms, true);
11488 if (r < 0) {
11489 return r;
11490 }
11491
11492 if (in->snapid == CEPH_NOSNAP) {
11493 return -CEPHFS_EINVAL;
11494 }
11495
11496 snap_info->id = in->snapid;
11497 snap_info->metadata = in->snap_metadata;
11498 return 0;
11499}
11500
11501int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11502{
11503 /* Since the only thing this does is wrap a call to statfs, and
11504 statfs takes a lock, it doesn't seem we have a need to split it
11505 out. */
7c673cae
FG
11506 return statfs(0, stbuf, perms);
11507}
11508
20effc67 11509void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
7c673cae
FG
11510{
11511 if (!args)
11512 return;
20effc67 11513
11fdf7f2 11514 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
11515 << " invalidate_ino_cb " << args->ino_cb
11516 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
11517 << " switch_interrupt_cb " << args->switch_intr_cb
11518 << " remount_cb " << args->remount_cb
11519 << dendl;
11520 callback_handle = args->handle;
11521 if (args->ino_cb) {
11522 ino_invalidate_cb = args->ino_cb;
11523 async_ino_invalidator.start();
11524 }
11525 if (args->dentry_cb) {
11526 dentry_invalidate_cb = args->dentry_cb;
11527 async_dentry_invalidator.start();
11528 }
11529 if (args->switch_intr_cb) {
11530 switch_interrupt_cb = args->switch_intr_cb;
11531 interrupt_finisher.start();
11532 }
11533 if (args->remount_cb) {
11534 remount_cb = args->remount_cb;
11535 remount_finisher.start();
11536 }
e306af50
TL
11537 if (args->ino_release_cb) {
11538 ino_release_cb = args->ino_release_cb;
11539 async_ino_releasor.start();
11540 }
11541 if (args->umask_cb)
11542 umask_cb = args->umask_cb;
7c673cae
FG
11543}
11544
20effc67
TL
11545// This is deprecated, use ll_register_callbacks2() instead.
11546void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11547{
11548 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11549
11550 _ll_register_callbacks(args);
11551}
11552
11553int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11554{
11555 if (is_mounting() || is_mounted() || is_unmounting())
11556 return -CEPHFS_EBUSY;
11557
11558 _ll_register_callbacks(args);
11559 return 0;
11560}
11561
1d09f67e 11562std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
7c673cae 11563{
1d09f67e 11564 std::pair <int, bool> r(0, false);
7c673cae 11565
f67539c2
TL
11566 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11567 if (!iref_reader.is_state_satisfied())
1d09f67e 11568 return std::make_pair(-CEPHFS_ENOTCONN, false);
f67539c2 11569
7c673cae
FG
11570 can_invalidate_dentries = can_invalidate;
11571
11572 if (can_invalidate_dentries) {
11fdf7f2 11573 ceph_assert(dentry_invalidate_cb);
7c673cae 11574 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11fdf7f2
TL
11575 } else {
11576 ceph_assert(remount_cb);
7c673cae 11577 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 11578 r = _do_remount(false);
b32b8144 11579 }
11fdf7f2 11580
7c673cae
FG
11581 return r;
11582}
11583
11584int Client::_sync_fs()
11585{
f67539c2
TL
11586 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11587
11fdf7f2 11588 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
11589
11590 // flush file data
11fdf7f2
TL
11591 std::unique_ptr<C_SaferCond> cond = nullptr;
11592 if (cct->_conf->client_oc) {
11593 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11594 objectcacher->flush_all(cond.get());
11595 }
7c673cae
FG
11596
11597 // flush caps
11598 flush_caps_sync();
11599 ceph_tid_t flush_tid = last_flush_tid;
11600
11601 // wait for unsafe mds requests
11602 wait_unsafe_requests();
11603
11604 wait_sync_caps(flush_tid);
11605
11fdf7f2 11606 if (nullptr != cond) {
9f95a23c 11607 client_lock.unlock();
11fdf7f2
TL
11608 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11609 cond->wait();
11610 ldout(cct, 15) << __func__ << " flush finished" << dendl;
9f95a23c 11611 client_lock.lock();
7c673cae
FG
11612 }
11613
11614 return 0;
11615}
11616
11617int Client::sync_fs()
11618{
f67539c2
TL
11619 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11620 if (!mref_reader.is_state_satisfied())
11621 return -CEPHFS_ENOTCONN;
181888fb 11622
f67539c2 11623 std::scoped_lock l(client_lock);
181888fb 11624
7c673cae
FG
11625 return _sync_fs();
11626}
11627
11628int64_t Client::drop_caches()
11629{
f67539c2 11630 std::scoped_lock l(client_lock);
7c673cae
FG
11631 return objectcacher->release_all();
11632}
11633
11fdf7f2
TL
11634int Client::_lazyio(Fh *fh, int enable)
11635{
11636 Inode *in = fh->inode.get();
11637 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11638
11639 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11640 return 0;
11641
11642 int orig_mode = fh->mode;
11643 if (enable) {
11644 fh->mode |= CEPH_FILE_MODE_LAZY;
11645 in->get_open_ref(fh->mode);
11646 in->put_open_ref(orig_mode);
11647 check_caps(in, CHECK_CAPS_NODELAY);
11648 } else {
11649 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11650 in->get_open_ref(fh->mode);
11651 in->put_open_ref(orig_mode);
11652 check_caps(in, 0);
11653 }
11654
11655 return 0;
11656}
11657
11658int Client::lazyio(int fd, int enable)
11659{
f67539c2 11660 std::scoped_lock l(client_lock);
11fdf7f2
TL
11661 Fh *f = get_filehandle(fd);
11662 if (!f)
f67539c2 11663 return -CEPHFS_EBADF;
11fdf7f2
TL
11664
11665 return _lazyio(f, enable);
11666}
11667
11668int Client::ll_lazyio(Fh *fh, int enable)
11669{
11fdf7f2
TL
11670 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11671 tout(cct) << __func__ << std::endl;
11672
f67539c2 11673 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11674 return _lazyio(fh, enable);
11675}
7c673cae 11676
92f5a8d4 11677int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 11678{
f67539c2 11679 std::scoped_lock l(client_lock);
92f5a8d4 11680 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
11681 << ", " << offset << ", " << count << ")" << dendl;
11682
11683 Fh *f = get_filehandle(fd);
11684 if (!f)
f67539c2 11685 return -CEPHFS_EBADF;
7c673cae
FG
11686
11687 // for now
11688 _fsync(f, true);
11689
11690 return 0;
11691}
11692
11693int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11694{
f67539c2 11695 std::scoped_lock l(client_lock);
7c673cae
FG
11696 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11697 << ", " << offset << ", " << count << ")" << dendl;
11698
11699 Fh *f = get_filehandle(fd);
11700 if (!f)
f67539c2 11701 return -CEPHFS_EBADF;
7c673cae
FG
11702 Inode *in = f->inode.get();
11703
11704 _fsync(f, true);
92f5a8d4
TL
11705 if (_release(in)) {
11706 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11707 if (r < 0)
11708 return r;
11709 }
7c673cae
FG
11710 return 0;
11711}
11712
11713
11714// =============================
11715// snaps
11716
f67539c2
TL
11717int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11718 mode_t mode, const std::map<std::string, std::string> &metadata)
7c673cae 11719{
f67539c2
TL
11720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11721 if (!mref_reader.is_state_satisfied())
11722 return -CEPHFS_ENOTCONN;
181888fb 11723
f67539c2 11724 std::scoped_lock l(client_lock);
181888fb 11725
7c673cae
FG
11726 filepath path(relpath);
11727 InodeRef in;
11728 int r = path_walk(path, &in, perm);
11729 if (r < 0)
11730 return r;
11731 if (cct->_conf->client_permissions) {
11732 r = may_create(in.get(), perm);
11733 if (r < 0)
11734 return r;
11735 }
11736 Inode *snapdir = open_snapdir(in.get());
f67539c2 11737 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
7c673cae 11738}
181888fb 11739
f67539c2 11740int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
7c673cae 11741{
f67539c2
TL
11742 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11743 if (!mref_reader.is_state_satisfied())
11744 return -CEPHFS_ENOTCONN;
181888fb 11745
f67539c2 11746 std::scoped_lock l(client_lock);
181888fb 11747
7c673cae
FG
11748 filepath path(relpath);
11749 InodeRef in;
11750 int r = path_walk(path, &in, perms);
11751 if (r < 0)
11752 return r;
f67539c2 11753 Inode *snapdir = open_snapdir(in.get());
7c673cae 11754 if (cct->_conf->client_permissions) {
f67539c2 11755 r = may_delete(snapdir, check_perms ? name : NULL, perms);
7c673cae
FG
11756 if (r < 0)
11757 return r;
11758 }
7c673cae
FG
11759 return _rmdir(snapdir, name, perms);
11760}
11761
11762// =============================
11763// expose caps
11764
f67539c2
TL
11765int Client::get_caps_issued(int fd)
11766{
11767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11768 if (!mref_reader.is_state_satisfied())
11769 return -CEPHFS_ENOTCONN;
7c673cae 11770
f67539c2 11771 std::scoped_lock lock(client_lock);
181888fb 11772
7c673cae
FG
11773 Fh *f = get_filehandle(fd);
11774 if (!f)
f67539c2 11775 return -CEPHFS_EBADF;
7c673cae
FG
11776
11777 return f->inode->caps_issued();
11778}
11779
11780int Client::get_caps_issued(const char *path, const UserPerm& perms)
11781{
f67539c2
TL
11782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11783 if (!mref_reader.is_state_satisfied())
11784 return -CEPHFS_ENOTCONN;
181888fb 11785
f67539c2 11786 std::scoped_lock lock(client_lock);
181888fb 11787
7c673cae
FG
11788 filepath p(path);
11789 InodeRef in;
11790 int r = path_walk(p, &in, perms, true);
11791 if (r < 0)
11792 return r;
11793 return in->caps_issued();
11794}
11795
11796// =========================================
11797// low level
11798
11799Inode *Client::open_snapdir(Inode *diri)
11800{
11801 Inode *in;
11802 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11803 if (!inode_map.count(vino)) {
11804 in = new Inode(this, vino, &diri->layout);
11805
11806 in->ino = diri->ino;
11807 in->snapid = CEPH_SNAPDIR;
11808 in->mode = diri->mode;
11809 in->uid = diri->uid;
11810 in->gid = diri->gid;
494da23a 11811 in->nlink = 1;
7c673cae
FG
11812 in->mtime = diri->mtime;
11813 in->ctime = diri->ctime;
11814 in->btime = diri->btime;
f6b5b4d7 11815 in->atime = diri->atime;
7c673cae
FG
11816 in->size = diri->size;
11817 in->change_attr = diri->change_attr;
11818
11819 in->dirfragtree.clear();
11820 in->snapdir_parent = diri;
11821 diri->flags |= I_SNAPDIR_OPEN;
11822 inode_map[vino] = in;
11823 if (use_faked_inos())
11824 _assign_faked_ino(in);
11825 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11826 } else {
11827 in = inode_map[vino];
11828 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11829 }
11830 return in;
11831}
11832
11833int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11834 Inode **out, const UserPerm& perms)
11835{
f67539c2
TL
11836 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11837 if (!mref_reader.is_state_satisfied())
11838 return -CEPHFS_ENOTCONN;
11839
31f18b77 11840 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
11841 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11842 tout(cct) << __func__ << std::endl;
7c673cae
FG
11843 tout(cct) << name << std::endl;
11844
f67539c2 11845 std::scoped_lock lock(client_lock);
181888fb 11846
7c673cae 11847 int r = 0;
11fdf7f2
TL
11848 if (!fuse_default_permissions) {
11849 if (strcmp(name, ".") && strcmp(name, "..")) {
11850 r = may_lookup(parent, perms);
11851 if (r < 0)
11852 return r;
11853 }
7c673cae
FG
11854 }
11855
11856 string dname(name);
11857 InodeRef in;
11858
11859 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11860 if (r < 0) {
11861 attr->st_ino = 0;
11862 goto out;
11863 }
11864
11fdf7f2 11865 ceph_assert(in);
7c673cae
FG
11866 fill_stat(in, attr);
11867 _ll_get(in.get());
11868
11869 out:
11fdf7f2 11870 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11871 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11872 tout(cct) << attr->st_ino << std::endl;
11873 *out = in.get();
11874 return r;
11875}
11876
f67539c2
TL
11877int Client::ll_lookup_vino(
11878 vinodeno_t vino,
1adf2230
AA
11879 const UserPerm& perms,
11880 Inode **inode)
11881{
81eedcae 11882 ceph_assert(inode != NULL);
f67539c2
TL
11883 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11884 if (!mref_reader.is_state_satisfied())
11885 return -CEPHFS_ENOTCONN;
81eedcae 11886
b3b6e05e
TL
11887 if (is_reserved_vino(vino))
11888 return -CEPHFS_ESTALE;
11889
f67539c2
TL
11890 std::scoped_lock lock(client_lock);
11891 ldout(cct, 3) << __func__ << " " << vino << dendl;
1adf2230 11892
f67539c2
TL
11893 // Check the cache first
11894 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11895 if (p != inode_map.end()) {
11896 *inode = p->second;
11897 _ll_get(*inode);
81eedcae
TL
11898 return 0;
11899 }
11900
f67539c2 11901 uint64_t snapid = vino.snapid;
81eedcae 11902
f67539c2
TL
11903 // for snapdir, find the non-snapped dir inode
11904 if (snapid == CEPH_SNAPDIR)
11905 vino.snapid = CEPH_NOSNAP;
11906
11907 int r = _lookup_vino(vino, perms, inode);
11908 if (r)
1adf2230 11909 return r;
f67539c2 11910 ceph_assert(*inode != NULL);
81eedcae 11911
f67539c2
TL
11912 if (snapid == CEPH_SNAPDIR) {
11913 Inode *tmp = *inode;
1adf2230 11914
f67539c2
TL
11915 // open the snapdir and put the inode ref
11916 *inode = open_snapdir(tmp);
11917 _ll_forget(tmp, 1);
11918 _ll_get(*inode);
1adf2230 11919 }
1adf2230
AA
11920 return 0;
11921}
11922
f67539c2
TL
11923int Client::ll_lookup_inode(
11924 struct inodeno_t ino,
11925 const UserPerm& perms,
11926 Inode **inode)
11927{
11928 vinodeno_t vino(ino, CEPH_NOSNAP);
11929 return ll_lookup_vino(vino, perms, inode);
11930}
11931
7c673cae
FG
11932int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11933 struct ceph_statx *stx, unsigned want, unsigned flags,
11934 const UserPerm& perms)
11935{
f67539c2
TL
11936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11937 if (!mref_reader.is_state_satisfied())
11938 return -CEPHFS_ENOTCONN;
11939
31f18b77 11940 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 11941 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
11942 tout(cct) << "ll_lookupx" << std::endl;
11943 tout(cct) << name << std::endl;
11944
f67539c2 11945 std::scoped_lock lock(client_lock);
181888fb 11946
7c673cae 11947 int r = 0;
11fdf7f2 11948 if (!fuse_default_permissions) {
7c673cae
FG
11949 r = may_lookup(parent, perms);
11950 if (r < 0)
11951 return r;
11952 }
11953
11954 string dname(name);
11955 InodeRef in;
11956
11957 unsigned mask = statx_to_mask(flags, want);
11958 r = _lookup(parent, dname, mask, &in, perms);
11959 if (r < 0) {
11960 stx->stx_ino = 0;
11961 stx->stx_mask = 0;
11962 } else {
11fdf7f2 11963 ceph_assert(in);
7c673cae
FG
11964 fill_statx(in, mask, stx);
11965 _ll_get(in.get());
11966 }
11967
11fdf7f2 11968 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11969 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11970 tout(cct) << stx->stx_ino << std::endl;
11971 *out = in.get();
11972 return r;
11973}
11974
11975int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11976 unsigned int want, unsigned int flags, const UserPerm& perms)
11977{
f67539c2
TL
11978 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11979 if (!mref_reader.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN;
181888fb 11981
7c673cae
FG
11982 filepath fp(name, 0);
11983 InodeRef in;
11984 int rc;
11985 unsigned mask = statx_to_mask(flags, want);
11986
11fdf7f2
TL
11987 ldout(cct, 3) << __func__ << " " << name << dendl;
11988 tout(cct) << __func__ << std::endl;
7c673cae
FG
11989 tout(cct) << name << std::endl;
11990
f67539c2 11991 std::scoped_lock lock(client_lock);
7c673cae
FG
11992 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11993 if (rc < 0) {
11994 /* zero out mask, just in case... */
11995 stx->stx_mask = 0;
11996 stx->stx_ino = 0;
11997 *out = NULL;
11998 return rc;
11999 } else {
11fdf7f2 12000 ceph_assert(in);
7c673cae
FG
12001 fill_statx(in, mask, stx);
12002 _ll_get(in.get());
12003 *out = in.get();
12004 return 0;
12005 }
12006}
12007
12008void Client::_ll_get(Inode *in)
12009{
12010 if (in->ll_ref == 0) {
b3b6e05e 12011 in->iget();
11fdf7f2
TL
12012 if (in->is_dir() && !in->dentries.empty()) {
12013 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
12014 in->get_first_parent()->get(); // pin dentry
12015 }
11fdf7f2
TL
12016 if (in->snapid != CEPH_NOSNAP)
12017 ll_snap_ref[in->snapid]++;
7c673cae
FG
12018 }
12019 in->ll_get();
11fdf7f2 12020 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
12021}
12022
494da23a 12023int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
12024{
12025 in->ll_put(num);
11fdf7f2 12026 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 12027 if (in->ll_ref == 0) {
11fdf7f2
TL
12028 if (in->is_dir() && !in->dentries.empty()) {
12029 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
12030 in->get_first_parent()->put(); // unpin dentry
12031 }
11fdf7f2
TL
12032 if (in->snapid != CEPH_NOSNAP) {
12033 auto p = ll_snap_ref.find(in->snapid);
12034 ceph_assert(p != ll_snap_ref.end());
12035 ceph_assert(p->second > 0);
12036 if (--p->second == 0)
12037 ll_snap_ref.erase(p);
12038 }
7c673cae
FG
12039 put_inode(in);
12040 return 0;
12041 } else {
12042 return in->ll_ref;
12043 }
12044}
12045
12046void Client::_ll_drop_pins()
12047{
11fdf7f2 12048 ldout(cct, 10) << __func__ << dendl;
1adf2230 12049 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
12050 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
12051 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
12052 it != inode_map.end();
12053 it = next) {
12054 Inode *in = it->second;
12055 next = it;
12056 ++next;
1adf2230
AA
12057 if (in->ll_ref){
12058 to_be_put.insert(in);
7c673cae 12059 _ll_put(in, in->ll_ref);
1adf2230 12060 }
7c673cae
FG
12061 }
12062}
12063
494da23a 12064bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 12065{
11fdf7f2 12066 inodeno_t ino = in->ino;
7c673cae 12067
11fdf7f2
TL
12068 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
12069 tout(cct) << __func__ << std::endl;
7c673cae
FG
12070 tout(cct) << ino.val << std::endl;
12071 tout(cct) << count << std::endl;
12072
181888fb 12073 // Ignore forget if we're no longer mounted
f67539c2
TL
12074 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12075 if (!mref_reader.is_state_satisfied())
181888fb
FG
12076 return true;
12077
7c673cae
FG
12078 if (ino == 1) return true; // ignore forget on root.
12079
12080 bool last = false;
12081 if (in->ll_ref < count) {
12082 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
12083 << ", which only has ll_ref=" << in->ll_ref << dendl;
12084 _ll_put(in, in->ll_ref);
12085 last = true;
12086 } else {
12087 if (_ll_put(in, count) == 0)
12088 last = true;
12089 }
12090
12091 return last;
12092}
12093
494da23a 12094bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 12095{
f67539c2 12096 std::scoped_lock lock(client_lock);
1adf2230
AA
12097 return _ll_forget(in, count);
12098}
12099
7c673cae
FG
12100bool Client::ll_put(Inode *in)
12101{
12102 /* ll_forget already takes the lock */
12103 return ll_forget(in, 1);
12104}
12105
11fdf7f2
TL
12106int Client::ll_get_snap_ref(snapid_t snap)
12107{
f67539c2 12108 std::scoped_lock lock(client_lock);
11fdf7f2
TL
12109 auto p = ll_snap_ref.find(snap);
12110 if (p != ll_snap_ref.end())
12111 return p->second;
12112 return 0;
12113}
12114
7c673cae
FG
12115snapid_t Client::ll_get_snapid(Inode *in)
12116{
f67539c2 12117 std::scoped_lock lock(client_lock);
7c673cae
FG
12118 return in->snapid;
12119}
12120
12121Inode *Client::ll_get_inode(ino_t ino)
12122{
f67539c2
TL
12123 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12124 if (!mref_reader.is_state_satisfied())
181888fb
FG
12125 return NULL;
12126
f67539c2
TL
12127 std::scoped_lock lock(client_lock);
12128
7c673cae
FG
12129 vinodeno_t vino = _map_faked_ino(ino);
12130 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12131 if (p == inode_map.end())
12132 return NULL;
12133 Inode *in = p->second;
12134 _ll_get(in);
12135 return in;
12136}
12137
12138Inode *Client::ll_get_inode(vinodeno_t vino)
12139{
f67539c2
TL
12140 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12141 if (!mref_reader.is_state_satisfied())
181888fb
FG
12142 return NULL;
12143
b3b6e05e
TL
12144 if (is_reserved_vino(vino))
12145 return NULL;
12146
f67539c2
TL
12147 std::scoped_lock lock(client_lock);
12148
7c673cae
FG
12149 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12150 if (p == inode_map.end())
12151 return NULL;
12152 Inode *in = p->second;
12153 _ll_get(in);
12154 return in;
12155}
12156
12157int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12158{
12159 vinodeno_t vino = _get_vino(in);
12160
11fdf7f2
TL
12161 ldout(cct, 8) << __func__ << " " << vino << dendl;
12162 tout(cct) << __func__ << std::endl;
7c673cae
FG
12163 tout(cct) << vino.ino.val << std::endl;
12164
12165 if (vino.snapid < CEPH_NOSNAP)
12166 return 0;
12167 else
12168 return _getattr(in, caps, perms);
12169}
12170
12171int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12172{
f67539c2
TL
12173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12174 if (!mref_reader.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN;
7c673cae 12176
f67539c2 12177 std::scoped_lock lock(client_lock);
181888fb 12178
7c673cae
FG
12179 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12180
12181 if (res == 0)
12182 fill_stat(in, attr);
11fdf7f2 12183 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12184 return res;
12185}
12186
12187int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12188 unsigned int flags, const UserPerm& perms)
12189{
f67539c2
TL
12190 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12191 if (!mref_reader.is_state_satisfied())
12192 return -CEPHFS_ENOTCONN;
7c673cae 12193
f67539c2 12194 std::scoped_lock lock(client_lock);
181888fb 12195
7c673cae
FG
12196 int res = 0;
12197 unsigned mask = statx_to_mask(flags, want);
12198
94b18763 12199 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
12200 res = _ll_getattr(in, mask, perms);
12201
12202 if (res == 0)
12203 fill_statx(in, mask, stx);
11fdf7f2 12204 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12205 return res;
12206}
12207
12208int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12209 const UserPerm& perms, InodeRef *inp)
12210{
12211 vinodeno_t vino = _get_vino(in);
12212
11fdf7f2 12213 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 12214 << dendl;
11fdf7f2 12215 tout(cct) << __func__ << std::endl;
7c673cae
FG
12216 tout(cct) << vino.ino.val << std::endl;
12217 tout(cct) << stx->stx_mode << std::endl;
12218 tout(cct) << stx->stx_uid << std::endl;
12219 tout(cct) << stx->stx_gid << std::endl;
12220 tout(cct) << stx->stx_size << std::endl;
12221 tout(cct) << stx->stx_mtime << std::endl;
12222 tout(cct) << stx->stx_atime << std::endl;
12223 tout(cct) << stx->stx_btime << std::endl;
12224 tout(cct) << mask << std::endl;
12225
11fdf7f2 12226 if (!fuse_default_permissions) {
7c673cae
FG
12227 int res = may_setattr(in, stx, mask, perms);
12228 if (res < 0)
12229 return res;
12230 }
12231
12232 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
12233
12234 return __setattrx(in, stx, mask, perms, inp);
12235}
12236
12237int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12238 const UserPerm& perms)
12239{
f67539c2
TL
12240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12241 if (!mref_reader.is_state_satisfied())
12242 return -CEPHFS_ENOTCONN;
181888fb 12243
f67539c2 12244 std::scoped_lock lock(client_lock);
181888fb 12245
7c673cae
FG
12246 InodeRef target(in);
12247 int res = _ll_setattrx(in, stx, mask, perms, &target);
12248 if (res == 0) {
11fdf7f2 12249 ceph_assert(in == target.get());
7c673cae
FG
12250 fill_statx(in, in->caps_issued(), stx);
12251 }
12252
11fdf7f2 12253 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12254 return res;
12255}
12256
12257int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12258 const UserPerm& perms)
12259{
12260 struct ceph_statx stx;
12261 stat_to_statx(attr, &stx);
12262
f67539c2
TL
12263 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12264 if (!mref_reader.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN;
181888fb 12266
f67539c2 12267 std::scoped_lock lock(client_lock);
181888fb 12268
7c673cae
FG
12269 InodeRef target(in);
12270 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12271 if (res == 0) {
11fdf7f2 12272 ceph_assert(in == target.get());
7c673cae
FG
12273 fill_stat(in, attr);
12274 }
12275
11fdf7f2 12276 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12277 return res;
12278}
12279
12280
12281// ----------
12282// xattrs
12283
12284int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12285 const UserPerm& perms)
12286{
f67539c2
TL
12287 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12288 if (!mref_reader.is_state_satisfied())
12289 return -CEPHFS_ENOTCONN;
181888fb 12290
f67539c2 12291 std::scoped_lock lock(client_lock);
181888fb 12292
7c673cae
FG
12293 InodeRef in;
12294 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12295 if (r < 0)
12296 return r;
12297 return _getxattr(in, name, value, size, perms);
12298}
12299
12300int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12301 const UserPerm& perms)
12302{
f67539c2
TL
12303 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12304 if (!mref_reader.is_state_satisfied())
12305 return -CEPHFS_ENOTCONN;
181888fb 12306
f67539c2 12307 std::scoped_lock lock(client_lock);
181888fb 12308
7c673cae
FG
12309 InodeRef in;
12310 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12311 if (r < 0)
12312 return r;
12313 return _getxattr(in, name, value, size, perms);
12314}
12315
12316int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12317 const UserPerm& perms)
12318{
f67539c2
TL
12319 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12320 if (!mref_reader.is_state_satisfied())
12321 return -CEPHFS_ENOTCONN;
181888fb 12322
f67539c2 12323 std::scoped_lock lock(client_lock);
181888fb 12324
7c673cae
FG
12325 Fh *f = get_filehandle(fd);
12326 if (!f)
f67539c2 12327 return -CEPHFS_EBADF;
7c673cae
FG
12328 return _getxattr(f->inode, name, value, size, perms);
12329}
12330
12331int Client::listxattr(const char *path, char *list, size_t size,
12332 const UserPerm& perms)
12333{
f67539c2
TL
12334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12335 if (!mref_reader.is_state_satisfied())
12336 return -CEPHFS_ENOTCONN;
181888fb 12337
f67539c2 12338 std::scoped_lock lock(client_lock);
181888fb 12339
7c673cae
FG
12340 InodeRef in;
12341 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12342 if (r < 0)
12343 return r;
12344 return Client::_listxattr(in.get(), list, size, perms);
12345}
12346
12347int Client::llistxattr(const char *path, char *list, size_t size,
12348 const UserPerm& perms)
12349{
f67539c2
TL
12350 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12351 if (!mref_reader.is_state_satisfied())
12352 return -CEPHFS_ENOTCONN;
181888fb 12353
f67539c2 12354 std::scoped_lock lock(client_lock);
181888fb 12355
7c673cae
FG
12356 InodeRef in;
12357 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12358 if (r < 0)
12359 return r;
12360 return Client::_listxattr(in.get(), list, size, perms);
12361}
12362
12363int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12364{
f67539c2
TL
12365 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12366 if (!mref_reader.is_state_satisfied())
12367 return -CEPHFS_ENOTCONN;
181888fb 12368
f67539c2 12369 std::scoped_lock lock(client_lock);
181888fb 12370
7c673cae
FG
12371 Fh *f = get_filehandle(fd);
12372 if (!f)
f67539c2 12373 return -CEPHFS_EBADF;
7c673cae
FG
12374 return Client::_listxattr(f->inode.get(), list, size, perms);
12375}
12376
12377int Client::removexattr(const char *path, const char *name,
12378 const UserPerm& perms)
12379{
f67539c2
TL
12380 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12381 if (!mref_reader.is_state_satisfied())
12382 return -CEPHFS_ENOTCONN;
181888fb 12383
f67539c2 12384 std::scoped_lock lock(client_lock);
181888fb 12385
7c673cae
FG
12386 InodeRef in;
12387 int r = Client::path_walk(path, &in, perms, true);
12388 if (r < 0)
12389 return r;
12390 return _removexattr(in, name, perms);
12391}
12392
12393int Client::lremovexattr(const char *path, const char *name,
12394 const UserPerm& perms)
12395{
f67539c2
TL
12396 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12397 if (!mref_reader.is_state_satisfied())
12398 return -CEPHFS_ENOTCONN;
181888fb 12399
f67539c2 12400 std::scoped_lock lock(client_lock);
181888fb 12401
7c673cae
FG
12402 InodeRef in;
12403 int r = Client::path_walk(path, &in, perms, false);
12404 if (r < 0)
12405 return r;
12406 return _removexattr(in, name, perms);
12407}
12408
12409int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12410{
f67539c2
TL
12411 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12412 if (!mref_reader.is_state_satisfied())
12413 return -CEPHFS_ENOTCONN;
181888fb 12414
f67539c2 12415 std::scoped_lock lock(client_lock);
181888fb 12416
7c673cae
FG
12417 Fh *f = get_filehandle(fd);
12418 if (!f)
f67539c2 12419 return -CEPHFS_EBADF;
7c673cae
FG
12420 return _removexattr(f->inode, name, perms);
12421}
12422
12423int Client::setxattr(const char *path, const char *name, const void *value,
12424 size_t size, int flags, const UserPerm& perms)
12425{
f67539c2
TL
12426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12427 if (!mref_reader.is_state_satisfied())
12428 return -CEPHFS_ENOTCONN;
12429
7c673cae
FG
12430 _setxattr_maybe_wait_for_osdmap(name, value, size);
12431
f67539c2 12432 std::scoped_lock lock(client_lock);
181888fb 12433
7c673cae
FG
12434 InodeRef in;
12435 int r = Client::path_walk(path, &in, perms, true);
12436 if (r < 0)
12437 return r;
12438 return _setxattr(in, name, value, size, flags, perms);
12439}
12440
12441int Client::lsetxattr(const char *path, const char *name, const void *value,
12442 size_t size, int flags, const UserPerm& perms)
12443{
f67539c2
TL
12444 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12445 if (!mref_reader.is_state_satisfied())
12446 return -CEPHFS_ENOTCONN;
7c673cae 12447
f67539c2 12448 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12449
f67539c2 12450 std::scoped_lock lock(client_lock);
181888fb 12451
7c673cae
FG
12452 InodeRef in;
12453 int r = Client::path_walk(path, &in, perms, false);
12454 if (r < 0)
12455 return r;
12456 return _setxattr(in, name, value, size, flags, perms);
12457}
12458
12459int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12460 int flags, const UserPerm& perms)
12461{
f67539c2
TL
12462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12463 if (!mref_reader.is_state_satisfied())
12464 return -CEPHFS_ENOTCONN;
7c673cae 12465
f67539c2 12466 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12467
f67539c2 12468 std::scoped_lock lock(client_lock);
181888fb 12469
7c673cae
FG
12470 Fh *f = get_filehandle(fd);
12471 if (!f)
f67539c2 12472 return -CEPHFS_EBADF;
7c673cae
FG
12473 return _setxattr(f->inode, name, value, size, flags, perms);
12474}
12475
// Core getxattr implementation (caller holds client_lock).
//
// Resolution order:
//   1. virtual "ceph.*" xattrs served from local tables (_match_vxattr);
//   2. other "ceph."-prefixed names forwarded to the MDS via _getvxattr;
//   3. regular xattrs fetched with _getattr and read from in->xattrs.
//
// Returns the value length on success (only copying into 'value' when
// size != 0), -CEPHFS_ERANGE when the buffer is too small, or a
// negative CEPHFS error.
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;
  const VXattr *vxattr = nullptr;

  vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -CEPHFS_ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    // NOTE(review): getxattr_cb is snprintf-style and may report a
    // length > sizeof(buf); a vxattr value longer than 256 bytes would
    // make the memcpy below over-read buf — presumably all current
    // vxattr values are short. TODO confirm upstream.
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -CEPHFS_ENODATA;
    }

    // size == 0 is the "query length only" convention of getxattr(2).
    if (size != 0) {
      if (r > (int)size) {
	r = -CEPHFS_ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // Unmatched "ceph." names are resolved by the MDS.
  if (!strncmp(name, "ceph.", 5)) {
    r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
    goto out;
  }

  // "system.*" (ACL) xattrs are unsupported when ACLs are disabled.
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -CEPHFS_EOPNOTSUPP;
    goto out;
  }

  // Regular xattr: refresh only when we have never seen an xattr map.
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -CEPHFS_ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -CEPHFS_ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
12547
12548int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12549 const UserPerm& perms)
12550{
12551 if (cct->_conf->client_permissions) {
12552 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12553 if (r < 0)
12554 return r;
12555 }
12556 return _getxattr(in.get(), name, value, size, perms);
12557}
12558
12559int Client::ll_getxattr(Inode *in, const char *name, void *value,
12560 size_t size, const UserPerm& perms)
12561{
f67539c2
TL
12562 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12563 if (!mref_reader.is_state_satisfied())
12564 return -CEPHFS_ENOTCONN;
181888fb 12565
7c673cae
FG
12566 vinodeno_t vino = _get_vino(in);
12567
11fdf7f2
TL
12568 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12569 tout(cct) << __func__ << std::endl;
7c673cae
FG
12570 tout(cct) << vino.ino.val << std::endl;
12571 tout(cct) << name << std::endl;
12572
f67539c2 12573 std::scoped_lock lock(client_lock);
11fdf7f2 12574 if (!fuse_default_permissions) {
7c673cae
FG
12575 int r = xattr_permission(in, name, MAY_READ, perms);
12576 if (r < 0)
12577 return r;
12578 }
12579
12580 return _getxattr(in, name, value, size, perms);
12581}
12582
// Fill 'name' with a NUL-separated list of regular xattr names on 'in',
// skipping every "ceph."-prefixed (virtual) name.  With size == 0 the
// function only computes the required buffer length (len_only), per the
// listxattr(2) convention.  Returns total bytes (written or required),
// -CEPHFS_ERANGE when the caller's buffer is too small, or a negative
// error from _getattr.  Caller holds client_lock.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh the xattr map only if we have never fetched one.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    // Virtual xattrs are never listed.
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue;
    }

    size_t this_len = xattr_name.length() + 1;  // +1 for the NUL terminator
    r += this_len;
    if (len_only)
      continue;

    // 'size' tracks the space remaining in the caller's buffer.
    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
12616
12617int Client::ll_listxattr(Inode *in, char *names, size_t size,
12618 const UserPerm& perms)
12619{
f67539c2
TL
12620 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12621 if (!mref_reader.is_state_satisfied())
12622 return -CEPHFS_ENOTCONN;
181888fb 12623
7c673cae
FG
12624 vinodeno_t vino = _get_vino(in);
12625
11fdf7f2
TL
12626 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12627 tout(cct) << __func__ << std::endl;
7c673cae
FG
12628 tout(cct) << vino.ino.val << std::endl;
12629 tout(cct) << size << std::endl;
12630
f67539c2 12631 std::scoped_lock lock(client_lock);
7c673cae
FG
12632 return _listxattr(in, names, size, perms);
12633}
12634
// Build and send the CEPH_MDS_OP_SETXATTR MetaRequest for 'in'.
// A null 'value' encodes removal (CEPH_XATTR_REMOVE); XATTR_CREATE /
// XATTR_REPLACE are translated to their CEPH_XATTR_* counterparts.
// Returns the MDS reply code.  Caller holds client_lock; the request is
// owned (and freed) by make_request.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The xattr value travels as the request's data payload.
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
12667
// Core setxattr implementation (caller holds client_lock).
//
// Rejects snapshot inodes (EROFS) and unsupported namespaces; handles
// POSIX-ACL xattrs specially (mode rewrite / validation) when ACLs are
// enabled; refuses writes to read-only virtual xattrs; and after a
// successful quota write verifies that a snaprealm now exists for the
// quota inode.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: empty value is allowed, but a NULL pointer with a
  // non-zero size is a caller error.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same namespace whitelist as the kernel client.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the ACL is equivalent to a plain mode, drop the xattr
	// (value=NULL → remove) and apply the mode via setattr instead.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE(review): stx is only partially initialized; presumably
	  // _do_setattr reads just stx_mode under CEPH_SETATTR_MODE —
	  // confirm before touching this.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      // Setting (not removing) a quota requires a follow-up realm check.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
12748
12749int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12750 size_t size, int flags, const UserPerm& perms)
12751{
12752 if (cct->_conf->client_permissions) {
12753 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12754 if (r < 0)
12755 return r;
12756 }
12757 return _setxattr(in.get(), name, value, size, flags, perms);
12758}
12759
// Validate the pool referenced by a layout xattr value against 'osdmap'.
// For name == "layout" the whole key=value string is parsed (Boost
// Spirit.Qi grammar 'keys_and_values') and the "pool" key extracted;
// for "layout.pool" the value itself names the pool.  The pool may be
// given numerically or by name.  Returns 0 when the pool exists (or no
// pool was specified), -CEPHFS_EINVAL on parse failure, and
// -CEPHFS_ENOENT when the pool is unknown to this osdmap.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    // The whole value must have been consumed by the grammar.
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric pool id first; fall back to lookup by name.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
12799
// If 'name' is one of the layout/layout.pool xattrs, validate the pool
// against the current osdmap and, when it is unknown (ENOENT), block
// until the latest osdmap has been fetched.  Must be called WITHOUT
// client_lock held, since it can wait on the objecter.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // Strip the "ceph.file."/"ceph.dir." prefix: the checker expects
    // "layout" or "layout.pool".
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
12822
12823int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12824 size_t size, int flags, const UserPerm& perms)
12825{
f67539c2
TL
12826 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12827 if (!mref_reader.is_state_satisfied())
12828 return -CEPHFS_ENOTCONN;
7c673cae 12829
f67539c2 12830 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12831
7c673cae
FG
12832 vinodeno_t vino = _get_vino(in);
12833
11fdf7f2
TL
12834 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12835 tout(cct) << __func__ << std::endl;
7c673cae
FG
12836 tout(cct) << vino.ino.val << std::endl;
12837 tout(cct) << name << std::endl;
12838
f67539c2 12839 std::scoped_lock lock(client_lock);
11fdf7f2 12840 if (!fuse_default_permissions) {
7c673cae
FG
12841 int r = xattr_permission(in, name, MAY_WRITE, perms);
12842 if (r < 0)
12843 return r;
12844 }
12845 return _setxattr(in, name, value, size, flags, perms);
12846}
12847
// Core removexattr implementation (caller holds client_lock).
// Rejects snapshot inodes (EROFS), names outside the supported
// namespaces, and read-only virtual xattrs, then issues a
// CEPH_MDS_OP_RMXATTR request to the MDS.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);   // xattr name rides in the second filepath
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
12879
12880int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12881{
12882 if (cct->_conf->client_permissions) {
12883 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12884 if (r < 0)
12885 return r;
12886 }
12887 return _removexattr(in.get(), name, perms);
12888}
12889
12890int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12891{
f67539c2
TL
12892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12893 if (!mref_reader.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN;
181888fb 12895
7c673cae
FG
12896 vinodeno_t vino = _get_vino(in);
12897
12898 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12899 tout(cct) << "ll_removexattr" << std::endl;
12900 tout(cct) << vino.ino.val << std::endl;
12901 tout(cct) << name << std::endl;
12902
f67539c2 12903 std::scoped_lock lock(client_lock);
11fdf7f2 12904 if (!fuse_default_permissions) {
7c673cae
FG
12905 int r = xattr_permission(in, name, MAY_WRITE, perms);
12906 if (r < 0)
12907 return r;
12908 }
12909
12910 return _removexattr(in, name, perms);
12911}
12912
// exists_cb for "ceph.quota*": a quota is visible when it is enabled
// and either the inode is a snapshot or its own snaprealm exists.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
	 (in->snapid != CEPH_NOSNAP ||
	  (in->snaprealm && in->snaprealm->ino == in->ino));
}
// "ceph.quota": both limits in one key=value string.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// "ceph.quota.max_bytes"
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// "ceph.quota.max_files"
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
12934
// exists_cb for "ceph.*.layout*": only shown when the inode's layout
// differs from the default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// "ceph.file.layout"/"ceph.dir.layout": full layout as key=value pairs.
// The pool is rendered by name when the osdmap knows it, else by id.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  // NOTE(review): if r ever reaches 'size', the size - r arguments above
  // underflow size_t — presumably callers always pass a large enough
  // buffer (256 bytes in _getxattr). TODO confirm.
  return r;
}
// Per-field layout getters ("ceph.<type>.layout.<field>").
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// Pool rendered by name when the osdmap knows it, otherwise by id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics getters ("ceph.dir.*").  The plain entries/
// files/subdirs values come from dirstat; the r-prefixed ones are the
// recursive rstat values.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// Recursive ctime as "<sec>.<nsec padded to 9 digits>".
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
      (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
// "ceph.dir.pin": a pin is considered present unless dir_pin holds the
// -CEPHFS_ENODATA sentinel.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -CEPHFS_ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
7c673cae 13032
81eedcae
TL
// "ceph.snap.btime": shown only when the snapshot birth time is set.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

// Rendered as "<sec>.<nsec padded to 9 digits>".
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
      (long long unsigned)in->snap_btime.sec(),
      (long unsigned)in->snap_btime.nsec());
}
13044
20effc67
TL
// "ceph.caps": issued caps as "<symbolic>/0x<hex>", e.g. "pAsLsXs/0x...".
size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
{
  int issued;

  in->caps_issued(&issued);
  return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
}
13052
f67539c2
TL
bool Client::_vxattrcb_mirror_info_exists(Inode *in)
{
  // checking one of the xattrs would suffice
  return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
}

// "ceph.mirror.info": combines the two underlying mirror xattrs.
// NOTE(review): operator[] default-inserts an empty bufferlist if
// "ceph.mirror.info.fs_id" is absent even though exists_cb only checked
// cluster_id — harmless for output, but it mutates in->xattrs; confirm
// that is intended.
size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
                  in->xattrs["ceph.mirror.info.cluster_id"].length(),
                  in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
                  in->xattrs["ceph.mirror.info.fs_id"].length(),
                  in->xattrs["ceph.mirror.info.fs_id"].c_str());
}
13067
adb31ebb
TL
// "ceph.cluster_fsid": the cluster fsid as reported by the monitors.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// "ceph.client_id": this client's entity name, e.g. "client.1234".
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
}
13078
7c673cae
FG
// Helpers building virtual-xattr names:
//   CEPH_XATTR_NAME(dir, entries)       -> "ceph.dir.entries"
//   CEPH_XATTR_NAME2(dir, layout, pool) -> "ceph.dir.layout.pool"
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Table entry for a read-only stat-backed vxattr; _flags tells
// _getxattr which refresh (VXATTR_DIRSTAT / VXATTR_RSTAT) is needed.
#define XATTR_NAME_CEPH(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Table entry for a "ceph.<type>.layout.<field>" vxattr; only visible
// when the inode has a non-default layout (exists_cb).
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Table entry for a "ceph.quota.<field>" vxattr; only visible when a
// quota is set (exists_cb).
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
13106
// Virtual xattrs available on directories.  Scanned linearly by
// _match_vxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // FIXME
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // FIXME
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" }     /* Required table terminator */
};
13172
// Virtual xattrs available on regular files.  Scanned linearly by
// _match_vxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" }     /* Required table terminator */
};
13202
adb31ebb
TL
// Virtual xattrs available on every inode type; consulted by
// _match_vxattr after the type-specific table misses.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" }     /* Required table terminator */
};
13220
7c673cae
FG
13221const Client::VXattr *Client::_get_vxattrs(Inode *in)
13222{
13223 if (in->is_dir())
13224 return _dir_vxattrs;
13225 else if (in->is_file())
13226 return _file_vxattrs;
13227 return NULL;
13228}
13229
// Look 'name' up in the inode-type-specific vxattr table, then in the
// common table.  Only "ceph."-prefixed names are ever matched; returns
// NULL when nothing matches.
const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
{
  if (strncmp(name, "ceph.", 5) == 0) {
    const VXattr *vxattr = _get_vxattrs(in);
    if (vxattr) {
      // Tables are terminated by an entry with an empty name.
      while (!vxattr->name.empty()) {
	if (vxattr->name == name)
	  return vxattr;
	vxattr++;
      }
    }

    // for common vxattrs
    vxattr = _common_vxattrs;
    while (!vxattr->name.empty()) {
      if (vxattr->name == name)
	return vxattr;
      vxattr++;
    }
  }

  return NULL;
}
13253
7c673cae
FG
// Low-level readlink: copies the symlink target of 'in' into buf.
// Touches the inode's dentries to keep them warm in the LRU before
// delegating to _readlink.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
13275
// Core mknod implementation (caller holds client_lock).
// Validates name length, snapshot read-only state and the files quota,
// then issues CEPH_MDS_OP_MKNOD with any default-ACL xattrs attached as
// the request payload.  On success *inp receives the new inode.  The
// goto-fail path releases the request on early errors.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Cannot create inside a snapshot.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // _posix_acl_create may clear setgid/ACL bits in 'mode' and emit the
  // inherited default ACLs as xattrs for the new inode.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13329
13330int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13331 dev_t rdev, struct stat *attr, Inode **out,
13332 const UserPerm& perms)
13333{
f67539c2
TL
13334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13335 if (!mref_reader.is_state_satisfied())
13336 return -CEPHFS_ENOTCONN;
181888fb 13337
7c673cae
FG
13338 vinodeno_t vparent = _get_vino(parent);
13339
13340 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13341 tout(cct) << "ll_mknod" << std::endl;
13342 tout(cct) << vparent.ino.val << std::endl;
13343 tout(cct) << name << std::endl;
13344 tout(cct) << mode << std::endl;
13345 tout(cct) << rdev << std::endl;
13346
f67539c2 13347 std::scoped_lock lock(client_lock);
11fdf7f2 13348 if (!fuse_default_permissions) {
7c673cae
FG
13349 int r = may_create(parent, perms);
13350 if (r < 0)
13351 return r;
13352 }
13353
13354 InodeRef in;
13355 int r = _mknod(parent, name, mode, rdev, perms, &in);
13356 if (r == 0) {
13357 fill_stat(in, attr);
13358 _ll_get(in.get());
13359 }
13360 tout(cct) << attr->st_ino << std::endl;
13361 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13362 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13363 *out = in.get();
13364 return r;
13365}
13366
13367int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13368 dev_t rdev, Inode **out,
13369 struct ceph_statx *stx, unsigned want, unsigned flags,
13370 const UserPerm& perms)
13371{
f67539c2
TL
13372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13373 if (!mref_reader.is_state_satisfied())
13374 return -CEPHFS_ENOTCONN;
7c673cae 13375
f67539c2 13376 unsigned caps = statx_to_mask(flags, want);
181888fb 13377
7c673cae
FG
13378 vinodeno_t vparent = _get_vino(parent);
13379
13380 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13381 tout(cct) << "ll_mknodx" << std::endl;
13382 tout(cct) << vparent.ino.val << std::endl;
13383 tout(cct) << name << std::endl;
13384 tout(cct) << mode << std::endl;
13385 tout(cct) << rdev << std::endl;
13386
f67539c2
TL
13387 std::scoped_lock lock(client_lock);
13388
11fdf7f2 13389 if (!fuse_default_permissions) {
7c673cae
FG
13390 int r = may_create(parent, perms);
13391 if (r < 0)
13392 return r;
13393 }
13394
13395 InodeRef in;
13396 int r = _mknod(parent, name, mode, rdev, perms, &in);
13397 if (r == 0) {
13398 fill_statx(in, caps, stx);
13399 _ll_get(in.get());
13400 }
13401 tout(cct) << stx->stx_ino << std::endl;
13402 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13403 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13404 *out = in.get();
13405 return r;
13406}
13407
/**
 * Internal create: build and send a CEPH_MDS_OP_CREATE request and,
 * optionally, open the resulting file.
 *
 * @param dir            parent directory inode
 * @param name           name of the new file
 * @param flags          POSIX open flags (O_CREAT is added to the wire flags)
 * @param mode           permission bits; S_IFREG is OR'd in below
 * @param inp            out: ref to the created inode
 * @param fhp            out (optional): if non-null, an open Fh is returned
 * @param stripe_unit/stripe_count/object_size  file layout parameters (0 = default)
 * @param data_pool      optional data pool name; resolved to a pool id
 * @param created        out (optional): set by make_request if the inode was created
 * @param perms          caller credentials
 * @param alternate_name alternate (e.g. encrypted) name to store with the dentry
 * @return 0 on success, negative CEPHFS_* error code on failure
 *
 * Caller must hold client_lock.
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve an explicit data pool name to its id before building the request.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE; // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs; may adjust 'mode' and produce xattr payload.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
13503
/**
 * Internal mkdir. Also handles snapshot creation: when @a dir is the
 * special .snap directory (CEPH_SNAPDIR), this issues CEPH_MDS_OP_MKSNAP
 * instead of CEPH_MDS_OP_MKDIR.
 *
 * @param dir            parent directory (or .snap dir) inode
 * @param name           new directory / snapshot name
 * @param mode           permission bits; S_IFDIR is OR'd in below
 * @param perm           caller credentials
 * @param inp            out: ref to the created inode
 * @param metadata       snapshot metadata (used only for mksnap)
 * @param alternate_name alternate name to store with the dentry
 * @return 0 on success, negative CEPHFS_* error code on failure
 *
 * Caller must hold client_lock.
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp, const std::map<std::string, std::string> &metadata,
		   std::string alternate_name)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Creation is only allowed in the live tree or the .snap directory.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
  MetaRequest *req = new MetaRequest(is_snap_op ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->set_alternate_name(std::move(alternate_name));

  mode |= S_IFDIR;
  bufferlist bl;
  int res = _posix_acl_create(dir, &mode, bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (is_snap_op) {
    SnapPayload payload;
    // clear the bufferlist that may have been populated by the call
    // to _posix_acl_create(). MDS mksnap does not make use of it.
    // So, reuse it to pass metadata payload.
    bl.clear();
    payload.metadata = metadata;
    encode(payload, bl);
  }
  if (bl.length() > 0) {
    req->set_data(bl);
  }

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
13573
13574int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13575 struct stat *attr, Inode **out, const UserPerm& perm)
13576{
f67539c2
TL
13577 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13578 if (!mref_reader.is_state_satisfied())
13579 return -CEPHFS_ENOTCONN;
181888fb 13580
7c673cae
FG
13581 vinodeno_t vparent = _get_vino(parent);
13582
13583 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13584 tout(cct) << "ll_mkdir" << std::endl;
13585 tout(cct) << vparent.ino.val << std::endl;
13586 tout(cct) << name << std::endl;
13587 tout(cct) << mode << std::endl;
13588
f67539c2
TL
13589 std::scoped_lock lock(client_lock);
13590
11fdf7f2 13591 if (!fuse_default_permissions) {
7c673cae
FG
13592 int r = may_create(parent, perm);
13593 if (r < 0)
13594 return r;
13595 }
13596
13597 InodeRef in;
13598 int r = _mkdir(parent, name, mode, perm, &in);
13599 if (r == 0) {
13600 fill_stat(in, attr);
13601 _ll_get(in.get());
13602 }
13603 tout(cct) << attr->st_ino << std::endl;
13604 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13605 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13606 *out = in.get();
13607 return r;
13608}
13609
/**
 * Low-level mkdir with statx result.
 *
 * @param parent parent directory inode
 * @param name   new directory name
 * @param mode   permission bits
 * @param out    out: raw pointer to the new inode (ref held via _ll_get)
 * @param stx    out: statx of the new inode on success; ino/mask zeroed on failure
 * @param want   statx fields requested by the caller
 * @param flags  AT_* style statx flags
 * @param perms  caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    // On failure, don't leave *stx uninitialized — stx_ino is read below.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();  // nullptr on failure
  return r;
}
13649
/**
 * Internal symlink: build and send a CEPH_MDS_OP_SYMLINK request.
 *
 * @param dir            parent directory inode
 * @param name           name of the new symlink
 * @param target         link target string
 * @param perms          caller credentials
 * @param alternate_name alternate name to store with the dentry
 * @param inp            out: ref to the created inode
 * @return 0 on success, negative CEPHFS_* error code on failure
 *
 * Caller must hold client_lock.
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // No creation inside snapshots.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->set_string2(target);  // link target travels as string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
13696
13697int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13698 struct stat *attr, Inode **out, const UserPerm& perms)
13699{
f67539c2
TL
13700 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13701 if (!mref_reader.is_state_satisfied())
13702 return -CEPHFS_ENOTCONN;
181888fb 13703
7c673cae
FG
13704 vinodeno_t vparent = _get_vino(parent);
13705
13706 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13707 << dendl;
13708 tout(cct) << "ll_symlink" << std::endl;
13709 tout(cct) << vparent.ino.val << std::endl;
13710 tout(cct) << name << std::endl;
13711 tout(cct) << value << std::endl;
13712
f67539c2
TL
13713 std::scoped_lock lock(client_lock);
13714
11fdf7f2 13715 if (!fuse_default_permissions) {
7c673cae
FG
13716 int r = may_create(parent, perms);
13717 if (r < 0)
13718 return r;
13719 }
13720
13721 InodeRef in;
f67539c2 13722 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13723 if (r == 0) {
13724 fill_stat(in, attr);
13725 _ll_get(in.get());
13726 }
13727 tout(cct) << attr->st_ino << std::endl;
13728 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13729 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13730 *out = in.get();
13731 return r;
13732}
13733
13734int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13735 Inode **out, struct ceph_statx *stx, unsigned want,
13736 unsigned flags, const UserPerm& perms)
13737{
f67539c2
TL
13738 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13739 if (!mref_reader.is_state_satisfied())
13740 return -CEPHFS_ENOTCONN;
181888fb 13741
7c673cae
FG
13742 vinodeno_t vparent = _get_vino(parent);
13743
13744 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13745 << dendl;
13746 tout(cct) << "ll_symlinkx" << std::endl;
13747 tout(cct) << vparent.ino.val << std::endl;
13748 tout(cct) << name << std::endl;
13749 tout(cct) << value << std::endl;
13750
f67539c2
TL
13751 std::scoped_lock lock(client_lock);
13752
11fdf7f2 13753 if (!fuse_default_permissions) {
7c673cae
FG
13754 int r = may_create(parent, perms);
13755 if (r < 0)
13756 return r;
13757 }
13758
13759 InodeRef in;
f67539c2 13760 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13761 if (r == 0) {
13762 fill_statx(in, statx_to_mask(flags, want), stx);
13763 _ll_get(in.get());
13764 }
13765 tout(cct) << stx->stx_ino << std::endl;
13766 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13767 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13768 *out = in.get();
13769 return r;
13770}
13771
/**
 * Internal unlink: build and send a CEPH_MDS_OP_UNLINK request for
 * @a name in directory @a dir.
 *
 * @param dir  parent directory inode
 * @param name name of the entry to remove
 * @param perm caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 *
 * Caller must hold client_lock.
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // No modification inside snapshots.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Declared up front because the gotos below jump over their first use.
  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the target inode so we can attach it to the request and
  // revoke any delegations on it before the MDS removes the link.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
13821
/**
 * Low-level unlink: remove entry @a name from directory @a in.
 *
 * @param in   parent directory inode
 * @param name entry name to remove
 * @param perm caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions client-side unless FUSE does it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
13844
13845int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13846{
1adf2230 13847 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
13848 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13849
13850 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13851 return -CEPHFS_EROFS;
7c673cae 13852 }
b32b8144
FG
13853
13854 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13855 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
13856 filepath path;
13857 dir->make_nosnap_relative_path(path);
13858 path.push_dentry(name);
13859 req->set_filepath(path);
11fdf7f2 13860 req->set_inode(dir);
7c673cae
FG
13861
13862 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13863 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13864 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13865
13866 InodeRef in;
13867
13868 Dentry *de;
13869 int res = get_or_create(dir, name, &de);
13870 if (res < 0)
13871 goto fail;
b32b8144
FG
13872 if (op == CEPH_MDS_OP_RMDIR)
13873 req->set_dentry(de);
13874 else
13875 de->get();
13876
7c673cae
FG
13877 res = _lookup(dir, name, 0, &in, perms);
13878 if (res < 0)
13879 goto fail;
11fdf7f2
TL
13880
13881 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 13882 unlink(de, true, true);
b32b8144 13883 de->put();
7c673cae 13884 }
11fdf7f2 13885 req->set_other_inode(in.get());
7c673cae
FG
13886
13887 res = make_request(req, perms);
13888
13889 trim_cache();
1adf2230 13890 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
13891 return res;
13892
13893 fail:
13894 put_request(req);
13895 return res;
13896}
13897
/**
 * Low-level rmdir: remove directory @a name from directory @a in.
 *
 * @param in    parent directory inode
 * @param name  directory name to remove
 * @param perms caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions client-side unless FUSE does it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
13921
/**
 * Internal rename. Handles both regular renames (CEPH_MDS_OP_RENAME) and
 * snapshot renames (CEPH_MDS_OP_RENAMESNAP, when both directories are the
 * same .snap dir).
 *
 * @param fromdir        source parent directory inode
 * @param fromname       source entry name
 * @param todir          destination parent directory inode
 * @param toname         destination entry name
 * @param perm           caller credentials
 * @param alternate_name alternate name to store with the destination dentry
 * @return 0 on success, negative CEPHFS_* error code on failure
 *         (-CEPHFS_EXDEV when crossing snapshots or quota roots)
 *
 * Caller must hold client_lock.
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Cannot rename across snapshot boundaries.
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Only renaming a snapshot within the same .snap dir is allowed.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }
  // With quotas enabled, renames may not cross quota-root boundaries.
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    // Resolve the source inode; revoke delegations before it is moved.
    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; ENOENT is fine (plain rename),
    // an existing inode will be overwritten (revoke its delegations too).
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
14029
/**
 * Low-level rename: move @a name in @a parent to @a newname in
 * @a newparent.
 *
 * @param parent    source parent directory inode
 * @param name      source entry name
 * @param newparent destination parent directory inode
 * @param newname   destination entry name
 * @param perm      caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
		<< vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    // The destination may legitimately not exist yet; any error other
    // than ENOENT is fatal.
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -CEPHFS_ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm, "");
}
14061
/**
 * Internal link: build and send a CEPH_MDS_OP_LINK request creating a
 * hard link to @a in named @a newname in directory @a dir.
 *
 * @param in             existing inode to link to
 * @param dir            parent directory for the new link
 * @param newname        name of the new link
 * @param perm           caller credentials
 * @param alternate_name alternate name to store with the dentry
 * @param inp            out: ref to the linked inode
 * @return 0 on success, negative CEPHFS_* error code on failure
 *
 * Caller must hold client_lock.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Neither endpoint may live in a snapshot.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // Revoke delegations on the target before its link count changes.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);
  req->set_filepath2(existing);  // the existing inode travels as filepath2

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release it here.
  put_request(req);
  return res;
}
14107
/**
 * Low-level link: create hard link @a newname in @a newparent pointing
 * at inode @a in.
 *
 * @param in        inode to link to (directories are rejected)
 * @param newparent parent directory for the new link
 * @param newname   name of the new link
 * @param perm      caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  InodeRef target;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    // Hard links to directories are not permitted.
    if (S_ISDIR(in->mode))
      return -CEPHFS_EPERM;

    int r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, "", &target);
}
14144
14145int Client::ll_num_osds(void)
14146{
f67539c2 14147 std::scoped_lock lock(client_lock);
7c673cae
FG
14148 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14149}
14150
/**
 * Look up the address of OSD @a osd in the current OSD map.
 *
 * @param osd  OSD id
 * @param addr out: IPv4 address in host byte order
 * @return 0 on success, -1 if the OSD does not exist
 *
 * NOTE(review): only the first address is consulted and it is read via
 * in4_addr(), i.e. this is IPv4-only — confirm callers never need IPv6.
 */
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::scoped_lock lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  // Convert from network to host byte order for the caller.
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
181888fb 14168
7c673cae
FG
14169uint32_t Client::ll_stripe_unit(Inode *in)
14170{
f67539c2 14171 std::scoped_lock lock(client_lock);
7c673cae
FG
14172 return in->layout.stripe_unit;
14173}
14174
14175uint64_t Client::ll_snap_seq(Inode *in)
14176{
f67539c2 14177 std::scoped_lock lock(client_lock);
7c673cae
FG
14178 return in->snaprealm->seq;
14179}
14180
14181int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14182{
f67539c2 14183 std::scoped_lock lock(client_lock);
7c673cae
FG
14184 *layout = in->layout;
14185 return 0;
14186}
14187
/**
 * File-handle convenience overload: delegate to the inode-based
 * ll_file_layout() using the Fh's inode.
 */
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
14192
14193/* Currently we cannot take advantage of redundancy in reads, since we
14194 would have to go through all possible placement groups (a
14195 potentially quite large number determined by a hash), and use CRUSH
14196 to calculate the appropriate set of OSDs for each placement group,
14197 then index into that. An array with one entry per OSD is much more
14198 tractable and works for demonstration purposes. */
14199
/**
 * Map logical block @a blockno of @a in to the primary OSD currently
 * serving it, according to @a layout and the current OSD map.
 *
 * @param in      inode whose ino seeds the object name
 * @param blockno logical block number (in stripe units)
 * @param layout  file layout to apply (caller-provided, not in->layout)
 * @return id of the primary OSD for the containing placement group
 *
 * NOTE(review): stripe_count==0 is guarded below, but stripe_unit==0
 * would divide by zero in stripes_per_object — presumably layouts are
 * validated upstream; confirm.
 */
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  // Resolve object -> PG -> acting set, and return the primary.
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
14230
14231/* Return the offset of the block, internal to the object */
14232
14233uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14234{
f67539c2 14235 std::scoped_lock lock(client_lock);
7c673cae
FG
14236 file_layout_t *layout=&(in->layout);
14237 uint32_t object_size = layout->object_size;
14238 uint32_t su = layout->stripe_unit;
14239 uint64_t stripes_per_object = object_size / su;
14240
14241 return (blockno % stripes_per_object) * su;
14242}
14243
/**
 * Low-level opendir: open directory @a in for reading.
 *
 * @param in    directory inode
 * @param flags open flags (checked against permissions)
 * @param dirpp out: directory iteration handle
 * @param perms caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions client-side unless FUSE does it for us.
  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  // Trace the handle pointer for replay/debugging.
  tout(cct) << (uintptr_t)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
14272
14273int Client::ll_releasedir(dir_result_t *dirp)
14274{
f67539c2
TL
14275 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14276 if (!mref_reader.is_state_satisfied())
14277 return -CEPHFS_ENOTCONN;
14278
7c673cae
FG
14279 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14280 tout(cct) << "ll_releasedir" << std::endl;
f67539c2 14281 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14282
f67539c2 14283 std::scoped_lock lock(client_lock);
181888fb 14284
7c673cae
FG
14285 _closedir(dirp);
14286 return 0;
14287}
14288
14289int Client::ll_fsyncdir(dir_result_t *dirp)
14290{
f67539c2
TL
14291 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14292 if (!mref_reader.is_state_satisfied())
14293 return -CEPHFS_ENOTCONN;
14294
7c673cae
FG
14295 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14296 tout(cct) << "ll_fsyncdir" << std::endl;
f67539c2 14297 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14298
f67539c2 14299 std::scoped_lock lock(client_lock);
7c673cae
FG
14300 return _fsync(dirp->inode.get(), false);
14301}
14302
// Low-level open of an existing inode.  O_CREAT is not allowed here;
// creation goes through ll_create()/_ll_create().  On success *fhp (if
// non-NULL) receives the new file handle.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  std::scoped_lock lock(client_lock);

  int r;
  // Enforce permissions client-side when FUSE is not doing it for us.
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track any handle we created so unmount can detect unclosed handles.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (uintptr_t)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
14339
// Shared implementation behind ll_create()/ll_createx(): look the name up
// under 'parent', create it if O_CREAT applies, and open it.  On success
// *in holds the resulting inode and *fhp the open handle.  Returns 0 or a
// negative CEPHFS error.  Caller must hold client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name is an error.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  // Name absent and O_CREAT given: create it (checking create permission
  // first unless FUSE already enforces permissions).
  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  // If the file already existed we still have to check open permission
  // and (if _create did not hand us one) open a file handle.
  if (!created) {
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	// Permission denied after a handle was produced: release it.
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Any handle we produced is tracked for leak detection at unmount.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14421
14422int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14423 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14424 const UserPerm& perms)
14425{
f67539c2
TL
14426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14427 if (!mref_reader.is_state_satisfied())
14428 return -CEPHFS_ENOTCONN;
7c673cae 14429
f67539c2
TL
14430 std::scoped_lock lock(client_lock);
14431 InodeRef in;
181888fb 14432
7c673cae
FG
14433 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14434 fhp, perms);
14435 if (r >= 0) {
11fdf7f2 14436 ceph_assert(in);
7c673cae
FG
14437
14438 // passing an Inode in outp requires an additional ref
14439 if (outp) {
14440 _ll_get(in.get());
14441 *outp = in.get();
14442 }
14443 fill_stat(in, attr);
14444 } else {
14445 attr->st_ino = 0;
14446 }
14447
14448 return r;
14449}
14450
14451int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14452 int oflags, Inode **outp, Fh **fhp,
14453 struct ceph_statx *stx, unsigned want, unsigned lflags,
14454 const UserPerm& perms)
14455{
14456 unsigned caps = statx_to_mask(lflags, want);
f67539c2
TL
14457 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14458 if (!mref_reader.is_state_satisfied())
14459 return -CEPHFS_ENOTCONN;
7c673cae 14460
f67539c2
TL
14461 std::scoped_lock lock(client_lock);
14462 InodeRef in;
7c673cae
FG
14463
14464 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14465 if (r >= 0) {
11fdf7f2 14466 ceph_assert(in);
7c673cae
FG
14467
14468 // passing an Inode in outp requires an additional ref
14469 if (outp) {
14470 _ll_get(in.get());
14471 *outp = in.get();
14472 }
14473 fill_statx(in, caps, stx);
14474 } else {
14475 stx->stx_ino = 0;
14476 stx->stx_mask = 0;
14477 }
14478
14479 return r;
14480}
14481
14482loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14483{
f67539c2
TL
14484 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14485 if (!mref_reader.is_state_satisfied())
14486 return -CEPHFS_ENOTCONN;
14487
7c673cae
FG
14488 tout(cct) << "ll_lseek" << std::endl;
14489 tout(cct) << offset << std::endl;
14490 tout(cct) << whence << std::endl;
14491
f67539c2 14492 std::scoped_lock lock(client_lock);
7c673cae
FG
14493 return _lseek(fh, offset, whence);
14494}
14495
// Read up to 'len' bytes at 'off' from an open handle into 'bl'.
// Returns the number of bytes read or a negative CEPHFS error.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (uintptr_t)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  /* We can't return more bytes read than INT_MAX (the return type is
   * int), so clamp the requested length to that. */
  len = std::min(len, (loff_t)INT_MAX);
  std::scoped_lock lock(client_lock);

  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
14517
// Read one raw RADOS block of a file directly from the OSDs, bypassing
// the page cache.  On success copies the data into 'buf' and returns the
// number of bytes read; otherwise a negative error.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  // NOTE(review): the objecter read is issued without holding
  // client_lock — presumably safe because only objecter state is
  // touched; confirm before changing.
  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  int r = onfinish.wait();
  if (r >= 0) {
    // Copy the data out; the returned count may be shorter than 'length'
    // on a short read.
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
14550
14551/* It appears that the OSD doesn't return success unless the entire
14552 buffer was written, return the write length on success. */
14553
14554int Client::ll_write_block(Inode *in, uint64_t blockid,
14555 char* buf, uint64_t offset,
14556 uint64_t length, file_layout_t* layout,
14557 uint64_t snapseq, uint32_t sync)
14558{
7c673cae 14559 vinodeno_t vino = ll_get_vino(in);
7c673cae 14560 int r = 0;
11fdf7f2 14561 std::unique_ptr<C_SaferCond> onsafe = nullptr;
f67539c2
TL
14562
14563 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14564 if (!mref_reader.is_state_satisfied())
14565 return -CEPHFS_ENOTCONN;
14566
7c673cae 14567 if (length == 0) {
f67539c2 14568 return -CEPHFS_EINVAL;
7c673cae
FG
14569 }
14570 if (true || sync) {
14571 /* if write is stable, the epilogue is waiting on
14572 * flock */
11fdf7f2 14573 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
14574 }
14575 object_t oid = file_object_t(vino.ino, blockid);
14576 SnapContext fakesnap;
11fdf7f2
TL
14577 ceph::bufferlist bl;
14578 if (length > 0) {
14579 bl.push_back(buffer::copy(buf, length));
14580 }
7c673cae
FG
14581
14582 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14583 << dendl;
14584
14585 fakesnap.seq = snapseq;
14586
14587 /* lock just in time */
7c673cae
FG
14588 objecter->write(oid,
14589 object_locator_t(layout->pool_id),
14590 offset,
14591 length,
14592 fakesnap,
14593 bl,
14594 ceph::real_clock::now(),
14595 0,
11fdf7f2 14596 onsafe.get());
7c673cae 14597
11fdf7f2
TL
14598 if (nullptr != onsafe) {
14599 r = onsafe->wait();
7c673cae
FG
14600 }
14601
14602 if (r < 0) {
14603 return r;
14604 } else {
14605 return length;
14606 }
14607}
14608
// Commit previously written blocks in the given range.  The barrier
// machinery this relied on is currently disabled; the original body is
// preserved below for reference and the call is a successful no-op.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
14634
14635int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14636{
7c673cae
FG
14637 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14638 "~" << len << dendl;
14639 tout(cct) << "ll_write" << std::endl;
f67539c2 14640 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14641 tout(cct) << off << std::endl;
14642 tout(cct) << len << std::endl;
14643
f67539c2
TL
14644 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14645 if (!mref_reader.is_state_satisfied())
14646 return -CEPHFS_ENOTCONN;
181888fb 14647
11fdf7f2
TL
14648 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14649 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14650 std::scoped_lock lock(client_lock);
14651
7c673cae
FG
14652 int r = _write(fh, off, len, data, NULL, 0);
14653 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14654 << dendl;
14655 return r;
14656}
14657
11fdf7f2
TL
14658int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14659{
f67539c2
TL
14660 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14661 if (!mref_reader.is_state_satisfied())
14662 return -CEPHFS_ENOTCONN;
14663
20effc67
TL
14664 std::scoped_lock cl(client_lock);
14665 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
11fdf7f2
TL
14666}
14667
14668int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14669{
f67539c2
TL
14670 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14671 if (!mref_reader.is_state_satisfied())
14672 return -CEPHFS_ENOTCONN;
14673
20effc67
TL
14674 std::scoped_lock cl(client_lock);
14675 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
11fdf7f2
TL
14676}
14677
7c673cae
FG
14678int Client::ll_flush(Fh *fh)
14679{
f67539c2
TL
14680 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14681 if (!mref_reader.is_state_satisfied())
14682 return -CEPHFS_ENOTCONN;
14683
7c673cae
FG
14684 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14685 tout(cct) << "ll_flush" << std::endl;
f67539c2 14686 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14687
f67539c2 14688 std::scoped_lock lock(client_lock);
7c673cae
FG
14689 return _flush(fh);
14690}
14691
14692int Client::ll_fsync(Fh *fh, bool syncdataonly)
14693{
f67539c2
TL
14694 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14695 if (!mref_reader.is_state_satisfied())
14696 return -CEPHFS_ENOTCONN;
14697
7c673cae
FG
14698 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14699 tout(cct) << "ll_fsync" << std::endl;
f67539c2 14700 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14701
f67539c2 14702 std::scoped_lock lock(client_lock);
7c673cae
FG
14703 int r = _fsync(fh, syncdataonly);
14704 if (r) {
14705 // If we're returning an error, clear it from the FH
14706 fh->take_async_err();
14707 }
14708 return r;
14709}
14710
28e407b8
AA
14711int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14712{
f67539c2
TL
14713 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14714 if (!mref_reader.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN;
14716
28e407b8
AA
14717 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14718 tout(cct) << "ll_sync_inode" << std::endl;
f67539c2 14719 tout(cct) << (uintptr_t)in << std::endl;
28e407b8 14720
f67539c2 14721 std::scoped_lock lock(client_lock);
28e407b8
AA
14722 return _fsync(in, syncdataonly);
14723}
14724
7c673cae
FG
// Core fallocate implementation.  Supports plain allocation (extends the
// file size) and FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE (zeroes a
// range).  Caller must hold client_lock; it is dropped and re-acquired
// around blocking OSD waits below.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  // Only KEEP_SIZE and PUNCH_HOLE are supported, and PUNCH_HOLE is only
  // valid together with KEEP_SIZE (mirrors the Linux fallocate contract).
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // A full pool blocks allocation, but punching a hole releases space so
  // it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // A size-extending allocation must respect quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data with buffer caps: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);        // prefix before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;                 // clamp hole to inline length
        if (size > 0)
          bl.append_zero(size);                // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // suffix after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first (if needed), then zero the range on the
      // OSDs via the filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop the client lock while waiting for the zero to complete.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation past EOF: just grow the file size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If we kicked off an uninline above, wait for it (lock dropped) and
  // commit or propagate its result.
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
7c673cae 14846
11fdf7f2 14847int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 14848{
f67539c2
TL
14849 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14850 if (!mref_reader.is_state_satisfied())
14851 return -CEPHFS_ENOTCONN;
14852
11fdf7f2
TL
14853 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14854 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
f67539c2 14855 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14856
f67539c2 14857 std::scoped_lock lock(client_lock);
7c673cae
FG
14858 return _fallocate(fh, mode, offset, length);
14859}
14860
14861int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14862{
f67539c2
TL
14863 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14864 if (!mref_reader.is_state_satisfied())
14865 return -CEPHFS_ENOTCONN;
7c673cae 14866
f67539c2 14867 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
181888fb 14868
f67539c2 14869 std::scoped_lock lock(client_lock);
7c673cae
FG
14870 Fh *fh = get_filehandle(fd);
14871 if (!fh)
f67539c2 14872 return -CEPHFS_EBADF;
7c673cae
FG
14873#if defined(__linux__) && defined(O_PATH)
14874 if (fh->flags & O_PATH)
f67539c2 14875 return -CEPHFS_EBADF;
7c673cae
FG
14876#endif
14877 return _fallocate(fh, mode, offset, length);
14878}
14879
14880int Client::ll_release(Fh *fh)
14881{
f67539c2
TL
14882 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14883 if (!mref_reader.is_state_satisfied())
14884 return -CEPHFS_ENOTCONN;
91327a77 14885
11fdf7f2 14886 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 14887 dendl;
11fdf7f2 14888 tout(cct) << __func__ << " (fh)" << std::endl;
f67539c2
TL
14889 tout(cct) << (uintptr_t)fh << std::endl;
14890
14891 std::scoped_lock lock(client_lock);
7c673cae
FG
14892
14893 if (ll_unclosed_fh_set.count(fh))
14894 ll_unclosed_fh_set.erase(fh);
14895 return _release_fh(fh);
14896}
14897
14898int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14899{
f67539c2
TL
14900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14901 if (!mref_reader.is_state_satisfied())
14902 return -CEPHFS_ENOTCONN;
7c673cae
FG
14903
14904 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
f67539c2 14905 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
181888fb 14906
f67539c2 14907 std::scoped_lock lock(client_lock);
7c673cae
FG
14908 return _getlk(fh, fl, owner);
14909}
14910
14911int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14912{
f67539c2
TL
14913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14914 if (!mref_reader.is_state_satisfied())
14915 return -CEPHFS_ENOTCONN;
7c673cae 14916
11fdf7f2 14917 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14918 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14919
f67539c2 14920 std::scoped_lock lock(client_lock);
7c673cae
FG
14921 return _setlk(fh, fl, owner, sleep);
14922}
14923
14924int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14925{
f67539c2
TL
14926 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14927 if (!mref_reader.is_state_satisfied())
14928 return -CEPHFS_ENOTCONN;
7c673cae 14929
11fdf7f2 14930 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14931 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14932
f67539c2 14933 std::scoped_lock lock(client_lock);
7c673cae
FG
14934 return _flock(fh, cmd, owner);
14935}
14936
b32b8144
FG
14937int Client::set_deleg_timeout(uint32_t timeout)
14938{
f67539c2 14939 std::scoped_lock lock(client_lock);
b32b8144
FG
14940
14941 /*
f67539c2 14942 * The whole point is to prevent blocklisting so we must time out the
b32b8144
FG
14943 * delegation before the session autoclose timeout kicks in.
14944 */
14945 if (timeout >= mdsmap->get_session_autoclose())
f67539c2 14946 return -CEPHFS_EINVAL;
b32b8144
FG
14947
14948 deleg_timeout = timeout;
14949 return 0;
14950}
14951
14952int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14953{
f67539c2 14954 int ret = -CEPHFS_EINVAL;
b32b8144 14955
f67539c2
TL
14956 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14957 if (!mref_reader.is_state_satisfied())
14958 return -CEPHFS_ENOTCONN;
b32b8144 14959
f67539c2 14960 std::scoped_lock lock(client_lock);
b32b8144
FG
14961
14962 Inode *inode = fh->inode.get();
14963
14964 switch(cmd) {
14965 case CEPH_DELEGATION_NONE:
14966 inode->unset_deleg(fh);
14967 ret = 0;
14968 break;
14969 default:
14970 try {
14971 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 14972 } catch (std::bad_alloc&) {
f67539c2 14973 ret = -CEPHFS_ENOMEM;
b32b8144
FG
14974 }
14975 break;
14976 }
14977 return ret;
14978}
14979
7c673cae
FG
// Context queued on the interrupt finisher to abort an in-flight
// SETFILELOCK request.  Holds its own reference on the request until
// finish() runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    // Pin the request so it stays alive until finish() releases it.
    req->get();
  }
  void finish(int r) override {
    std::scoped_lock l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
14995
14996void Client::ll_interrupt(void *d)
14997{
14998 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
14999 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15000 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
15001 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15002}
15003
15004// =========================================
15005// layout
15006
15007// expose file layouts
15008
15009int Client::describe_layout(const char *relpath, file_layout_t *lp,
15010 const UserPerm& perms)
15011{
f67539c2
TL
15012 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15013 if (!mref_reader.is_state_satisfied())
15014 return -CEPHFS_ENOTCONN;
7c673cae 15015
f67539c2 15016 std::scoped_lock lock(client_lock);
181888fb 15017
7c673cae
FG
15018 filepath path(relpath);
15019 InodeRef in;
15020 int r = path_walk(path, &in, perms);
15021 if (r < 0)
15022 return r;
15023
15024 *lp = in->layout;
15025
11fdf7f2 15026 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
15027 return 0;
15028}
15029
15030int Client::fdescribe_layout(int fd, file_layout_t *lp)
15031{
f67539c2
TL
15032 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15033 if (!mref_reader.is_state_satisfied())
15034 return -CEPHFS_ENOTCONN;
7c673cae 15035
f67539c2 15036 std::scoped_lock lock(client_lock);
181888fb 15037
7c673cae
FG
15038 Fh *f = get_filehandle(fd);
15039 if (!f)
f67539c2 15040 return -CEPHFS_EBADF;
7c673cae
FG
15041 Inode *in = f->inode.get();
15042
15043 *lp = in->layout;
15044
11fdf7f2 15045 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
15046 return 0;
15047}
15048
d2e6a577
FG
15049int64_t Client::get_default_pool_id()
15050{
f67539c2
TL
15051 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15052 if (!mref_reader.is_state_satisfied())
15053 return -CEPHFS_ENOTCONN;
181888fb 15054
f67539c2 15055 std::scoped_lock lock(client_lock);
181888fb 15056
d2e6a577
FG
15057 /* first data pool is the default */
15058 return mdsmap->get_first_data_pool();
15059}
7c673cae
FG
15060
15061// expose osdmap
15062
15063int64_t Client::get_pool_id(const char *pool_name)
15064{
f67539c2
TL
15065 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15066 if (!mref_reader.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN;
181888fb 15068
f67539c2 15069 std::scoped_lock lock(client_lock);
181888fb 15070
7c673cae
FG
15071 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15072 pool_name);
15073}
15074
15075string Client::get_pool_name(int64_t pool)
15076{
f67539c2
TL
15077 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15078 if (!mref_reader.is_state_satisfied())
181888fb
FG
15079 return string();
15080
f67539c2
TL
15081 std::scoped_lock lock(client_lock);
15082
7c673cae
FG
15083 return objecter->with_osdmap([pool](const OSDMap& o) {
15084 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15085 });
15086}
15087
15088int Client::get_pool_replication(int64_t pool)
15089{
f67539c2
TL
15090 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15091 if (!mref_reader.is_state_satisfied())
15092 return -CEPHFS_ENOTCONN;
181888fb 15093
f67539c2 15094 std::scoped_lock lock(client_lock);
181888fb 15095
7c673cae 15096 return objecter->with_osdmap([pool](const OSDMap& o) {
f67539c2 15097 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
7c673cae
FG
15098 });
15099}
15100
// Given a file offset, return the acting OSDs for the object holding that
// byte, and (optionally, via 'len') the number of bytes remaining in the
// same stripe unit.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // Map a single byte at 'off' to its object extent; length 1 guarantees
  // exactly one extent comes back (see the comment block below).
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -CEPHFS_EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
15147
15148int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15149{
f67539c2
TL
15150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15151 if (!mref_reader.is_state_satisfied())
15152 return -CEPHFS_ENOTCONN;
181888fb 15153
f67539c2 15154 std::scoped_lock lock(client_lock);
181888fb 15155
7c673cae 15156 if (id < 0)
f67539c2 15157 return -CEPHFS_EINVAL;
7c673cae
FG
15158 return objecter->with_osdmap([&](const OSDMap& o) {
15159 return o.crush->get_full_location_ordered(id, path);
15160 });
15161}
15162
// Collect the network addresses of every acting OSD for the object that
// holds the byte at 'offset' of the given open file.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 yields exactly one extent)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  // Push the front address of every acting OSD for that object's PG.
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -CEPHFS_EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
15197
15198int Client::get_osd_addr(int osd, entity_addr_t& addr)
15199{
f67539c2
TL
15200 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15201 if (!mref_reader.is_state_satisfied())
15202 return -CEPHFS_ENOTCONN;
181888fb 15203
f67539c2 15204 std::scoped_lock lock(client_lock);
181888fb 15205
7c673cae
FG
15206 return objecter->with_osdmap([&](const OSDMap& o) {
15207 if (!o.exists(osd))
f67539c2 15208 return -CEPHFS_ENOENT;
7c673cae 15209
11fdf7f2 15210 addr = o.get_addrs(osd).front();
7c673cae
FG
15211 return 0;
15212 });
15213}
15214
15215int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15216 loff_t length, loff_t offset)
15217{
f67539c2
TL
15218 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15219 if (!mref_reader.is_state_satisfied())
15220 return -CEPHFS_ENOTCONN;
7c673cae 15221
f67539c2 15222 std::scoped_lock lock(client_lock);
181888fb 15223
7c673cae
FG
15224 Fh *f = get_filehandle(fd);
15225 if (!f)
f67539c2 15226 return -CEPHFS_EBADF;
7c673cae
FG
15227 Inode *in = f->inode.get();
15228
15229 // map to a list of extents
15230 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15231
11fdf7f2 15232 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
15233 return 0;
15234}
15235
15236
f67539c2 15237/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
7c673cae
FG
15238int Client::get_local_osd()
15239{
f67539c2
TL
15240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15241 if (!mref_reader.is_state_satisfied())
15242 return -CEPHFS_ENOTCONN;
181888fb 15243
f67539c2 15244 std::scoped_lock lock(client_lock);
181888fb 15245
7c673cae
FG
15246 objecter->with_osdmap([this](const OSDMap& o) {
15247 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 15248 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
15249 local_osd_epoch = o.get_epoch();
15250 }
15251 });
15252 return local_osd;
15253}
15254
15255
15256
15257
15258
15259
15260// ===============================
15261
// Messenger hook: a connection we initiated is now established.
// Nothing to do beyond logging; session setup happens elsewhere.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
15266
// Messenger hook: the transport to a peer was reset on our side.
// Returning false leaves reconnection to the messenger's default policy.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15272
// Messenger hook: the remote peer reset the connection.  Only MDS peers
// need handling here: find which session the connection belongs to and
// transition it according to its current state (close, retry the open, or
// mark stale / reconnect depending on "client_reconnect_stale").
void Client::ms_handle_remote_reset(Connection *con)
{
  std::scoped_lock lock(client_lock);
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSessionRef s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = p.second;
	}
      }
      if (mds >= 0) {
	ceph_assert(s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were tearing the session down anyway; treat the reset as the close.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s.get());
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Re-open from scratch, carrying the open-waiters over to the
	    // replacement session so nobody is left blocked.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s.get());
	    auto news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blocklisted */
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s.get());
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing useful to do for sessions not yet (or no longer) live.
	  break;
	}
      }
    }
    break;
  }
}
15331
// Messenger hook: the peer actively refused our connection attempt.
// Returning false lets the messenger apply its default retry behavior.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15337
7c673cae
FG
15338Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15339{
11fdf7f2
TL
15340 Inode *quota_in = root_ancestor;
15341 SnapRealm *realm = in->snaprealm;
2a845540
TL
15342
15343 if (!cct->_conf.get_val<bool>("client_quota"))
15344 return NULL;
15345
11fdf7f2
TL
15346 while (realm) {
15347 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15348 if (realm->ino != in->ino) {
15349 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15350 if (p == inode_map.end())
15351 break;
7c673cae 15352
11fdf7f2
TL
15353 if (p->second->quota.is_enable()) {
15354 quota_in = p->second;
15355 break;
7c673cae 15356 }
7c673cae 15357 }
11fdf7f2 15358 realm = realm->pparent;
7c673cae 15359 }
11fdf7f2
TL
15360 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15361 return quota_in;
7c673cae
FG
15362}
15363
15364/**
15365 * Traverse quota ancestors of the Inode, return true
15366 * if any of them passes the passed function
15367 */
15368bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15369 std::function<bool (const Inode &in)> test)
15370{
2a845540
TL
15371 if (!cct->_conf.get_val<bool>("client_quota"))
15372 return false;
15373
7c673cae 15374 while (true) {
11fdf7f2 15375 ceph_assert(in != NULL);
7c673cae
FG
15376 if (test(*in)) {
15377 return true;
15378 }
15379
15380 if (in == root_ancestor) {
15381 // We're done traversing, drop out
15382 return false;
15383 } else {
15384 // Continue up the tree
15385 in = get_quota_root(in, perms);
15386 }
15387 }
15388
15389 return false;
15390}
15391
15392bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15393{
15394 return check_quota_condition(in, perms,
15395 [](const Inode &in) {
15396 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15397 });
15398}
15399
15400bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 15401 const UserPerm& perms)
7c673cae
FG
15402{
15403 return check_quota_condition(in, perms,
11fdf7f2 15404 [&new_bytes](const Inode &in) {
7c673cae
FG
15405 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15406 > in.quota.max_bytes;
15407 });
15408}
15409
11fdf7f2 15410bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 15411{
9f95a23c
TL
15412 ceph_assert(in->size >= in->reported_size);
15413 const uint64_t size = in->size - in->reported_size;
11fdf7f2 15414 return check_quota_condition(in, perms,
9f95a23c 15415 [&size](const Inode &in) {
11fdf7f2
TL
15416 if (in.quota.max_bytes) {
15417 if (in.rstat.rbytes >= in.quota.max_bytes) {
15418 return true;
15419 }
15420
11fdf7f2 15421 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
11fdf7f2
TL
15422 return (space >> 4) < size;
15423 } else {
15424 return false;
15425 }
15426 });
7c673cae
FG
15427}
15428
// Cached pool-permission state, kept per (pool id, namespace) in pool_perms
// and consumed by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a definitive probe result is cached
  POOL_CHECKING = 2,  // probe in flight; other callers wait on it
  POOL_READ = 4,      // pool grants read access
  POOL_WRITE = 8,     // pool grants write access
};
15435
// Verify the client's OSD-level access to the data pool backing 'in' before
// doing file I/O.  'need' is a caps mask; only CEPH_CAP_FILE_RD/WR are
// inspected.  Results are cached per (pool id, pool namespace) in
// pool_perms; a probe in flight is marked POOL_CHECKING and concurrent
// callers wait on waiting_for_pool_perm.  Returns 0 on success,
// -CEPHFS_EPERM when the needed access is missing, or -CEPHFS_EIO when the
// probe itself failed for an unexpected reason.
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is probing, wait for its result.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the probe so concurrent callers block instead of re-probing.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat on the file's first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: an exclusive create (-CEPHFS_EEXIST still proves access).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while blocking on the two OSD round-trips.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Cache the definitive result and wake any waiting probers.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
15544
15545int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15546{
15547 if (acl_type == POSIX_ACL) {
15548 if (in->xattrs.count(ACL_EA_ACCESS)) {
15549 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15550
15551 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15552 }
15553 }
f67539c2 15554 return -CEPHFS_EAGAIN;
7c673cae
FG
15555}
15556
15557int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15558{
15559 if (acl_type == NO_ACL)
15560 return 0;
15561
15562 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15563 if (r < 0)
15564 goto out;
15565
15566 if (acl_type == POSIX_ACL) {
15567 if (in->xattrs.count(ACL_EA_ACCESS)) {
15568 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15569 bufferptr acl(access_acl.c_str(), access_acl.length());
15570 r = posix_acl_access_chmod(acl, mode);
15571 if (r < 0)
15572 goto out;
15573 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15574 } else {
15575 r = 0;
15576 }
15577 }
15578out:
15579 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15580 return r;
15581}
15582
// Compute the ACL xattrs a new inode created under 'dir' should carry,
// applying POSIX default-ACL inheritance to *mode.  On success returns the
// number of xattrs encoded into xattrs_bl (0 when none are needed); when
// the directory has no default ACL, the registered umask callback (if any)
// is applied to *mode instead.  Negative return is an error from _getattr
// or from ACL parsing.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Make sure the parent directory's cached xattrs are populated.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // Fold the default ACL into *mode; r > 0 means the child also needs
      // an access ACL of its own.
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// If the ACL is exactly representable as mode bits, no access-ACL
	// xattr is required (r becomes 0).
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories propagate the default ACL to their own children.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the caller-provided umask hook, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
15630
15631void Client::set_filer_flags(int flags)
15632{
f67539c2 15633 std::scoped_lock l(client_lock);
11fdf7f2 15634 ceph_assert(flags == 0 ||
7c673cae
FG
15635 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15636 objecter->add_global_op_flags(flags);
15637}
15638
15639void Client::clear_filer_flags(int flags)
15640{
f67539c2 15641 std::scoped_lock l(client_lock);
11fdf7f2 15642 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
15643 objecter->clear_global_op_flag(flags);
15644}
15645
// called before mount
// Record the uuid this client will register with the MDS (used by the
// reclaim machinery to identify a predecessor instance).  Any sessions
// opened under a previous identity are closed.
void Client::set_uuid(const std::string& uuid)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);
  ceph_assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}
15658
// called before mount. 0 means infinite
// Advertise the requested MDS session timeout through the session metadata
// that is sent when sessions are opened.
void Client::set_session_timeout(unsigned timeout)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);

  metadata["timeout"] = stringify(timeout);
}
15669
// called before mount
// Reclaim the MDS state of a previous client instance identified by 'uuid'
// (e.g. after an NFS-Ganesha failover).  Walks every in-map MDS rank,
// opening a session where needed, and issues MClientReclaim requests until
// every rank has answered.  With CEPH_RECLAIM_RESET the old state is only
// discarded; otherwise we additionally wait for the target's blocklisting
// to become visible in the osdmap before declaring the reclaim safe.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // Refuse to reclaim our own identity.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Wait until we have a usable mdsmap at all.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // 'mds' only advances once that rank has completed reclaim; every wait
  // re-enters the loop body for the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    // Reclaim needs explicit support on the MDS side.
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply().
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      // RECLAIM_OK: this rank is done; move to the next one.
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  // Remember what we are reclaiming; finish_reclaim() adopts it as "uuid".
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15770
15771void Client::finish_reclaim()
15772{
15773 auto it = metadata.find("reclaiming_uuid");
15774 if (it == metadata.end()) {
15775 for (auto &p : mds_sessions)
20effc67 15776 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
11fdf7f2
TL
15777 return;
15778 }
15779
15780 for (auto &p : mds_sessions) {
20effc67 15781 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
9f95a23c 15782 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
20effc67 15783 p.second->con->send_message2(std::move(m));
11fdf7f2
TL
15784 }
15785
15786 metadata["uuid"] = it->second;
15787 metadata.erase(it);
15788}
15789
15790void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15791{
15792 mds_rank_t from = mds_rank_t(reply->get_source().num());
15793 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15794
f67539c2 15795 std::scoped_lock cl(client_lock);
20effc67 15796 auto session = _get_mds_session(from, reply->get_connection().get());
11fdf7f2
TL
15797 if (!session) {
15798 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15799 return;
15800 }
15801
15802 if (reply->get_result() >= 0) {
15803 session->reclaim_state = MetaSession::RECLAIM_OK;
15804 if (reply->get_epoch() > reclaim_osd_epoch)
15805 reclaim_osd_epoch = reply->get_epoch();
15806 if (!reply->get_addrs().empty())
15807 reclaim_target_addrs = reply->get_addrs();
15808 } else {
15809 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15810 reclaim_errno = reply->get_result();
15811 }
15812
15813 signal_cond_list(waiting_for_reclaim);
15814}
15815
7c673cae
FG
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  // NOTE(review): no monotonicity check here — presumably callers only ever
  // pass forward-moving osdmap epochs; confirm at call sites.
  cap_epoch_barrier = e;
}
15828
15829const char** Client::get_tracked_conf_keys() const
15830{
15831 static const char* keys[] = {
15832 "client_cache_size",
15833 "client_cache_mid",
15834 "client_acl_type",
b32b8144
FG
15835 "client_deleg_timeout",
15836 "client_deleg_break_on_open",
f67539c2
TL
15837 "client_oc_size",
15838 "client_oc_max_objects",
15839 "client_oc_max_dirty",
15840 "client_oc_target_dirty",
15841 "client_oc_max_dirty_age",
2a845540
TL
15842 "client_caps_release_delay",
15843 "client_mount_timeout",
7c673cae
FG
15844 NULL
15845 };
15846 return keys;
15847}
15848
11fdf7f2 15849void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
15850 const std::set <std::string> &changed)
15851{
f67539c2 15852 std::scoped_lock lock(client_lock);
7c673cae 15853
181888fb 15854 if (changed.count("client_cache_mid")) {
7c673cae
FG
15855 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15856 }
15857 if (changed.count("client_acl_type")) {
15858 acl_type = NO_ACL;
15859 if (cct->_conf->client_acl_type == "posix_acl")
15860 acl_type = POSIX_ACL;
15861 }
f67539c2
TL
15862 if (changed.count("client_oc_size")) {
15863 objectcacher->set_max_size(cct->_conf->client_oc_size);
15864 }
15865 if (changed.count("client_oc_max_objects")) {
15866 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15867 }
15868 if (changed.count("client_oc_max_dirty")) {
15869 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15870 }
15871 if (changed.count("client_oc_target_dirty")) {
15872 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15873 }
15874 if (changed.count("client_oc_max_dirty_age")) {
15875 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15876 }
33c7a0ef
TL
15877 if (changed.count("client_collect_and_send_global_metrics")) {
15878 _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
15879 "client_collect_and_send_global_metrics");
15880 }
2a845540
TL
15881 if (changed.count("client_caps_release_delay")) {
15882 caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
15883 "client_caps_release_delay");
15884 }
15885 if (changed.count("client_mount_timeout")) {
15886 mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
15887 "client_mount_timeout");
15888 }
7c673cae
FG
15889}
15890
7c673cae
FG
// boost::intrusive_ptr support for Inode: take a reference.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
f67539c2 15895
7c673cae
FG
// boost::intrusive_ptr support for Inode: drop a reference through the
// owning client so the inode can be reaped when its count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15900
15901mds_rank_t Client::_get_random_up_mds() const
15902{
9f95a23c 15903 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
15904
15905 std::set<mds_rank_t> up;
15906 mdsmap->get_up_mds_set(up);
15907
15908 if (up.empty())
15909 return MDS_RANK_NONE;
15910 std::set<mds_rank_t>::const_iterator p = up.begin();
15911 for (int n = rand() % up.size(); n; n--)
15912 ++p;
15913 return *p;
15914}
15915
15916
f67539c2
TL
// A standalone client constructs and owns its own Objecter (released in the
// destructor), unlike embedded users that share one with the host process.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
15924
StandaloneClient::~StandaloneClient()
{
  // We own the objecter created in our constructor; release it here.
  delete objecter;
  objecter = nullptr;
}
15930
// Bring a standalone client up: initialize the objecter, register the
// dispatchers, and start the mon client.  On monclient failure the partial
// initialization (timer, objecter, object cacher) is unwound before the
// error is returned.
int StandaloneClient::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  // _finish_init() must run without client_lock held.
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
15967
void StandaloneClient::shutdown()
{
  // Tear down in reverse order of init(): generic client state first, then
  // the objecter we own, then the mon client.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}