]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph 15.2.13
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
7c673cae
FG
73#include "common/perf_counters.h"
74#include "common/admin_socket.h"
75#include "common/errno.h"
76#include "include/str_list.h"
77
78#define dout_subsys ceph_subsys_client
79
80#include "include/lru.h"
81#include "include/compat.h"
82#include "include/stringify.h"
83
84#include "Client.h"
85#include "Inode.h"
86#include "Dentry.h"
b32b8144 87#include "Delegation.h"
7c673cae
FG
88#include "Dir.h"
89#include "ClientSnapRealm.h"
90#include "Fh.h"
91#include "MetaSession.h"
92#include "MetaRequest.h"
93#include "ObjecterWriteback.h"
94#include "posix_acl.h"
95
11fdf7f2 96#include "include/ceph_assert.h"
7c673cae
FG
97#include "include/stat.h"
98
e306af50 99#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
100
101#if HAVE_GETGROUPLIST
102#include <grp.h>
103#include <pwd.h>
104#include <unistd.h>
105#endif
106
107#undef dout_prefix
108#define dout_prefix *_dout << "client." << whoami << " "
109
110#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112// FreeBSD fails to define this
113#ifndef O_DSYNC
114#define O_DSYNC 0x0
115#endif
116// Darwin fails to define this
117#ifndef O_RSYNC
118#define O_RSYNC 0x0
119#endif
120
121#ifndef O_DIRECT
122#define O_DIRECT 0x0
123#endif
124
125#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
adb31ebb
TL
127using namespace TOPNSPC::common;
128
7c673cae
FG
129void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
130{
131 Client *client = static_cast<Client*>(p);
132 client->flush_set_callback(oset);
133}
134
135
136// -------------
137
138Client::CommandHook::CommandHook(Client *client) :
139 m_client(client)
140{
141}
142
9f95a23c
TL
143int Client::CommandHook::call(
144 std::string_view command,
145 const cmdmap_t& cmdmap,
146 Formatter *f,
147 std::ostream& errss,
148 bufferlist& out)
7c673cae 149{
7c673cae 150 f->open_object_section("result");
9f95a23c
TL
151 {
152 std::lock_guard l{m_client->client_lock};
153 if (command == "mds_requests")
154 m_client->dump_mds_requests(f);
adb31ebb
TL
155 else if (command == "mds_sessions") {
156 bool cap_dump = false;
157 cmd_getval(cmdmap, "cap_dump", cap_dump);
158 m_client->dump_mds_sessions(f, cap_dump);
159 } else if (command == "dump_cache")
9f95a23c
TL
160 m_client->dump_cache(f);
161 else if (command == "kick_stale_sessions")
162 m_client->_kick_stale_sessions();
163 else if (command == "status")
164 m_client->dump_status(f);
165 else
166 ceph_abort_msg("bad command registered");
167 }
7c673cae 168 f->close_section();
9f95a23c 169 return 0;
7c673cae
FG
170}
171
172
173// -------------
174
175dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
176 : inode(in), offset(0), next_offset(2),
177 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
178 perms(perms)
179 { }
180
181void Client::_reset_faked_inos()
182{
183 ino_t start = 1024;
184 free_faked_inos.clear();
185 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
186 last_used_faked_ino = 0;
11fdf7f2 187 last_used_faked_root = 0;
7c673cae
FG
188 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
189}
190
191void Client::_assign_faked_ino(Inode *in)
192{
11fdf7f2
TL
193 if (0 == last_used_faked_ino)
194 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
195 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
196 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 197 last_used_faked_ino = 2048;
7c673cae
FG
198 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
199 }
11fdf7f2 200 ceph_assert(it != free_faked_inos.end());
7c673cae 201 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 202 ceph_assert(it.get_len() > 0);
7c673cae
FG
203 last_used_faked_ino = it.get_start();
204 } else {
205 ++last_used_faked_ino;
11fdf7f2 206 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
207 }
208 in->faked_ino = last_used_faked_ino;
209 free_faked_inos.erase(in->faked_ino);
210 faked_ino_map[in->faked_ino] = in->vino();
211}
212
11fdf7f2
TL
213/*
214 * In the faked mode, if you export multiple subdirectories,
215 * you will see that the inode numbers of the exported subdirectories
216 * are the same. so we distinguish the mount point by reserving
217 * the "fake ids" between "1024~2048" and combining the last
218 * 10bits(0x3ff) of the "root inodes".
219*/
220void Client::_assign_faked_root(Inode *in)
221{
222 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
223 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
224 last_used_faked_root = 0;
225 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
226 }
227 assert(it != free_faked_inos.end());
228 vinodeno_t inode_info = in->vino();
229 uint64_t inode_num = (uint64_t)inode_info.ino;
230 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
231 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
232 assert(it.get_start() + it.get_len() > last_used_faked_root);
233
234 in->faked_ino = last_used_faked_root;
235 free_faked_inos.erase(in->faked_ino);
236 faked_ino_map[in->faked_ino] = in->vino();
237}
238
7c673cae
FG
239void Client::_release_faked_ino(Inode *in)
240{
241 free_faked_inos.insert(in->faked_ino);
242 faked_ino_map.erase(in->faked_ino);
243}
244
245vinodeno_t Client::_map_faked_ino(ino_t ino)
246{
247 vinodeno_t vino;
248 if (ino == 1)
249 vino = root->vino();
250 else if (faked_ino_map.count(ino))
251 vino = faked_ino_map[ino];
252 else
253 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 254 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
255 return vino;
256}
257
258vinodeno_t Client::map_faked_ino(ino_t ino)
259{
11fdf7f2 260 std::lock_guard lock(client_lock);
7c673cae
FG
261 return _map_faked_ino(ino);
262}
263
264// cons/des
265
266Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
267 : Dispatcher(m->cct),
7c673cae 268 timer(m->cct, client_lock),
11fdf7f2
TL
269 messenger(m),
270 monclient(mc),
271 objecter(objecter_),
272 whoami(mc->get_global_id()),
7c673cae
FG
273 async_ino_invalidator(m->cct),
274 async_dentry_invalidator(m->cct),
275 interrupt_finisher(m->cct),
276 remount_finisher(m->cct),
e306af50 277 async_ino_releasor(m->cct),
7c673cae 278 objecter_finisher(m->cct),
11fdf7f2
TL
279 m_command_hook(this),
280 fscid(0)
7c673cae
FG
281{
282 _reset_faked_inos();
7c673cae 283
7c673cae
FG
284 user_id = cct->_conf->client_mount_uid;
285 group_id = cct->_conf->client_mount_gid;
92f5a8d4
TL
286 fuse_default_permissions = cct->_conf.get_val<bool>(
287 "fuse_default_permissions");
7c673cae 288
7c673cae
FG
289 if (cct->_conf->client_acl_type == "posix_acl")
290 acl_type = POSIX_ACL;
291
7c673cae
FG
292 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
293
294 // file handles
295 free_fd_set.insert(10, 1<<30);
296
297 mdsmap.reset(new MDSMap);
298
299 // osd interfaces
300 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
301 &client_lock));
302 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
303 client_flush_set_callback, // all commit callback
304 (void*)this,
305 cct->_conf->client_oc_size,
306 cct->_conf->client_oc_max_objects,
307 cct->_conf->client_oc_max_dirty,
308 cct->_conf->client_oc_target_dirty,
309 cct->_conf->client_oc_max_dirty_age,
310 true));
7c673cae
FG
311}
312
313
314Client::~Client()
315{
9f95a23c 316 ceph_assert(ceph_mutex_is_not_locked(client_lock));
7c673cae 317
31f18b77
FG
318 // It is necessary to hold client_lock, because any inode destruction
319 // may call into ObjectCacher, which asserts that it's lock (which is
320 // client_lock) is held.
9f95a23c 321 std::lock_guard l{client_lock};
7c673cae
FG
322 tear_down_cache();
323}
324
325void Client::tear_down_cache()
326{
327 // fd's
328 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
329 it != fd_map.end();
330 ++it) {
331 Fh *fh = it->second;
11fdf7f2 332 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
333 _release_fh(fh);
334 }
335 fd_map.clear();
336
337 while (!opened_dirs.empty()) {
338 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 339 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
340 _closedir(dirp);
341 }
342
343 // caps!
344 // *** FIXME ***
345
346 // empty lru
7c673cae 347 trim_cache();
11fdf7f2 348 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
349
350 // close root ino
11fdf7f2 351 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
352 if (root && inode_map.size() == 1 + root_parents.size()) {
353 delete root;
354 root = 0;
355 root_ancestor = 0;
356 while (!root_parents.empty())
357 root_parents.erase(root_parents.begin());
358 inode_map.clear();
359 _reset_faked_inos();
360 }
361
11fdf7f2 362 ceph_assert(inode_map.empty());
7c673cae
FG
363}
364
365inodeno_t Client::get_root_ino()
366{
11fdf7f2 367 std::lock_guard l(client_lock);
7c673cae
FG
368 if (use_faked_inos())
369 return root->faked_ino;
370 else
371 return root->ino;
372}
373
374Inode *Client::get_root()
375{
11fdf7f2 376 std::lock_guard l(client_lock);
7c673cae
FG
377 root->ll_get();
378 return root;
379}
380
381
382// debug crapola
383
384void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
385{
386 filepath path;
387 in->make_long_path(path);
388 ldout(cct, 1) << "dump_inode: "
389 << (disconnected ? "DISCONNECTED ":"")
390 << "inode " << in->ino
391 << " " << path
392 << " ref " << in->get_num_ref()
393 << *in << dendl;
394
395 if (f) {
396 f->open_object_section("inode");
397 f->dump_stream("path") << path;
398 if (disconnected)
399 f->dump_int("disconnected", 1);
400 in->dump(f);
401 f->close_section();
402 }
403
404 did.insert(in);
405 if (in->dir) {
406 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
407 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
408 it != in->dir->dentries.end();
409 ++it) {
410 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
411 if (f) {
412 f->open_object_section("dentry");
413 it->second->dump(f);
414 f->close_section();
415 }
416 if (it->second->inode)
417 dump_inode(f, it->second->inode.get(), did, false);
418 }
419 }
420}
421
422void Client::dump_cache(Formatter *f)
423{
424 set<Inode*> did;
425
11fdf7f2 426 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
427
428 if (f)
429 f->open_array_section("cache");
430
431 if (root)
432 dump_inode(f, root, did, true);
433
434 // make a second pass to catch anything disconnected
435 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
436 it != inode_map.end();
437 ++it) {
438 if (did.count(it->second))
439 continue;
440 dump_inode(f, it->second, did, true);
441 }
442
443 if (f)
444 f->close_section();
445}
446
447void Client::dump_status(Formatter *f)
448{
9f95a23c 449 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
450
451 ldout(cct, 1) << __func__ << dendl;
452
453 const epoch_t osd_epoch
454 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
455
456 if (f) {
457 f->open_object_section("metadata");
458 for (const auto& kv : metadata)
459 f->dump_string(kv.first.c_str(), kv.second);
460 f->close_section();
461
462 f->dump_int("dentry_count", lru.lru_get_size());
463 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
464 f->dump_int("id", get_nodeid().v);
11fdf7f2 465 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 466 f->dump_object("inst", inst);
11fdf7f2
TL
467 f->dump_object("addr", inst.addr);
468 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
469 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
470 f->dump_int("inode_count", inode_map.size());
471 f->dump_int("mds_epoch", mdsmap->get_epoch());
472 f->dump_int("osd_epoch", osd_epoch);
473 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f64942e4 474 f->dump_bool("blacklisted", blacklisted);
adb31ebb 475 f->dump_string("fs_name", mdsmap->get_fs_name());
7c673cae
FG
476 }
477}
478
e306af50 479void Client::_pre_init()
7c673cae
FG
480{
481 timer.init();
e306af50
TL
482
483 objecter_finisher.start();
484 filer.reset(new Filer(objecter, &objecter_finisher));
485 objecter->enable_blacklist_events();
486
7c673cae 487 objectcacher->start();
e306af50
TL
488}
489
490int Client::init()
491{
492 _pre_init();
9f95a23c
TL
493 {
494 std::lock_guard l{client_lock};
495 ceph_assert(!initialized);
496 messenger->add_dispatcher_tail(this);
497 }
7c673cae
FG
498 _finish_init();
499 return 0;
500}
501
502void Client::_finish_init()
503{
9f95a23c
TL
504 {
505 std::lock_guard l{client_lock};
506 // logger
507 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
508 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
509 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
510 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
511 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
512 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
513 logger.reset(plb.create_perf_counters());
514 cct->get_perfcounters_collection()->add(logger.get());
515 }
7c673cae 516
11fdf7f2 517 cct->_conf.add_observer(this);
7c673cae
FG
518
519 AdminSocket* admin_socket = cct->get_admin_socket();
520 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
521 &m_command_hook,
522 "show in-progress mds requests");
523 if (ret < 0) {
524 lderr(cct) << "error registering admin socket command: "
525 << cpp_strerror(-ret) << dendl;
526 }
adb31ebb
TL
527 ret = admin_socket->register_command("mds_sessions "
528 "name=cap_dump,type=CephBool,req=false",
7c673cae
FG
529 &m_command_hook,
530 "show mds session state");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
536 &m_command_hook,
537 "show in-memory metadata cache contents");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
543 &m_command_hook,
544 "kick sessions that were remote reset");
545 if (ret < 0) {
546 lderr(cct) << "error registering admin socket command: "
547 << cpp_strerror(-ret) << dendl;
548 }
549 ret = admin_socket->register_command("status",
7c673cae
FG
550 &m_command_hook,
551 "show overall client status");
552 if (ret < 0) {
553 lderr(cct) << "error registering admin socket command: "
554 << cpp_strerror(-ret) << dendl;
555 }
556
9f95a23c 557 std::lock_guard l{client_lock};
7c673cae 558 initialized = true;
7c673cae
FG
559}
560
561void Client::shutdown()
562{
11fdf7f2 563 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
564
565 // If we were not mounted, but were being used for sending
566 // MDS commands, we may have sessions that need closing.
9f95a23c
TL
567 {
568 std::lock_guard l{client_lock};
569 _close_sessions();
570 }
11fdf7f2 571 cct->_conf.remove_observer(this);
7c673cae 572
11fdf7f2 573 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
574
575 if (ino_invalidate_cb) {
576 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
577 async_ino_invalidator.wait_for_empty();
578 async_ino_invalidator.stop();
579 }
580
581 if (dentry_invalidate_cb) {
582 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
583 async_dentry_invalidator.wait_for_empty();
584 async_dentry_invalidator.stop();
585 }
586
587 if (switch_interrupt_cb) {
588 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
589 interrupt_finisher.wait_for_empty();
590 interrupt_finisher.stop();
591 }
592
593 if (remount_cb) {
594 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
595 remount_finisher.wait_for_empty();
596 remount_finisher.stop();
597 }
598
e306af50
TL
599 if (ino_release_cb) {
600 ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
601 async_ino_releasor.wait_for_empty();
602 async_ino_releasor.stop();
603 }
604
7c673cae 605 objectcacher->stop(); // outside of client_lock! this does a join.
9f95a23c
TL
606 {
607 std::lock_guard l{client_lock};
608 ceph_assert(initialized);
609 initialized = false;
610 timer.shutdown();
611 }
7c673cae
FG
612 objecter_finisher.wait_for_empty();
613 objecter_finisher.stop();
614
615 if (logger) {
616 cct->get_perfcounters_collection()->remove(logger.get());
617 logger.reset();
618 }
619}
620
621
622// ===================
623// metadata cache stuff
624
625void Client::trim_cache(bool trim_kernel_dcache)
626{
181888fb
FG
627 uint64_t max = cct->_conf->client_cache_size;
628 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
629 unsigned last = 0;
630 while (lru.lru_get_size() != last) {
631 last = lru.lru_get_size();
632
181888fb 633 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
634
635 // trim!
31f18b77 636 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
637 if (!dn)
638 break; // done
639
640 trim_dentry(dn);
641 }
642
181888fb 643 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
644 _invalidate_kernel_dcache();
645
646 // hose root?
647 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
648 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
649 delete root;
650 root = 0;
651 root_ancestor = 0;
652 while (!root_parents.empty())
653 root_parents.erase(root_parents.begin());
654 inode_map.clear();
655 _reset_faked_inos();
656 }
657}
658
659void Client::trim_cache_for_reconnect(MetaSession *s)
660{
661 mds_rank_t mds = s->mds_num;
11fdf7f2 662 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
663
664 int trimmed = 0;
665 list<Dentry*> skipped;
666 while (lru.lru_get_size() > 0) {
667 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
668 if (!dn)
669 break;
670
671 if ((dn->inode && dn->inode->caps.count(mds)) ||
672 dn->dir->parent_inode->caps.count(mds)) {
673 trim_dentry(dn);
674 trimmed++;
675 } else
676 skipped.push_back(dn);
677 }
678
679 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
680 lru.lru_insert_mid(*p);
681
11fdf7f2 682 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
683 << " trimmed " << trimmed << " dentries" << dendl;
684
685 if (s->caps.size() > 0)
686 _invalidate_kernel_dcache();
687}
688
689void Client::trim_dentry(Dentry *dn)
690{
691 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
692 << " in dir "
693 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
694 << dendl;
695 if (dn->inode) {
696 Inode *diri = dn->dir->parent_inode;
7c673cae
FG
697 clear_dir_complete_and_ordered(diri, true);
698 }
699 unlink(dn, false, false); // drop dir, drop dentry
700}
701
702
1adf2230
AA
703void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
704 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 705{
7c673cae
FG
706 uint64_t prior_size = in->size;
707
7c673cae
FG
708 if (truncate_seq > in->truncate_seq ||
709 (truncate_seq == in->truncate_seq && size > in->size)) {
710 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
711 in->size = size;
712 in->reported_size = size;
713 if (truncate_seq != in->truncate_seq) {
714 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
715 << truncate_seq << dendl;
716 in->truncate_seq = truncate_seq;
717 in->oset.truncate_seq = truncate_seq;
718
719 // truncate cached file data
720 if (prior_size > size) {
721 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
722 }
723 }
724
725 // truncate inline data
726 if (in->inline_version < CEPH_INLINE_NONE) {
727 uint32_t len = in->inline_data.length();
728 if (size < len)
729 in->inline_data.splice(size, len - size);
730 }
731 }
732 if (truncate_seq >= in->truncate_seq &&
733 in->truncate_size != truncate_size) {
734 if (in->is_file()) {
735 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
736 << truncate_size << dendl;
737 in->truncate_size = truncate_size;
738 in->oset.truncate_size = truncate_size;
739 } else {
740 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
741 }
742 }
1adf2230
AA
743}
744
745void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
746 utime_t ctime, utime_t mtime, utime_t atime)
747{
748 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
749 << " ctime " << ctime << " mtime " << mtime << dendl;
750
751 if (time_warp_seq > in->time_warp_seq)
752 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
753 << " is higher than local time_warp_seq "
754 << in->time_warp_seq << dendl;
755
756 int warn = false;
7c673cae
FG
757 // be careful with size, mtime, atime
758 if (issued & (CEPH_CAP_FILE_EXCL|
759 CEPH_CAP_FILE_WR|
760 CEPH_CAP_FILE_BUFFER|
761 CEPH_CAP_AUTH_EXCL|
762 CEPH_CAP_XATTR_EXCL)) {
763 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
764 if (ctime > in->ctime)
765 in->ctime = ctime;
766 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
767 //the mds updated times, so take those!
768 in->mtime = mtime;
769 in->atime = atime;
770 in->time_warp_seq = time_warp_seq;
771 } else if (time_warp_seq == in->time_warp_seq) {
772 //take max times
773 if (mtime > in->mtime)
774 in->mtime = mtime;
775 if (atime > in->atime)
776 in->atime = atime;
777 } else if (issued & CEPH_CAP_FILE_EXCL) {
778 //ignore mds values as we have a higher seq
779 } else warn = true;
780 } else {
781 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
782 if (time_warp_seq >= in->time_warp_seq) {
783 in->ctime = ctime;
784 in->mtime = mtime;
785 in->atime = atime;
786 in->time_warp_seq = time_warp_seq;
787 } else warn = true;
788 }
789 if (warn) {
790 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
791 << time_warp_seq << " is lower than local time_warp_seq "
792 << in->time_warp_seq
793 << dendl;
794 }
795}
796
797void Client::_fragmap_remove_non_leaves(Inode *in)
798{
799 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
800 if (!in->dirfragtree.is_leaf(p->first))
801 in->fragmap.erase(p++);
802 else
803 ++p;
804}
805
806void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
807{
808 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
809 if (p->second == mds)
810 in->fragmap.erase(p++);
811 else
812 ++p;
813}
814
815Inode * Client::add_update_inode(InodeStat *st, utime_t from,
816 MetaSession *session,
817 const UserPerm& request_perms)
818{
819 Inode *in;
820 bool was_new = false;
821 if (inode_map.count(st->vino)) {
822 in = inode_map[st->vino];
11fdf7f2 823 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
824 } else {
825 in = new Inode(this, st->vino, &st->layout);
826 inode_map[st->vino] = in;
827
828 if (use_faked_inos())
829 _assign_faked_ino(in);
830
831 if (!root) {
832 root = in;
11fdf7f2
TL
833 if (use_faked_inos())
834 _assign_faked_root(root);
7c673cae
FG
835 root_ancestor = in;
836 cwd = root;
837 } else if (!mounted) {
838 root_parents[root_ancestor] = in;
839 root_ancestor = in;
840 }
841
842 // immutable bits
843 in->ino = st->vino.ino;
844 in->snapid = st->vino.snapid;
845 in->mode = st->mode & S_IFMT;
846 was_new = true;
847 }
848
849 in->rdev = st->rdev;
850 if (in->is_symlink())
851 in->symlink = st->symlink;
852
7c673cae 853 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
854 bool new_version = false;
855 if (in->version == 0 ||
856 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
857 (in->version & ~1) < st->version))
858 new_version = true;
7c673cae 859
1adf2230
AA
860 int issued;
861 in->caps_issued(&issued);
862 issued |= in->caps_dirty();
863 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 864
1adf2230
AA
865 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
866 !(issued & CEPH_CAP_AUTH_EXCL)) {
867 in->mode = st->mode;
868 in->uid = st->uid;
869 in->gid = st->gid;
870 in->btime = st->btime;
81eedcae 871 in->snap_btime = st->snap_btime;
1adf2230 872 }
7c673cae 873
1adf2230
AA
874 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
875 !(issued & CEPH_CAP_LINK_EXCL)) {
876 in->nlink = st->nlink;
877 }
7c673cae 878
1adf2230
AA
879 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
880 update_inode_file_time(in, issued, st->time_warp_seq,
881 st->ctime, st->mtime, st->atime);
882 }
7c673cae 883
1adf2230
AA
884 if (new_version ||
885 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 886 in->layout = st->layout;
1adf2230
AA
887 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
888 }
7c673cae 889
1adf2230
AA
890 if (in->is_dir()) {
891 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
892 in->dirstat = st->dirstat;
893 }
894 // dir_layout/rstat/quota are not tracked by capability, update them only if
895 // the inode stat is from auth mds
896 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
897 in->dir_layout = st->dir_layout;
898 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
899 in->rstat = st->rstat;
900 in->quota = st->quota;
11fdf7f2 901 in->dir_pin = st->dir_pin;
1adf2230
AA
902 }
903 // move me if/when version reflects fragtree changes.
904 if (in->dirfragtree != st->dirfragtree) {
905 in->dirfragtree = st->dirfragtree;
906 _fragmap_remove_non_leaves(in);
7c673cae 907 }
7c673cae
FG
908 }
909
910 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
911 st->xattrbl.length() &&
912 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
913 auto p = st->xattrbl.cbegin();
914 decode(in->xattrs, p);
7c673cae
FG
915 in->xattr_version = st->xattr_version;
916 }
917
1adf2230
AA
918 if (st->inline_version > in->inline_version) {
919 in->inline_data = st->inline_data;
920 in->inline_version = st->inline_version;
7c673cae
FG
921 }
922
1adf2230
AA
923 /* always take a newer change attr */
924 if (st->change_attr > in->change_attr)
925 in->change_attr = st->change_attr;
926
927 if (st->version > in->version)
928 in->version = st->version;
929
930 if (was_new)
931 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
932
933 if (!st->cap.caps)
934 return in; // as with readdir returning indoes in different snaprealms (no caps!)
935
7c673cae 936 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
937 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
938 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
939 st->cap.flags, request_perms);
28e407b8 940 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 941 in->max_size = st->max_size;
28e407b8
AA
942 in->rstat = st->rstat;
943 }
7c673cae 944
1adf2230
AA
945 // setting I_COMPLETE needs to happen after adding the cap
946 if (in->is_dir() &&
947 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
948 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
949 in->dirstat.nfiles == 0 &&
950 in->dirstat.nsubdirs == 0) {
951 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
952 in->flags |= I_COMPLETE | I_DIR_ORDERED;
953 if (in->dir) {
954 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
955 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
956 in->dir->readdir_cache.clear();
957 for (const auto& p : in->dir->dentries) {
958 unlink(p.second, true, true); // keep dir, keep dentry
959 }
960 if (in->dir->dentries.empty())
961 close_dir(in->dir);
7c673cae 962 }
7c673cae 963 }
1adf2230
AA
964 } else {
965 in->snap_caps |= st->cap.caps;
7c673cae
FG
966 }
967
968 return in;
969}
970
971
 972/*
 973 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 974 */
// Returns the (possibly newly linked) dentry for 'dname' under 'dir',
// pointing at 'in'.  A stale dentry with the wrong vino is unlinked and
// relinked; 'old_dentry' (rename source) is dropped.  Parent dirs whose
// contents change lose I_COMPLETE/I_DIR_ORDERED via
// clear_dir_complete_and_ordered().  The dentry lease is refreshed last.
 975Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
 976 Inode *in, utime_t from, MetaSession *session,
 977 Dentry *old_dentry)
 978{
 979 Dentry *dn = NULL;
 980 if (dir->dentries.count(dname))
 981 dn = dir->dentries[dname];
 982
11fdf7f2 983 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
 984 << " in dir " << dir->parent_inode->vino() << " dn " << dn
 985 << dendl;
 986
 987 if (dn && dn->inode) {
 988 if (dn->inode->vino() == in->vino()) {
 989 touch_dn(dn);
 990 ldout(cct, 12) << " had dentry " << dname
 991 << " with correct vino " << dn->inode->vino()
 992 << dendl;
 993 } else {
 994 ldout(cct, 12) << " had dentry " << dname
 995 << " with WRONG vino " << dn->inode->vino()
 996 << dendl;
 997 unlink(dn, true, true); // keep dir, keep dentry
 998 }
 999 }
 1000
 1001 if (!dn || !dn->inode) {
 // Pin 'in' so the unlink below cannot drop its last reference before
 // we link it into place.
 1002 InodeRef tmp_ref(in);
 1003 if (old_dentry) {
 1004 if (old_dentry->dir != dir) {
 // Rename across directories: the source dir's ordering is stale too.
 1005 Inode *old_diri = old_dentry->dir->parent_inode;
7c673cae
FG
 1006 clear_dir_complete_and_ordered(old_diri, false);
 1007 }
 1008 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
 1009 }
 1010 Inode *diri = dir->parent_inode;
7c673cae
FG
 1011 clear_dir_complete_and_ordered(diri, false);
 1012 dn = link(dir, dname, in, dn);
 1013 }
 1014
 1015 update_dentry_lease(dn, dlease, from, session);
 1016 return dn;
 1017}
1018
// Refresh the lease state on a dentry from an MDS-provided LeaseStat.
// Only extends the lease (never shortens it), and records which mds/seq/gen
// the lease came from so later revocations can be matched up.
 1019void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
 1020{
 1021 utime_t dttl = from;
 1022 dttl += (float)dlease->duration_ms / 1000.0;
 1023
11fdf7f2 1024 ceph_assert(dn);
7c673cae 1025
9f95a23c 1026 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
 1027 if (dttl > dn->lease_ttl) {
 1028 ldout(cct, 10) << "got dentry lease on " << dn->name
 1029 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
 1030 dn->lease_ttl = dttl;
 1031 dn->lease_mds = session->mds_num;
 1032 dn->lease_seq = dlease->seq;
 1033 dn->lease_gen = session->cap_gen;
 1034 }
 1035 }
 // Remember the parent's shared gen even without a valid lease, so the
 // dentry can be validated against the dir's Fs cap later.
 1036 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
 1037 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
 1038 dn->mark_primary();
7c673cae
FG
 1039}
1040
1041
1042/*
1043 * update MDS location cache for a single inode
1044 */
1045void Client::update_dir_dist(Inode *in, DirStat *dst)
1046{
1047 // auth
1048 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1049 if (dst->auth >= 0) {
1050 in->fragmap[dst->frag] = dst->auth;
1051 } else {
1052 in->fragmap.erase(dst->frag);
1053 }
1054 if (!in->dirfragtree.is_leaf(dst->frag)) {
1055 in->dirfragtree.force_to_leaf(cct, dst->frag);
1056 _fragmap_remove_non_leaves(in);
1057 }
1058
1059 // replicated
1060 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
7c673cae
FG
1061}
1062
1063void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1064{
f91f0fd5
TL
1065 if (complete)
1066 diri->dir_release_count++;
1067 else
1068 diri->dir_ordered_count++;
7c673cae
FG
1069 if (diri->flags & I_COMPLETE) {
1070 if (complete) {
1071 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1072 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1073 } else {
1074 if (diri->flags & I_DIR_ORDERED) {
1075 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1076 diri->flags &= ~I_DIR_ORDERED;
1077 }
1078 }
1079 if (diri->dir)
1080 diri->dir->readdir_cache.clear();
1081 }
1082}
1083
 1084/*
 1085 * insert results from readdir or lssnap into the metadata cache.
 1086 */
// Decodes the reply's extra bufferlist (DirStat + dentry/lease/inode
// triples), links each entry into the dir, assigns readdir offsets, and
// opportunistically fills the shared readdir_cache when the dir has not
// changed since the readdir began.  Also advances dirp's cursor state
// (last_name / next_offset / buffer).
 1087void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
 1088
11fdf7f2 1089 auto& reply = request->reply;
7c673cae 1090 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
 1091 uint64_t features;
 // With REPLY_ENCODING the payload is versioned, so all feature bits are
 // implied; otherwise fall back to the connection's negotiated features.
 1092 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
 1093 features = (uint64_t)-1;
 1094 }
 1095 else {
 1096 features = con->get_features();
 1097 }
7c673cae
FG
 1098
 1099 dir_result_t *dirp = request->dirp;
11fdf7f2 1100 ceph_assert(dirp);
7c673cae
FG
 1101
 1102 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1103 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
 1104 if (!p.end()) {
 1105 // snapdir?
 1106 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1107 ceph_assert(diri);
7c673cae
FG
 1108 diri = open_snapdir(diri);
 1109 }
 1110
 1111 // only open dir if we're actually adding stuff to it!
 1112 Dir *dir = diri->open_dir();
11fdf7f2 1113 ceph_assert(dir);
7c673cae
FG
 1114
 1115 // dirstat
11fdf7f2 1116 DirStat dst(p, features);
7c673cae
FG
 1117 __u32 numdn;
 1118 __u16 flags;
11fdf7f2
TL
 1119 decode(numdn, p);
 1120 decode(flags, p);
7c673cae
FG
 1121
 1122 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
 1123 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
 1124
 1125 frag_t fg = (unsigned)request->head.args.readdir.frag;
 1126 unsigned readdir_offset = dirp->next_offset;
 1127 string readdir_start = dirp->last_name;
 // Offsets 0 and 1 are reserved for "." and ".."; a fresh frag starts at 2.
11fdf7f2 1128 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
 1129
 1130 unsigned last_hash = 0;
 1131 if (hash_order) {
 1132 if (!readdir_start.empty()) {
 1133 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
 1134 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
 1135 /* mds understands offset_hash */
 1136 last_hash = (unsigned)request->head.args.readdir.offset_hash;
 1137 }
 1138 }
 1139
 // The MDS may have re-fragmented; adopt the frag it actually answered for.
 1140 if (fg != dst.frag) {
 1141 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
 1142 fg = dst.frag;
 1143 if (!hash_order) {
 1144 readdir_offset = 2;
 1145 readdir_start.clear();
 1146 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
 1147 }
 1148 }
 1149
 1150 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
 1151 << ", hash_order=" << hash_order
 1152 << ", readdir_start " << readdir_start
 1153 << ", last_hash " << last_hash
 1154 << ", next_offset " << readdir_offset << dendl;
 1155
 // Starting from the very beginning of the dir: snapshot the dir's
 // generation counters so we can tell later whether the cache is usable.
 1156 if (diri->snapid != CEPH_SNAPDIR &&
 1157 fg.is_leftmost() && readdir_offset == 2 &&
 1158 !(hash_order && last_hash)) {
 1159 dirp->release_count = diri->dir_release_count;
 1160 dirp->ordered_count = diri->dir_ordered_count;
 1161 dirp->start_shared_gen = diri->shared_gen;
 1162 dirp->cache_index = 0;
 1163 }
 1164
 1165 dirp->buffer_frag = fg;
 1166
 1167 _readdir_drop_dirp_buffer(dirp);
 1168 dirp->buffer.reserve(numdn);
 1169
 1170 string dname;
 1171 LeaseStat dlease;
 1172 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
 1173 decode(dname, p);
 1174 dlease.decode(p, features);
7c673cae
FG
 1175 InodeStat ist(p, features);
 1176
 1177 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
 1178
 1179 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
 1180 request->perms);
 1181 Dentry *dn;
 1182 if (diri->dir->dentries.count(dname)) {
 1183 Dentry *olddn = diri->dir->dentries[dname];
 1184 if (olddn->inode != in) {
 1185 // replace incorrect dentry
 1186 unlink(olddn, true, true); // keep dir, dentry
 1187 dn = link(dir, dname, in, olddn);
11fdf7f2 1188 ceph_assert(dn == olddn);
7c673cae
FG
 1189 } else {
 1190 // keep existing dn
 1191 dn = olddn;
 1192 touch_dn(dn);
 1193 }
 1194 } else {
 1195 // new dn
 1196 dn = link(dir, dname, in, NULL);
 1197 }
 1198
 1199 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
 // In hash-order dirs the fpos encodes (hash, per-hash index); the index
 // restarts at 2 whenever we cross to a new hash value.
 1200 if (hash_order) {
 1201 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
 1202 if (hash != last_hash)
 1203 readdir_offset = 2;
 1204 last_hash = hash;
 1205 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
 1206 } else {
 1207 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
 1208 }
 1209 // add to readdir cache
 1210 if (dirp->release_count == diri->dir_release_count &&
 1211 dirp->ordered_count == diri->dir_ordered_count &&
 1212 dirp->start_shared_gen == diri->shared_gen) {
 1213 if (dirp->cache_index == dir->readdir_cache.size()) {
 1214 if (i == 0) {
11fdf7f2 1215 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
 1216 dir->readdir_cache.reserve(dirp->cache_index + numdn);
 1217 }
 1218 dir->readdir_cache.push_back(dn);
 1219 } else if (dirp->cache_index < dir->readdir_cache.size()) {
 1220 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1221 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
 1222 else
 1223 dir->readdir_cache[dirp->cache_index] = dn;
 1224 } else {
11fdf7f2 1225 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
 1226 }
 1227 dirp->cache_index++;
 1228 }
 1229 // add to cached result list
 1230 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
 1231 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
 1232 }
 1233
 1234 if (numdn > 0)
 1235 dirp->last_name = dname;
 1236 if (end)
 1237 dirp->next_offset = 2;
 1238 else
 1239 dirp->next_offset = readdir_offset;
 1240
 // open_dir() above may have opened an empty Dir; don't leak it.
 1241 if (dir->is_empty())
 1242 close_dir(dir);
 1243 }
 1244}
1245
 1246/** insert_trace
 1247 *
 1248 * insert a trace from a MDS reply into the cache.
 1249 */
// Returns the target inode of the reply (or NULL for unsafe/traceless
// replies).  Handles: traceless replies (invalidate affected dentries),
// the dentry+target trace, the LOOKUPSNAP/MKSNAP snapdir special case,
// and readdir payload insertion.  Also stores the target on the request.
 1250Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
 1251{
11fdf7f2 1252 auto& reply = request->reply;
7c673cae
FG
 1253 int op = request->get_op();
 1254
 1255 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
 1256 << " is_target=" << (int)reply->head.is_target
 1257 << " is_dentry=" << (int)reply->head.is_dentry
 1258 << dendl;
 1259
11fdf7f2 1260 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
 // The unsafe reply already updated the cache; the safe one carries no trace.
 1261 if (request->got_unsafe) {
 1262 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1263 ceph_assert(p.end());
7c673cae
FG
 1264 return NULL;
 1265 }
 1266
 1267 if (p.end()) {
 1268 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
 1269
 // Traceless reply: we cannot trust our cached view of the affected
 // dentry/dir anymore, so invalidate conservatively.
 1270 Dentry *d = request->dentry();
 1271 if (d) {
 1272 Inode *diri = d->dir->parent_inode;
7c673cae
FG
 1273 clear_dir_complete_and_ordered(diri, true);
 1274 }
 1275
 1276 if (d && reply->get_result() == 0) {
 1277 if (op == CEPH_MDS_OP_RENAME) {
 1278 // rename
 1279 Dentry *od = request->old_dentry();
 1280 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1281 ceph_assert(od);
7c673cae
FG
 1282 unlink(od, true, true); // keep dir, dentry
 1283 } else if (op == CEPH_MDS_OP_RMDIR ||
 1284 op == CEPH_MDS_OP_UNLINK) {
 1285 // unlink, rmdir
 1286 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
 1287 unlink(d, true, true); // keep dir, dentry
 1288 }
 1289 }
 1290 return NULL;
 1291 }
 1292
 1293 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
 1294 uint64_t features;
 // Versioned reply encoding implies all feature bits; otherwise use the
 // connection's negotiated feature set to pick decode formats.
 1295 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
 1296 features = (uint64_t)-1;
 1297 }
 1298 else {
 1299 features = con->get_features();
 1300 }
7c673cae
FG
 1301 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
 1302
 1303 // snap trace
 1304 SnapRealm *realm = NULL;
 1305 if (reply->snapbl.length())
 1306 update_snap_trace(reply->snapbl, &realm);
 1307
 1308 ldout(cct, 10) << " hrm "
 1309 << " is_target=" << (int)reply->head.is_target
 1310 << " is_dentry=" << (int)reply->head.is_dentry
 1311 << dendl;
 1312
 1313 InodeStat dirst;
 1314 DirStat dst;
 1315 string dname;
 1316 LeaseStat dlease;
 1317 InodeStat ist;
 1318
 1319 if (reply->head.is_dentry) {
 1320 dirst.decode(p, features);
11fdf7f2
TL
 1321 dst.decode(p, features);
 1322 decode(dname, p);
 1323 dlease.decode(p, features);
7c673cae
FG
 1324 }
 1325
 1326 Inode *in = 0;
 1327 if (reply->head.is_target) {
 1328 ist.decode(p, features);
 // Debug check: if we asked for xattrs, the MDS must have sent them.
 1329 if (cct->_conf->client_debug_getattr_caps) {
 1330 unsigned wanted = 0;
 1331 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
 1332 wanted = request->head.args.getattr.mask;
 1333 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
 1334 wanted = request->head.args.open.mask;
 1335
 1336 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
 1337 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1338 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
 1339 }
 1340
 1341 in = add_update_inode(&ist, request->sent_stamp, session,
 1342 request->perms);
 1343 }
 1344
 1345 Inode *diri = NULL;
 1346 if (reply->head.is_dentry) {
 1347 diri = add_update_inode(&dirst, request->sent_stamp, session,
 1348 request->perms);
 1349 update_dir_dist(diri, &dst); // dir stat info is attached to ..
 1350
 1351 if (in) {
 1352 Dir *dir = diri->open_dir();
 1353 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
 1354 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
 1355 } else {
 // Dentry without a target inode => negative dentry: drop any cached
 // inode link and (optionally) keep a null dentry with a lease.
 1356 Dentry *dn = NULL;
 1357 if (diri->dir && diri->dir->dentries.count(dname)) {
 1358 dn = diri->dir->dentries[dname];
 1359 if (dn->inode) {
7c673cae
FG
 1360 clear_dir_complete_and_ordered(diri, false);
 1361 unlink(dn, true, true); // keep dir, dentry
 1362 }
 1363 }
 1364 if (dlease.duration_ms > 0) {
 1365 if (!dn) {
 1366 Dir *dir = diri->open_dir();
 1367 dn = link(dir, dname, NULL, NULL);
 1368 }
 1369 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
 1370 }
 1371 }
 1372 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
 1373 op == CEPH_MDS_OP_MKSNAP) {
 1374 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
 1375 // fake it for snap lookup
 1376 vinodeno_t vino = ist.vino;
 1377 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1378 ceph_assert(inode_map.count(vino));
7c673cae
FG
 1379 diri = inode_map[vino];
 1380
 1381 string dname = request->path.last_dentry();
 1382
 1383 LeaseStat dlease;
 1384 dlease.duration_ms = 0;
 1385
 1386 if (in) {
 1387 Dir *dir = diri->open_dir();
 1388 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
 1389 } else {
 1390 if (diri->dir && diri->dir->dentries.count(dname)) {
 1391 Dentry *dn = diri->dir->dentries[dname];
 1392 if (dn->inode)
 1393 unlink(dn, true, true); // keep dir, dentry
 1394 }
 1395 }
 1396 }
 1397
 1398 if (in) {
 1399 if (op == CEPH_MDS_OP_READDIR ||
 1400 op == CEPH_MDS_OP_LSSNAP) {
 1401 insert_readdir_results(request, session, in);
 1402 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
 1403 // hack: return parent inode instead
 1404 in = diri;
 1405 }
 1406
 1407 if (request->dentry() == NULL && in != request->inode()) {
 1408 // pin the target inode if its parent dentry is not pinned
 1409 request->set_other_inode(in);
 1410 }
 1411 }
 1412
 1413 if (realm)
 1414 put_snap_realm(realm);
 1415
 1416 request->target = in;
 1417 return in;
 1418}
1419
1420// -------
1421
// Pick the mds rank to send a request to, in priority order: an explicit
// resend_mds, a dirfrag-hash mapping (for path operations on dirs we know
// the frag layout of), the inode's auth cap, any cap, and finally a random
// up mds.  When the choice came from the fragmap, *phash_diri is set so
// the caller can invalidate that mapping if the mds turns out stopped.
 1422mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
 1423{
 1424 mds_rank_t mds = MDS_RANK_NONE;
 1425 __u32 hash = 0;
 1426 bool is_hash = false;
 1427
 1428 Inode *in = NULL;
 1429 Dentry *de = NULL;
7c673cae
FG
 1430
 1431 if (req->resend_mds >= 0) {
 1432 mds = req->resend_mds;
 1433 req->resend_mds = -1;
11fdf7f2 1434 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
 1435 goto out;
 1436 }
 1437
 1438 if (cct->_conf->client_use_random_mds)
 1439 goto random_mds;
 1440
 1441 in = req->inode();
 1442 de = req->dentry();
 1443 if (in) {
11fdf7f2 1444 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
 1445 if (req->path.depth()) {
 1446 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1447 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
 1448 << " on " << req->path[0]
 1449 << " => " << hash << dendl;
 1450 is_hash = true;
 1451 }
 1452 } else if (de) {
 1453 if (de->inode) {
 1454 in = de->inode.get();
11fdf7f2 1455 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
 1456 } else {
 1457 in = de->dir->parent_inode;
 1458 hash = in->hash_dentry_name(de->name);
11fdf7f2 1459 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
 1460 << " on " << de->name
 1461 << " => " << hash << dendl;
 1462 is_hash = true;
 1463 }
 1464 }
 1465 if (in) {
 // Snapshot inodes have no caps of their own; walk up to the nearest
 // non-snap ancestor and route based on that.
 1466 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1467 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
 1468 while (in->snapid != CEPH_NOSNAP) {
 1469 if (in->snapid == CEPH_SNAPDIR)
 1470 in = in->snapdir_parent.get();
11fdf7f2 1471 else if (!in->dentries.empty())
7c673cae
FG
 1472 /* In most cases there will only be one dentry, so getting it
 1473 * will be the correct action. If there are multiple hard links,
 1474 * I think the MDS should be able to redirect as needed*/
 1475 in = in->get_first_parent()->dir->parent_inode;
 1476 else {
 1477 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
 1478 break;
 1479 }
 1480 }
 1481 is_hash = false;
 1482 }
 1483
11fdf7f2 1484 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
 1485 << " hash=" << hash << dendl;
 1486
 1487 if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
 1488 frag_t fg = in->dirfragtree[hash];
 1489 if (in->fragmap.count(fg)) {
 1490 mds = in->fragmap[fg];
 1491 if (phash_diri)
 1492 *phash_diri = in;
91327a77
AA
 1493 } else if (in->auth_cap) {
 1494 mds = in->auth_cap->session->mds_num;
 1495 }
 1496 if (mds >= 0) {
11fdf7f2 1497 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
 1498 goto out;
 1499 }
 1500 }
 1501
11fdf7f2
TL
 // Prefer the auth mds for ops that must go to the authority; otherwise
 // any mds we hold a cap from will do.
 1502 if (in->auth_cap && req->auth_is_best()) {
 1503 mds = in->auth_cap->session->mds_num;
 1504 } else if (!in->caps.empty()) {
 1505 mds = in->caps.begin()->second.session->mds_num;
 1506 } else {
7c673cae 1507 goto random_mds;
11fdf7f2
TL
 1508 }
 1509 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
 1510
 1511 goto out;
 1512 }
 1513
 1514random_mds:
 1515 if (mds < 0) {
 1516 mds = _get_random_up_mds();
 1517 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
 1518 }
 1519
 1520out:
 1521 ldout(cct, 20) << "mds is " << mds << dendl;
 1522 return mds;
 1523}
1524
1525
1526void Client::connect_mds_targets(mds_rank_t mds)
1527{
11fdf7f2
TL
1528 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1529 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1530 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1531 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1532 q != info.export_targets.end();
1533 ++q) {
1534 if (mds_sessions.count(*q) == 0 &&
1535 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1536 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1537 << " export target mds." << *q << dendl;
1538 _open_mds_session(*q);
1539 }
1540 }
1541}
1542
adb31ebb 1543void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1544{
1545 f->dump_int("id", get_nodeid().v);
11fdf7f2 1546 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1547 f->dump_object("inst", inst);
1548 f->dump_stream("inst_str") << inst;
1549 f->dump_stream("addr_str") << inst.addr;
7c673cae 1550 f->open_array_section("sessions");
11fdf7f2 1551 for (const auto &p : mds_sessions) {
7c673cae 1552 f->open_object_section("session");
adb31ebb 1553 p.second.dump(f, cap_dump);
7c673cae
FG
1554 f->close_section();
1555 }
1556 f->close_section();
1557 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1558}
1559void Client::dump_mds_requests(Formatter *f)
1560{
1561 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1562 p != mds_requests.end();
1563 ++p) {
1564 f->open_object_section("request");
1565 p->second->dump(f);
1566 f->close_section();
1567 }
1568}
1569
// After a successful reply, resolve the target inode for the caller:
// decode a created-ino from the reply's extra payload (plain u64, or
// openc_response_t when the mds supports delegated inos), and if the
// reply carried no trace, fall back to a lookup/getattr to find the
// inode we just created or operated on.  Returns r, or -EINTR when the
// created ino no longer matches what the follow-up lookup found.
9f95a23c 1570int Client::verify_reply_trace(int r, MetaSession *session,
11fdf7f2 1571 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
 1572 InodeRef *ptarget, bool *pcreated,
 1573 const UserPerm& perms)
 1574{
 1575 // check whether this request actually did the create, and set created flag
 1576 bufferlist extra_bl;
 1577 inodeno_t created_ino;
 1578 bool got_created_ino = false;
 1579 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
 1580
11fdf7f2 1581 extra_bl = reply->get_extra_bl();
7c673cae 1582 if (extra_bl.length() >= 8) {
9f95a23c
TL
 1583 if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
 1584 struct openc_response_t ocres;
 1585
 1586 decode(ocres, extra_bl);
 1587 created_ino = ocres.created_ino;
 1588 /*
 1589 * The userland cephfs client doesn't have a way to do an async create
 1590 * (yet), so just discard delegated_inos for now. Eventually we should
 1591 * store them and use them in create calls, even if they are synchronous,
 1592 * if only for testing purposes.
 1593 */
 1594 ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
 1595 } else {
 1596 // u64 containing number of created ino
 1597 decode(created_ino, extra_bl);
 1598 }
7c673cae 1599 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
9f95a23c 1600 got_created_ino = true;
7c673cae
FG
 1601 }
 1602
 1603 if (pcreated)
 1604 *pcreated = got_created_ino;
 1605
 1606 if (request->target) {
 1607 *ptarget = request->target;
 1608 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
 1609 } else {
 // No trace: try the created ino in our cache first, then look up by name.
 1610 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
 1611 (*ptarget) = p->second;
 1612 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
 1613 } else {
 1614 // we got a traceless reply, and need to look up what we just
 1615 // created. for now, do this by name. someday, do this by the
 1616 // ino... which we know! FIXME.
 1617 InodeRef target;
 1618 Dentry *d = request->dentry();
 1619 if (d) {
 1620 if (d->dir) {
 1621 ldout(cct, 10) << "make_request got traceless reply, looking up #"
 1622 << d->dir->parent_inode->ino << "/" << d->name
 1623 << " got_ino " << got_created_ino
 1624 << " ino " << created_ino
 1625 << dendl;
 1626 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
 1627 &target, perms);
 1628 } else {
 1629 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1630 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
 1631 }
 1632 } else {
 1633 Inode *in = request->inode();
 1634 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
 1635 << in->ino << dendl;
 1636 r = _getattr(in, request->regetattr_mask, perms, true);
 1637 target = in;
 1638 }
 1639 if (r >= 0) {
 1640 // verify ino returned in reply and trace_dist are the same
 1641 if (got_created_ino &&
 1642 created_ino.val != target->ino.val) {
 1643 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
 1644 r = -EINTR;
 1645 }
 1646 if (ptarget)
 1647 ptarget->swap(target);
 1648 }
 1649 }
 1650 }
 1651
 1652 return r;
 1653}
1654
1655
 1656/**
 1657 * make a request
 1658 *
 1659 * Blocking helper to make an MDS request.
 1660 *
 1661 * If the ptarget flag is set, behavior changes slightly: the caller
 1662 * expects to get a pointer to the inode we are creating or operating
 1663 * on. As a result, we will follow up any traceless mutation reply
 1664 * with a getattr or lookup to transparently handle a traceless reply
 1665 * from the MDS (as when the MDS restarts and the client has to replay
 1666 * a request).
 1667 *
 1668 * @param request the MetaRequest to execute
 1669 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 1670 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 1671 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 1672 * @param use_mds [optional] prefer a specific mds (-1 for default)
 1673 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 1674 */
// NOTE(review): appears to be called with client_lock already held — the
// condvar wait below adopts that lock; confirm against callers.
 1675int Client::make_request(MetaRequest *request,
 1676 const UserPerm& perms,
 1677 InodeRef *ptarget, bool *pcreated,
 1678 mds_rank_t use_mds,
 1679 bufferlist *pdirbl)
 1680{
 1681 int r = 0;
 1682
 1683 // assign a unique tid
 1684 ceph_tid_t tid = ++last_tid;
 1685 request->set_tid(tid);
 1686
 1687 // and timestamp
 1688 request->op_stamp = ceph_clock_now();
 1689
 1690 // make note
 1691 mds_requests[tid] = request->get();
 // SETFILELOCK requests are excluded from oldest-tid tracking (they can
 // block indefinitely and would pin the MDS's completed-request list).
 1692 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
 1693 oldest_tid = tid;
 1694
 1695 request->set_caller_perms(perms);
 1696
 1697 if (cct->_conf->client_inject_fixed_oldest_tid) {
 1698 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
 1699 request->set_oldest_client_tid(1);
 1700 } else {
 1701 request->set_oldest_client_tid(oldest_tid);
 1702 }
 1703
 1704 // hack target mds?
 1705 if (use_mds >= 0)
 1706 request->resend_mds = use_mds;
 1707
9f95a23c 1708 MetaSession *session = NULL;
7c673cae
FG
 // Retry loop: pick an mds, ensure a session, send, and wait; loop again
 // on forward/kick until we get a reply or the request aborts.
 1709 while (1) {
 1710 if (request->aborted())
 1711 break;
 1712
31f18b77
FG
 1713 if (blacklisted) {
 1714 request->abort(-EBLACKLISTED);
 1715 break;
 1716 }
 1717
7c673cae 1718 // set up wait cond
9f95a23c 1719 ceph::condition_variable caller_cond;
7c673cae
FG
 1720 request->caller_cond = &caller_cond;
 1721
 1722 // choose mds
 1723 Inode *hash_diri = NULL;
 1724 mds_rank_t mds = choose_target_mds(request, &hash_diri);
 1725 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
 1726 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
 1727 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
 1728 if (hash_diri) {
 1729 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
 1730 _fragmap_remove_stopped_mds(hash_diri, mds);
 1731 } else {
 1732 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
 1733 request->resend_mds = _get_random_up_mds();
 1734 }
 1735 } else {
 1736 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
 1737 wait_on_list(waiting_for_mdsmap);
 1738 }
 1739 continue;
 1740 }
 1741
 1742 // open a session?
7c673cae
FG
 1743 if (!have_open_session(mds)) {
 1744 session = _get_or_open_mds_session(mds);
f6b5b4d7
TL
 1745 if (session->state == MetaSession::STATE_REJECTED) {
 1746 request->abort(-EPERM);
 1747 break;
 1748 }
7c673cae
FG
 1749 // wait
 1750 if (session->state == MetaSession::STATE_OPENING) {
 1751 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
 1752 wait_on_context_list(session->waiting_for_open);
7c673cae
FG
 1753 continue;
 1754 }
 1755
 1756 if (!have_open_session(mds))
 1757 continue;
 1758 } else {
11fdf7f2 1759 session = &mds_sessions.at(mds);
7c673cae
FG
 1760 }
 1761
 1762 // send request.
 1763 send_request(request, session);
 1764
 1765 // wait for signal
 1766 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
 1767 request->kick = false;
9f95a23c
TL
 // Adopt the already-held client_lock for the condvar wait, then
 // release() so the unique_lock does not unlock it on scope exit.
 1768 std::unique_lock l{client_lock, std::adopt_lock};
 1769 caller_cond.wait(l, [request] {
 1770 return (request->reply || // reply
 1771 request->resend_mds >= 0 || // forward
 1772 request->kick);
 1773 });
 1774 l.release();
 1775 request->caller_cond = nullptr;
7c673cae
FG
 1776
 1777 // did we get a reply?
 1778 if (request->reply)
 1779 break;
 1780 }
 1781
 1782 if (!request->reply) {
11fdf7f2
TL
 1783 ceph_assert(request->aborted());
 1784 ceph_assert(!request->got_unsafe);
7c673cae
FG
 1785 r = request->get_abort_code();
 1786 request->item.remove_myself();
 1787 unregister_request(request);
11fdf7f2 1788 put_request(request);
7c673cae
FG
 1789 return r;
 1790 }
 1791
 1792 // got it!
11fdf7f2 1793 auto reply = std::move(request->reply);
7c673cae
FG
 1794 r = reply->get_result();
 1795 if (r >= 0)
 1796 request->success = true;
 1797
 1798 // kick dispatcher (we've got it!)
11fdf7f2 1799 ceph_assert(request->dispatch_cond);
9f95a23c 1800 request->dispatch_cond->notify_all();
7c673cae
FG
 1801 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
 1802 request->dispatch_cond = 0;
 1803
 1804 if (r >= 0 && ptarget)
9f95a23c 1805 r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);
7c673cae
FG
 1806
 1807 if (pdirbl)
11fdf7f2 1808 *pdirbl = reply->get_extra_bl();
7c673cae
FG
 1809
 1810 // -- log times --
 1811 utime_t lat = ceph_clock_now();
 1812 lat -= request->sent_stamp;
 1813 ldout(cct, 20) << "lat " << lat << dendl;
 1814 logger->tinc(l_c_lat, lat);
 1815 logger->tinc(l_c_reply, lat);
 1816
 1817 put_request(request);
7c673cae
FG
 1818 return r;
 1819}
1820
1821void Client::unregister_request(MetaRequest *req)
1822{
1823 mds_requests.erase(req->tid);
1824 if (req->tid == oldest_tid) {
1825 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1826 while (true) {
1827 if (p == mds_requests.end()) {
1828 oldest_tid = 0;
1829 break;
1830 }
1831 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1832 oldest_tid = p->first;
1833 break;
1834 }
1835 ++p;
1836 }
1837 }
1838 put_request(req);
1839}
1840
1841void Client::put_request(MetaRequest *request)
1842{
1843 if (request->_put()) {
1844 int op = -1;
1845 if (request->success)
1846 op = request->get_op();
1847 InodeRef other_in;
1848 request->take_other_inode(&other_in);
1849 delete request;
1850
1851 if (other_in &&
1852 (op == CEPH_MDS_OP_RMDIR ||
1853 op == CEPH_MDS_OP_RENAME ||
1854 op == CEPH_MDS_OP_RMSNAP)) {
1855 _try_to_trim_inode(other_in.get(), false);
1856 }
1857 }
1858}
1859
// Append a cap release for 'in' to req->cap_releases if we can drop the
// 'drop' caps toward the given mds ('unless' bits veto the drop; 'force'
// appends a release even when nothing is dropped).  Returns nonzero when
// a release entry was appended — encode_dentry_release() relies on that
// entry being cap_releases.back().
 1860int Client::encode_inode_release(Inode *in, MetaRequest *req,
 1861 mds_rank_t mds, int drop,
 1862 int unless, int force)
 1863{
11fdf7f2 1864 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
7c673cae 1865 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1911f103 1866 << ", force:" << force << ")" << dendl;
7c673cae 1867 int released = 0;
11fdf7f2
TL
 1868 auto it = in->caps.find(mds);
 1869 if (it != in->caps.end()) {
 1870 Cap &cap = it->second;
 // Never drop caps that are dirty or currently in use.
7c673cae 1871 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
 1872 if ((drop & cap.issued) &&
 1873 !(unless & cap.issued)) {
1911f103 1874 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
11fdf7f2
TL
 1875 cap.issued &= ~drop;
 1876 cap.implemented &= ~drop;
7c673cae 1877 released = 1;
7c673cae
FG
 1878 } else {
 1879 released = force;
 1880 }
 1881 if (released) {
1911f103
TL
 1882 cap.wanted = in->caps_wanted();
 1883 if (&cap == in->auth_cap &&
 1884 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
 1885 in->requested_max_size = 0;
 1886 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
 1887 }
7c673cae
FG
 1888 ceph_mds_request_release rel;
 1889 rel.ino = in->ino;
11fdf7f2
TL
 1890 rel.cap_id = cap.cap_id;
 1891 rel.seq = cap.seq;
 1892 rel.issue_seq = cap.issue_seq;
 1893 rel.mseq = cap.mseq;
 1894 rel.caps = cap.implemented;
 1895 rel.wanted = cap.wanted;
7c673cae
FG
 1896 rel.dname_len = 0;
 1897 rel.dname_seq = 0;
 1898 req->cap_releases.push_back(MClientRequest::Release(rel,""));
 1899 }
 1900 }
11fdf7f2 1901 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
 1902 << released << dendl;
 1903 return released;
 1904}
1905
1906void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1907 mds_rank_t mds, int drop, int unless)
1908{
11fdf7f2 1909 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
1910 << dn << ")" << dendl;
1911 int released = 0;
1912 if (dn->dir)
1913 released = encode_inode_release(dn->dir->parent_inode, req,
1914 mds, drop, unless, 1);
1915 if (released && dn->lease_mds == mds) {
1916 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
11fdf7f2 1917 auto& rel = req->cap_releases.back();
7c673cae
FG
1918 rel.item.dname_len = dn->name.length();
1919 rel.item.dname_seq = dn->lease_seq;
1920 rel.dname = dn->name;
adb31ebb 1921 dn->lease_mds = -1;
7c673cae 1922 }
11fdf7f2 1923 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
1924 << dn << ")" << dendl;
1925}
1926
1927
 1928/*
 1929 * This requires the MClientRequest *request member to be set.
 1930 * It will error out horribly without one.
 1931 * Additionally, if you set any *drop member, you'd better have
 1932 * set the corresponding dentry!
 1933 */
// Walks every *_drop field the caller set on the request and appends the
// corresponding inode/dentry cap releases for the target mds.
 1934void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
 1935{
11fdf7f2 1936 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
 1937 << req << ", mds: " << mds << ")" << dendl;
 1938 if (req->inode_drop && req->inode())
 1939 encode_inode_release(req->inode(), req,
 1940 mds, req->inode_drop,
 1941 req->inode_unless);
 1942
 1943 if (req->old_inode_drop && req->old_inode())
 1944 encode_inode_release(req->old_inode(), req,
 1945 mds, req->old_inode_drop,
 1946 req->old_inode_unless);
 1947 if (req->other_inode_drop && req->other_inode())
 1948 encode_inode_release(req->other_inode(), req,
 1949 mds, req->other_inode_drop,
 1950 req->other_inode_unless);
 1951
 // Dentry releases come last; each one annotates the release entry its
 // own encode_inode_release() call just appended.
 1952 if (req->dentry_drop && req->dentry())
 1953 encode_dentry_release(req->dentry(), req,
 1954 mds, req->dentry_drop,
 1955 req->dentry_unless);
 1956
 1957 if (req->old_dentry_drop && req->old_dentry())
 1958 encode_dentry_release(req->old_dentry(), req,
 1959 mds, req->old_dentry_drop,
 1960 req->old_dentry_unless);
11fdf7f2 1961 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
 1962 << req << ", mds " << mds <<dendl;
 1963}
1964
1965bool Client::have_open_session(mds_rank_t mds)
1966{
11fdf7f2
TL
1967 const auto &it = mds_sessions.find(mds);
1968 return it != mds_sessions.end() &&
1969 (it->second.state == MetaSession::STATE_OPEN ||
1970 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1971}
1972
1973MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1974{
11fdf7f2
TL
1975 const auto &it = mds_sessions.find(mds);
1976 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1977 return NULL;
11fdf7f2
TL
1978 } else {
1979 return &it->second;
1980 }
7c673cae
FG
1981}
1982
1983MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1984{
11fdf7f2
TL
1985 auto it = mds_sessions.find(mds);
1986 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1987}
1988
1989/**
1990 * Populate a map of strings with client-identifying metadata,
1991 * such as the hostname. Call this once at initialization.
1992 */
1993void Client::populate_metadata(const std::string &mount_root)
1994{
1995 // Hostname
1996 struct utsname u;
1997 int r = uname(&u);
1998 if (r >= 0) {
1999 metadata["hostname"] = u.nodename;
2000 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2001 } else {
2002 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2003 }
2004
2005 metadata["pid"] = stringify(getpid());
2006
2007 // Ceph entity id (the '0' in "client.0")
2008 metadata["entity_id"] = cct->_conf->name.get_id();
2009
2010 // Our mount position
2011 if (!mount_root.empty()) {
2012 metadata["root"] = mount_root;
2013 }
2014
2015 // Ceph version
2016 metadata["ceph_version"] = pretty_version_to_str();
2017 metadata["ceph_sha1"] = git_version_to_str();
2018
2019 // Apply any metadata from the user's configured overrides
2020 std::vector<std::string> tokens;
2021 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2022 for (const auto &i : tokens) {
2023 auto eqpos = i.find("=");
2024 // Throw out anything that isn't of the form "<str>=<str>"
2025 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2026 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2027 continue;
2028 }
2029 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2030 }
2031}
2032
2033/**
2034 * Optionally add or override client metadata fields.
2035 */
2036void Client::update_metadata(std::string const &k, std::string const &v)
2037{
11fdf7f2
TL
2038 std::lock_guard l(client_lock);
2039 ceph_assert(initialized);
7c673cae 2040
11fdf7f2
TL
2041 auto it = metadata.find(k);
2042 if (it != metadata.end()) {
7c673cae 2043 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2044 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2045 }
2046
2047 metadata[k] = v;
2048}
2049
2050MetaSession *Client::_open_mds_session(mds_rank_t mds)
2051{
11fdf7f2
TL
2052 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2053 auto addrs = mdsmap->get_addrs(mds);
2054 auto em = mds_sessions.emplace(std::piecewise_construct,
2055 std::forward_as_tuple(mds),
2056 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2057 ceph_assert(em.second); /* not already present */
2058 MetaSession *session = &em.first->second;
7c673cae 2059
9f95a23c 2060 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
11fdf7f2
TL
2061 m->metadata = metadata;
2062 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2063 session->con->send_message2(std::move(m));
7c673cae
FG
2064 return session;
2065}
2066
2067void Client::_close_mds_session(MetaSession *s)
2068{
11fdf7f2 2069 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2070 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2071 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2072}
2073
f6b5b4d7 2074void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
7c673cae 2075{
11fdf7f2 2076 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
f6b5b4d7
TL
2077 if (rejected && s->state != MetaSession::STATE_CLOSING)
2078 s->state = MetaSession::STATE_REJECTED;
2079 else
2080 s->state = MetaSession::STATE_CLOSED;
7c673cae
FG
2081 s->con->mark_down();
2082 signal_context_list(s->waiting_for_open);
9f95a23c 2083 mount_cond.notify_all();
f6b5b4d7 2084 remove_session_caps(s, err);
7c673cae 2085 kick_requests_closed(s);
f6b5b4d7
TL
2086 mds_ranks_closing.erase(s->mds_num);
2087 if (s->state == MetaSession::STATE_CLOSED)
2088 mds_sessions.erase(s->mds_num);
7c673cae
FG
2089}
2090
11fdf7f2 2091void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2092{
2093 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2094 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae
FG
2095
2096 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2097 if (!session) {
2098 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2099 return;
2100 }
2101
2102 switch (m->get_op()) {
2103 case CEPH_SESSION_OPEN:
11fdf7f2
TL
2104 {
2105 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2106 missing_features -= m->supported_features;
2107 if (!missing_features.empty()) {
2108 lderr(cct) << "mds." << from << " lacks required features '"
2109 << missing_features << "', closing session " << dendl;
11fdf7f2 2110 _close_mds_session(session);
f6b5b4d7 2111 _closed_mds_session(session, -EPERM, true);
11fdf7f2
TL
2112 break;
2113 }
2114 session->mds_features = std::move(m->supported_features);
2115
2116 renew_caps(session);
2117 session->state = MetaSession::STATE_OPEN;
2118 if (unmounting)
9f95a23c 2119 mount_cond.notify_all();
11fdf7f2
TL
2120 else
2121 connect_mds_targets(from);
2122 signal_context_list(session->waiting_for_open);
2123 break;
2124 }
7c673cae
FG
2125
2126 case CEPH_SESSION_CLOSE:
2127 _closed_mds_session(session);
2128 break;
2129
2130 case CEPH_SESSION_RENEWCAPS:
2131 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2132 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2133 session->cap_ttl =
2134 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298
TL
2135 if (was_stale)
2136 wake_up_session_caps(session, false);
7c673cae
FG
2137 }
2138 break;
2139
2140 case CEPH_SESSION_STALE:
28e407b8
AA
2141 // invalidate session caps/leases
2142 session->cap_gen++;
2143 session->cap_ttl = ceph_clock_now();
2144 session->cap_ttl -= 1;
7c673cae
FG
2145 renew_caps(session);
2146 break;
2147
2148 case CEPH_SESSION_RECALL_STATE:
2149 trim_caps(session, m->get_max_caps());
2150 break;
2151
2152 case CEPH_SESSION_FLUSHMSG:
a8e16298 2153 /* flush cap release */
11fdf7f2
TL
2154 if (auto& m = session->release; m) {
2155 session->con->send_message2(std::move(m));
a8e16298 2156 }
9f95a23c 2157 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2158 break;
2159
2160 case CEPH_SESSION_FORCE_RO:
2161 force_session_readonly(session);
2162 break;
2163
2164 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2165 {
2166 std::string_view error_str;
2167 auto it = m->metadata.find("error_string");
2168 if (it != m->metadata.end())
2169 error_str = it->second;
2170 else
2171 error_str = "unknown error";
2172 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2173
f6b5b4d7 2174 _closed_mds_session(session, -EPERM, true);
11fdf7f2 2175 }
7c673cae
FG
2176 break;
2177
2178 default:
2179 ceph_abort();
2180 }
7c673cae
FG
2181}
2182
2183bool Client::_any_stale_sessions() const
2184{
9f95a23c 2185 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2186
11fdf7f2
TL
2187 for (const auto &p : mds_sessions) {
2188 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2189 return true;
2190 }
2191 }
2192
2193 return false;
2194}
2195
2196void Client::_kick_stale_sessions()
2197{
11fdf7f2 2198 ldout(cct, 1) << __func__ << dendl;
7c673cae 2199
11fdf7f2
TL
2200 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2201 MetaSession &s = it->second;
f6b5b4d7
TL
2202 if (s.state == MetaSession::STATE_REJECTED) {
2203 mds_sessions.erase(it++);
2204 continue;
2205 }
11fdf7f2
TL
2206 ++it;
2207 if (s.state == MetaSession::STATE_STALE)
2208 _closed_mds_session(&s);
7c673cae
FG
2209 }
2210}
2211
2212void Client::send_request(MetaRequest *request, MetaSession *session,
2213 bool drop_cap_releases)
2214{
2215 // make the request
2216 mds_rank_t mds = session->mds_num;
11fdf7f2 2217 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
7c673cae 2218 << " for mds." << mds << dendl;
11fdf7f2 2219 auto r = build_client_request(request);
7c673cae
FG
2220 if (request->dentry()) {
2221 r->set_dentry_wanted();
2222 }
2223 if (request->got_unsafe) {
2224 r->set_replayed_op();
2225 if (request->target)
2226 r->head.ino = request->target->ino;
2227 } else {
2228 encode_cap_releases(request, mds);
2229 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2230 request->cap_releases.clear();
2231 else
2232 r->releases.swap(request->cap_releases);
2233 }
2234 r->set_mdsmap_epoch(mdsmap->get_epoch());
2235 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2236 objecter->with_osdmap([r](const OSDMap& o) {
2237 r->set_osdmap_epoch(o.get_epoch());
2238 });
2239 }
2240
2241 if (request->mds == -1) {
2242 request->sent_stamp = ceph_clock_now();
11fdf7f2 2243 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
7c673cae
FG
2244 }
2245 request->mds = mds;
2246
2247 Inode *in = request->inode();
11fdf7f2
TL
2248 if (in) {
2249 auto it = in->caps.find(mds);
2250 if (it != in->caps.end()) {
2251 request->sent_on_mseq = it->second.mseq;
2252 }
2253 }
7c673cae
FG
2254
2255 session->requests.push_back(&request->item);
2256
11fdf7f2
TL
2257 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2258 session->con->send_message2(std::move(r));
7c673cae
FG
2259}
2260
9f95a23c 2261ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
7c673cae 2262{
9f95a23c 2263 auto req = make_message<MClientRequest>(request->get_op());
7c673cae
FG
2264 req->set_tid(request->tid);
2265 req->set_stamp(request->op_stamp);
2266 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2267
2268 // if the filepath's haven't been set, set them!
2269 if (request->path.empty()) {
2270 Inode *in = request->inode();
2271 Dentry *de = request->dentry();
2272 if (in)
2273 in->make_nosnap_relative_path(request->path);
2274 else if (de) {
2275 if (de->inode)
2276 de->inode->make_nosnap_relative_path(request->path);
2277 else if (de->dir) {
2278 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2279 request->path.push_dentry(de->name);
2280 }
2281 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2282 << " No path, inode, or appropriately-endowed dentry given!"
2283 << dendl;
2284 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2285 << " No path, inode, or dentry given!"
2286 << dendl;
2287 }
2288 req->set_filepath(request->get_filepath());
2289 req->set_filepath2(request->get_filepath2());
2290 req->set_data(request->data);
2291 req->set_retry_attempt(request->retry_attempt++);
2292 req->head.num_fwd = request->num_fwd;
2293 const gid_t *_gids;
2294 int gid_count = request->perms.get_gids(&_gids);
2295 req->set_gid_list(gid_count, _gids);
2296 return req;
2297}
2298
2299
2300
11fdf7f2 2301void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2302{
2303 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2304 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2305 if (!session) {
7c673cae
FG
2306 return;
2307 }
2308 ceph_tid_t tid = fwd->get_tid();
2309
2310 if (mds_requests.count(tid) == 0) {
11fdf7f2 2311 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2312 return;
2313 }
2314
2315 MetaRequest *request = mds_requests[tid];
11fdf7f2 2316 ceph_assert(request);
7c673cae
FG
2317
2318 // reset retry counter
2319 request->retry_attempt = 0;
2320
2321 // request not forwarded, or dest mds has no session.
2322 // resend.
11fdf7f2 2323 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2324 << " fwd " << fwd->get_num_fwd()
2325 << " to mds." << fwd->get_dest_mds()
2326 << ", resending to " << fwd->get_dest_mds()
2327 << dendl;
2328
2329 request->mds = -1;
2330 request->item.remove_myself();
2331 request->num_fwd = fwd->get_num_fwd();
2332 request->resend_mds = fwd->get_dest_mds();
9f95a23c 2333 request->caller_cond->notify_all();
7c673cae
FG
2334}
2335
2336bool Client::is_dir_operation(MetaRequest *req)
2337{
2338 int op = req->get_op();
2339 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2340 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2341 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2342 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2343 return true;
2344 return false;
2345}
2346
11fdf7f2 2347void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2348{
2349 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2350 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2351 if (!session) {
7c673cae
FG
2352 return;
2353 }
2354
2355 ceph_tid_t tid = reply->get_tid();
2356 bool is_safe = reply->is_safe();
2357
2358 if (mds_requests.count(tid) == 0) {
11fdf7f2 2359 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2360 << " safe is:" << is_safe << dendl;
7c673cae
FG
2361 return;
2362 }
2363 MetaRequest *request = mds_requests.at(tid);
2364
11fdf7f2 2365 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2366 << " tid " << tid << dendl;
2367
2368 if (request->got_unsafe && !is_safe) {
2369 //duplicate response
2370 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2371 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2372 return;
2373 }
2374
2375 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2376 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2377 << " from mds." << request->mds << dendl;
2378 request->send_to_auth = true;
2379 request->resend_mds = choose_target_mds(request);
2380 Inode *in = request->inode();
11fdf7f2 2381 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2382 if (request->resend_mds >= 0 &&
2383 request->resend_mds == request->mds &&
2384 (in == NULL ||
11fdf7f2
TL
2385 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2386 request->sent_on_mseq == it->second.mseq)) {
2387 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae 2388 } else {
9f95a23c 2389 request->caller_cond->notify_all();
7c673cae
FG
2390 return;
2391 }
7c673cae
FG
2392 }
2393
11fdf7f2 2394 ceph_assert(!request->reply);
7c673cae
FG
2395 request->reply = reply;
2396 insert_trace(request, session);
2397
2398 // Handle unsafe reply
2399 if (!is_safe) {
2400 request->got_unsafe = true;
2401 session->unsafe_requests.push_back(&request->unsafe_item);
2402 if (is_dir_operation(request)) {
2403 Inode *dir = request->inode();
11fdf7f2 2404 ceph_assert(dir);
7c673cae
FG
2405 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2406 }
2407 if (request->target) {
2408 InodeRef &in = request->target;
2409 in->unsafe_ops.push_back(&request->unsafe_target_item);
2410 }
2411 }
2412
2413 // Only signal the caller once (on the first reply):
2414 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2415 if (!is_safe || !request->got_unsafe) {
9f95a23c 2416 ceph::condition_variable cond;
7c673cae
FG
2417 request->dispatch_cond = &cond;
2418
2419 // wake up waiter
11fdf7f2 2420 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2421 request->caller_cond->notify_all();
7c673cae
FG
2422
2423 // wake for kick back
9f95a23c
TL
2424 std::unique_lock l{client_lock, std::adopt_lock};
2425 cond.wait(l, [tid, request, &cond, this] {
2426 if (request->dispatch_cond) {
2427 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2428 << tid << " " << &cond << dendl;
2429 }
2430 return !request->dispatch_cond;
2431 });
2432 l.release();
7c673cae
FG
2433 }
2434
2435 if (is_safe) {
2436 // the filesystem change is committed to disk
2437 // we're done, clean up
2438 if (request->got_unsafe) {
2439 request->unsafe_item.remove_myself();
2440 request->unsafe_dir_item.remove_myself();
2441 request->unsafe_target_item.remove_myself();
2442 signal_cond_list(request->waitfor_safe);
2443 }
2444 request->item.remove_myself();
2445 unregister_request(request);
2446 }
2447 if (unmounting)
9f95a23c 2448 mount_cond.notify_all();
7c673cae
FG
2449}
2450
2451void Client::_handle_full_flag(int64_t pool)
2452{
2453 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2454 << "on " << pool << dendl;
2455 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2456 // to do this rather than blocking, because otherwise when we fill up we
2457 // potentially lock caps forever on files with dirty pages, and we need
2458 // to be able to release those caps to the MDS so that it can delete files
2459 // and free up space.
2460 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2461
2462 // For all inodes with layouts in this pool and a pending flush write op
2463 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2464 // from ObjectCacher so that it doesn't re-issue the write in response to
2465 // the ENOSPC error.
2466 // Fortunately since we're cancelling everything in a given pool, we don't
2467 // need to know which ops belong to which ObjectSet, we can just blow all
2468 // the un-flushed cached data away and mark any dirty inodes' async_err
2469 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2470 // affecting this pool, and all the objectsets we're purging were also
2471 // in this pool.
2472 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2473 i != inode_map.end(); ++i)
2474 {
2475 Inode *inode = i->second;
2476 if (inode->oset.dirty_or_tx
2477 && (pool == -1 || inode->layout.pool_id == pool)) {
2478 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2479 << " has dirty objects, purging and setting ENOSPC" << dendl;
2480 objectcacher->purge_set(&inode->oset);
2481 inode->set_async_err(-ENOSPC);
2482 }
2483 }
2484
2485 if (cancelled_epoch != (epoch_t)-1) {
2486 set_cap_epoch_barrier(cancelled_epoch);
2487 }
2488}
2489
11fdf7f2 2490void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2491{
31f18b77
FG
2492 std::set<entity_addr_t> new_blacklists;
2493 objecter->consume_blacklist_events(&new_blacklists);
2494
11fdf7f2
TL
2495 const auto myaddrs = messenger->get_myaddrs();
2496 bool new_blacklist = false;
2497 bool prenautilus = objecter->with_osdmap(
2498 [&](const OSDMap& o) {
9f95a23c 2499 return o.require_osd_release < ceph_release_t::nautilus;
11fdf7f2
TL
2500 });
2501 if (!blacklisted) {
2502 for (auto a : myaddrs.v) {
2503 // blacklist entries are always TYPE_ANY for nautilus+
2504 a.set_type(entity_addr_t::TYPE_ANY);
2505 if (new_blacklists.count(a)) {
2506 new_blacklist = true;
2507 break;
2508 }
2509 if (prenautilus) {
2510 // ...except pre-nautilus, they were TYPE_LEGACY
2511 a.set_type(entity_addr_t::TYPE_LEGACY);
2512 if (new_blacklists.count(a)) {
2513 new_blacklist = true;
2514 break;
2515 }
2516 }
2517 }
2518 }
2519 if (new_blacklist) {
31f18b77
FG
2520 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2521 return o.get_epoch();
2522 });
2523 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2524 blacklisted = true;
31f18b77 2525
11fdf7f2 2526 _abort_mds_sessions(-EBLACKLISTED);
31f18b77
FG
2527
2528 // Since we know all our OSD ops will fail, cancel them all preemtively,
2529 // so that on an unhealthy cluster we can umount promptly even if e.g.
2530 // some PGs were inaccessible.
2531 objecter->op_cancel_writes(-EBLACKLISTED);
2532
2533 } else if (blacklisted) {
2534 // Handle case where we were blacklisted but no longer are
11fdf7f2
TL
2535 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2536 return o.is_blacklisted(myaddrs);});
31f18b77
FG
2537 }
2538
f64942e4
AA
2539 // Always subscribe to next osdmap for blacklisted client
2540 // until this client is not blacklisted.
2541 if (blacklisted) {
2542 objecter->maybe_request_map();
2543 }
2544
7c673cae
FG
2545 if (objecter->osdmap_full_flag()) {
2546 _handle_full_flag(-1);
2547 } else {
2548 // Accumulate local list of full pools so that I can drop
2549 // the objecter lock before re-entering objecter in
2550 // cancel_writes
2551 std::vector<int64_t> full_pools;
2552
2553 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2554 for (const auto& kv : o.get_pools()) {
2555 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2556 full_pools.push_back(kv.first);
2557 }
2558 }
2559 });
2560
2561 for (auto p : full_pools)
2562 _handle_full_flag(p);
2563
2564 // Subscribe to subsequent maps to watch for the full flag going
2565 // away. For the global full flag objecter does this for us, but
2566 // it pays no attention to the per-pool full flag so in this branch
2567 // we do it ourselves.
2568 if (!full_pools.empty()) {
2569 objecter->maybe_request_map();
2570 }
2571 }
7c673cae
FG
2572}
2573
2574
2575// ------------------------
2576// incoming messages
2577
2578
11fdf7f2 2579bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2580{
11fdf7f2 2581 std::lock_guard l(client_lock);
7c673cae
FG
2582 if (!initialized) {
2583 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2584 return true;
2585 }
2586
2587 switch (m->get_type()) {
2588 // mounting and mds sessions
2589 case CEPH_MSG_MDS_MAP:
9f95a23c 2590 handle_mds_map(ref_cast<MMDSMap>(m));
7c673cae
FG
2591 break;
2592 case CEPH_MSG_FS_MAP:
9f95a23c 2593 handle_fs_map(ref_cast<MFSMap>(m));
7c673cae
FG
2594 break;
2595 case CEPH_MSG_FS_MAP_USER:
9f95a23c 2596 handle_fs_map_user(ref_cast<MFSMapUser>(m));
7c673cae
FG
2597 break;
2598 case CEPH_MSG_CLIENT_SESSION:
9f95a23c 2599 handle_client_session(ref_cast<MClientSession>(m));
7c673cae
FG
2600 break;
2601
2602 case CEPH_MSG_OSD_MAP:
9f95a23c 2603 handle_osd_map(ref_cast<MOSDMap>(m));
7c673cae
FG
2604 break;
2605
2606 // requests
2607 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
9f95a23c 2608 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
7c673cae
FG
2609 break;
2610 case CEPH_MSG_CLIENT_REPLY:
9f95a23c 2611 handle_client_reply(ref_cast<MClientReply>(m));
11fdf7f2
TL
2612 break;
2613
2614 // reclaim reply
2615 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
9f95a23c 2616 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
7c673cae
FG
2617 break;
2618
2619 case CEPH_MSG_CLIENT_SNAP:
9f95a23c 2620 handle_snap(ref_cast<MClientSnap>(m));
7c673cae
FG
2621 break;
2622 case CEPH_MSG_CLIENT_CAPS:
9f95a23c 2623 handle_caps(ref_cast<MClientCaps>(m));
7c673cae
FG
2624 break;
2625 case CEPH_MSG_CLIENT_LEASE:
9f95a23c 2626 handle_lease(ref_cast<MClientLease>(m));
7c673cae
FG
2627 break;
2628 case MSG_COMMAND_REPLY:
2629 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
9f95a23c 2630 handle_command_reply(ref_cast<MCommandReply>(m));
7c673cae
FG
2631 } else {
2632 return false;
2633 }
2634 break;
2635 case CEPH_MSG_CLIENT_QUOTA:
9f95a23c 2636 handle_quota(ref_cast<MClientQuota>(m));
7c673cae
FG
2637 break;
2638
2639 default:
2640 return false;
2641 }
2642
2643 // unmounting?
2644 if (unmounting) {
2645 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2646 << "+" << inode_map.size() << dendl;
2647 long unsigned size = lru.lru_get_size() + inode_map.size();
2648 trim_cache();
2649 if (size < lru.lru_get_size() + inode_map.size()) {
2650 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
9f95a23c 2651 mount_cond.notify_all();
7c673cae
FG
2652 } else {
2653 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2654 << "+" << inode_map.size() << dendl;
2655 }
2656 }
2657
2658 return true;
2659}
2660
11fdf7f2 2661void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2662{
2663 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2664
2665 signal_cond_list(waiting_for_fsmap);
2666
2667 monclient->sub_got("fsmap", fsmap->get_epoch());
2668}
2669
11fdf7f2 2670void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2671{
2672 fsmap_user.reset(new FSMapUser);
2673 *fsmap_user = m->get_fsmap();
7c673cae
FG
2674
2675 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2676 signal_cond_list(waiting_for_fsmap);
2677}
2678
11fdf7f2 2679void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
7c673cae 2680{
f64942e4 2681 mds_gid_t old_inc, new_inc;
7c673cae 2682 if (m->get_epoch() <= mdsmap->get_epoch()) {
11fdf7f2 2683 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
7c673cae
FG
2684 << " is identical to or older than our "
2685 << mdsmap->get_epoch() << dendl;
7c673cae 2686 return;
f64942e4 2687 }
7c673cae 2688
11fdf7f2 2689 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
7c673cae
FG
2690
2691 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2692 oldmap.swap(mdsmap);
2693
2694 mdsmap->decode(m->get_encoded());
2695
2696 // Cancel any commands for missing or laggy GIDs
2697 std::list<ceph_tid_t> cancel_ops;
2698 auto &commands = command_table.get_commands();
2699 for (const auto &i : commands) {
2700 auto &op = i.second;
2701 const mds_gid_t op_mds_gid = op.mds_gid;
2702 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2703 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2704 cancel_ops.push_back(i.first);
2705 if (op.outs) {
2706 std::ostringstream ss;
2707 ss << "MDS " << op_mds_gid << " went away";
2708 *(op.outs) = ss.str();
2709 }
2710 op.con->mark_down();
2711 if (op.on_finish) {
2712 op.on_finish->complete(-ETIMEDOUT);
2713 }
2714 }
2715 }
2716
2717 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2718 i != cancel_ops.end(); ++i) {
2719 command_table.erase(*i);
2720 }
2721
2722 // reset session
11fdf7f2 2723 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 2724 mds_rank_t mds = p->first;
11fdf7f2 2725 MetaSession *session = &p->second;
7c673cae
FG
2726 ++p;
2727
2728 int oldstate = oldmap->get_state(mds);
2729 int newstate = mdsmap->get_state(mds);
2730 if (!mdsmap->is_up(mds)) {
2731 session->con->mark_down();
11fdf7f2 2732 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f64942e4
AA
2733 old_inc = oldmap->get_incarnation(mds);
2734 new_inc = mdsmap->get_incarnation(mds);
2735 if (old_inc != new_inc) {
2736 ldout(cct, 1) << "mds incarnation changed from "
2737 << old_inc << " to " << new_inc << dendl;
2738 oldstate = MDSMap::STATE_NULL;
2739 }
7c673cae 2740 session->con->mark_down();
11fdf7f2 2741 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
2742 // When new MDS starts to take over, notify kernel to trim unused entries
2743 // in its dcache/icache. Hopefully, the kernel will release some unused
2744 // inodes before the new MDS enters reconnect state.
2745 trim_cache_for_reconnect(session);
2746 } else if (oldstate == newstate)
2747 continue; // no change
2748
2749 session->mds_state = newstate;
2750 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 2751 session->con = messenger->connect_to_mds(session->addrs);
7c673cae 2752 send_reconnect(session);
81eedcae
TL
2753 } else if (newstate > MDSMap::STATE_RECONNECT) {
2754 if (oldstate < MDSMap::STATE_RECONNECT) {
2755 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2756 _closed_mds_session(session);
2757 continue;
2758 }
2759 if (newstate >= MDSMap::STATE_ACTIVE) {
2760 if (oldstate < MDSMap::STATE_ACTIVE) {
2761 // kick new requests
2762 kick_requests(session);
2763 kick_flushing_caps(session);
2764 signal_context_list(session->waiting_for_open);
2765 wake_up_session_caps(session, true);
2766 }
2767 connect_mds_targets(mds);
7c673cae 2768 }
7c673cae
FG
2769 } else if (newstate == MDSMap::STATE_NULL &&
2770 mds >= mdsmap->get_max_mds()) {
2771 _closed_mds_session(session);
2772 }
2773 }
2774
2775 // kick any waiting threads
2776 signal_cond_list(waiting_for_mdsmap);
2777
7c673cae
FG
2778 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2779}
2780
2781void Client::send_reconnect(MetaSession *session)
2782{
2783 mds_rank_t mds = session->mds_num;
11fdf7f2 2784 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
7c673cae
FG
2785
2786 // trim unused caps to reduce MDS's cache rejoin time
2787 trim_cache_for_reconnect(session);
2788
2789 session->readonly = false;
2790
11fdf7f2 2791 session->release.reset();
7c673cae
FG
2792
2793 // reset my cap seq number
2794 session->seq = 0;
2795 //connect to the mds' offload targets
2796 connect_mds_targets(mds);
2797 //make sure unsafe requests get saved
2798 resend_unsafe_requests(session);
2799
11fdf7f2
TL
2800 early_kick_flushing_caps(session);
2801
9f95a23c 2802 auto m = make_message<MClientReconnect>();
11fdf7f2 2803 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
7c673cae
FG
2804
2805 // i have an open session.
2806 ceph::unordered_set<inodeno_t> did_snaprealm;
2807 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2808 p != inode_map.end();
2809 ++p) {
2810 Inode *in = p->second;
11fdf7f2
TL
2811 auto it = in->caps.find(mds);
2812 if (it != in->caps.end()) {
2813 if (allow_multi &&
9f95a23c
TL
2814 m->get_approx_size() >=
2815 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
11fdf7f2
TL
2816 m->mark_more();
2817 session->con->send_message2(std::move(m));
2818
9f95a23c 2819 m = make_message<MClientReconnect>();
11fdf7f2
TL
2820 }
2821
2822 Cap &cap = it->second;
7c673cae 2823 ldout(cct, 10) << " caps on " << p->first
11fdf7f2 2824 << " " << ccap_string(cap.issued)
7c673cae
FG
2825 << " wants " << ccap_string(in->caps_wanted())
2826 << dendl;
2827 filepath path;
f91f0fd5 2828 in->make_short_path(path);
7c673cae
FG
2829 ldout(cct, 10) << " path " << path << dendl;
2830
2831 bufferlist flockbl;
2832 _encode_filelocks(in, flockbl);
2833
11fdf7f2
TL
2834 cap.seq = 0; // reset seq.
2835 cap.issue_seq = 0; // reset seq.
2836 cap.mseq = 0; // reset seq.
2837 // cap gen should catch up with session cap_gen
2838 if (cap.gen < session->cap_gen) {
2839 cap.gen = session->cap_gen;
2840 cap.issued = cap.implemented = CEPH_CAP_PIN;
2841 } else {
2842 cap.issued = cap.implemented;
2843 }
7c673cae
FG
2844 snapid_t snap_follows = 0;
2845 if (!in->cap_snaps.empty())
2846 snap_follows = in->cap_snaps.begin()->first;
2847
2848 m->add_cap(p->first.ino,
11fdf7f2 2849 cap.cap_id,
7c673cae
FG
2850 path.get_ino(), path.get_path(), // ino
2851 in->caps_wanted(), // wanted
11fdf7f2 2852 cap.issued, // issued
7c673cae
FG
2853 in->snaprealm->ino,
2854 snap_follows,
2855 flockbl);
2856
2857 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2858 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2859 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2860 did_snaprealm.insert(in->snaprealm->ino);
2861 }
2862 }
2863 }
2864
11fdf7f2
TL
2865 if (!allow_multi)
2866 m->set_encoding_version(0); // use connection features to choose encoding
2867 session->con->send_message2(std::move(m));
7c673cae 2868
9f95a23c 2869 mount_cond.notify_all();
11fdf7f2
TL
2870
2871 if (session->reclaim_state == MetaSession::RECLAIMING)
2872 signal_cond_list(waiting_for_reclaim);
7c673cae
FG
2873}
2874
2875
2876void Client::kick_requests(MetaSession *session)
2877{
11fdf7f2 2878 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2879 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2880 p != mds_requests.end();
2881 ++p) {
31f18b77
FG
2882 MetaRequest *req = p->second;
2883 if (req->got_unsafe)
2884 continue;
2885 if (req->aborted()) {
2886 if (req->caller_cond) {
2887 req->kick = true;
9f95a23c 2888 req->caller_cond->notify_all();
31f18b77 2889 }
7c673cae 2890 continue;
31f18b77
FG
2891 }
2892 if (req->retry_attempt > 0)
7c673cae 2893 continue; // new requests only
31f18b77 2894 if (req->mds == session->mds_num) {
7c673cae
FG
2895 send_request(p->second, session);
2896 }
2897 }
2898}
2899
2900void Client::resend_unsafe_requests(MetaSession *session)
2901{
2902 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2903 !iter.end();
2904 ++iter)
2905 send_request(*iter, session);
2906
2907 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2908 // process completed requests in clientreplay stage.
2909 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2910 p != mds_requests.end();
2911 ++p) {
2912 MetaRequest *req = p->second;
2913 if (req->got_unsafe)
2914 continue;
31f18b77
FG
2915 if (req->aborted())
2916 continue;
7c673cae
FG
2917 if (req->retry_attempt == 0)
2918 continue; // old requests only
2919 if (req->mds == session->mds_num)
2920 send_request(req, session, true);
2921 }
2922}
2923
2924void Client::wait_unsafe_requests()
2925{
2926 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2
TL
2927 for (const auto &p : mds_sessions) {
2928 const MetaSession &s = p.second;
2929 if (!s.unsafe_requests.empty()) {
2930 MetaRequest *req = s.unsafe_requests.back();
7c673cae
FG
2931 req->get();
2932 last_unsafe_reqs.push_back(req);
2933 }
2934 }
2935
2936 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2937 p != last_unsafe_reqs.end();
2938 ++p) {
2939 MetaRequest *req = *p;
2940 if (req->unsafe_item.is_on_list())
2941 wait_on_list(req->waitfor_safe);
2942 put_request(req);
2943 }
2944}
2945
2946void Client::kick_requests_closed(MetaSession *session)
2947{
11fdf7f2 2948 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2949 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2950 p != mds_requests.end(); ) {
2951 MetaRequest *req = p->second;
2952 ++p;
2953 if (req->mds == session->mds_num) {
2954 if (req->caller_cond) {
2955 req->kick = true;
9f95a23c 2956 req->caller_cond->notify_all();
7c673cae
FG
2957 }
2958 req->item.remove_myself();
2959 if (req->got_unsafe) {
11fdf7f2 2960 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 2961 req->unsafe_item.remove_myself();
eafe8130
TL
2962 if (is_dir_operation(req)) {
2963 Inode *dir = req->inode();
2964 assert(dir);
2965 dir->set_async_err(-EIO);
2966 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2967 << dir->ino << " " << req->get_tid() << dendl;
2968 req->unsafe_dir_item.remove_myself();
2969 }
2970 if (req->target) {
2971 InodeRef &in = req->target;
2972 in->set_async_err(-EIO);
2973 lderr(cct) << "kick_requests_closed drop req of inode : "
2974 << in->ino << " " << req->get_tid() << dendl;
2975 req->unsafe_target_item.remove_myself();
2976 }
7c673cae
FG
2977 signal_cond_list(req->waitfor_safe);
2978 unregister_request(req);
2979 }
2980 }
2981 }
11fdf7f2
TL
2982 ceph_assert(session->requests.empty());
2983 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
2984}
2985
2986
2987
2988
2989/************
2990 * leases
2991 */
2992
2993void Client::got_mds_push(MetaSession *s)
2994{
2995 s->seq++;
2996 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2997 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 2998 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2999 }
3000}
3001
11fdf7f2 3002void Client::handle_lease(const MConstRef<MClientLease>& m)
7c673cae 3003{
11fdf7f2 3004 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 3005
11fdf7f2 3006 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
7c673cae
FG
3007
3008 mds_rank_t mds = mds_rank_t(m->get_source().num());
3009 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3010 if (!session) {
7c673cae
FG
3011 return;
3012 }
3013
3014 got_mds_push(session);
3015
3016 ceph_seq_t seq = m->get_seq();
3017
3018 Inode *in;
3019 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3020 if (inode_map.count(vino) == 0) {
3021 ldout(cct, 10) << " don't have vino " << vino << dendl;
3022 goto revoke;
3023 }
3024 in = inode_map[vino];
3025
9f95a23c 3026 if (m->get_mask() & CEPH_LEASE_VALID) {
7c673cae
FG
3027 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3028 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3029 goto revoke;
3030 }
3031 Dentry *dn = in->dir->dentries[m->dname];
3032 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3033 dn->lease_mds = -1;
3034 }
3035
3036 revoke:
11fdf7f2 3037 {
9f95a23c
TL
3038 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3039 m->get_mask(), m->get_ino(),
3040 m->get_first(), m->get_last(), m->dname);
11fdf7f2
TL
3041 m->get_connection()->send_message2(std::move(reply));
3042 }
7c673cae
FG
3043}
3044
3045void Client::put_inode(Inode *in, int n)
3046{
11fdf7f2 3047 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3048 int left = in->_put(n);
3049 if (left == 0) {
3050 // release any caps
3051 remove_all_caps(in);
3052
11fdf7f2 3053 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
7c673cae 3054 bool unclean = objectcacher->release_set(&in->oset);
11fdf7f2 3055 ceph_assert(!unclean);
7c673cae
FG
3056 inode_map.erase(in->vino());
3057 if (use_faked_inos())
3058 _release_faked_ino(in);
3059
3060 if (in == root) {
3061 root = 0;
3062 root_ancestor = 0;
3063 while (!root_parents.empty())
3064 root_parents.erase(root_parents.begin());
3065 }
3066
3067 delete in;
3068 }
3069}
3070
3071void Client::close_dir(Dir *dir)
3072{
3073 Inode *in = dir->parent_inode;
11fdf7f2
TL
3074 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3075 ceph_assert(dir->is_empty());
3076 ceph_assert(in->dir == dir);
3077 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3078 if (!in->dentries.empty())
7c673cae
FG
3079 in->get_first_parent()->put(); // unpin dentry
3080
3081 delete in->dir;
3082 in->dir = 0;
3083 put_inode(in); // unpin inode
3084}
3085
3086 /**
3087 * Don't call this with in==NULL, use get_or_create for that
3088 * leave dn set to default NULL unless you're trying to add
3089 * a new inode to a pre-created Dentry
3090 */
3091Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3092{
3093 if (!dn) {
3094 // create a new Dentry
11fdf7f2
TL
3095 dn = new Dentry(dir, name);
3096
7c673cae
FG
3097 lru.lru_insert_mid(dn); // mid or top?
3098
3099 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3100 << " dn " << dn << " (new dn)" << dendl;
3101 } else {
11fdf7f2 3102 ceph_assert(!dn->inode);
7c673cae
FG
3103 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3104 << " dn " << dn << " (old dn)" << dendl;
3105 }
3106
3107 if (in) { // link to inode
11fdf7f2 3108 InodeRef tmp_ref;
7c673cae 3109 // only one parent for directories!
11fdf7f2
TL
3110 if (in->is_dir() && !in->dentries.empty()) {
3111 tmp_ref = in; // prevent unlink below from freeing the inode.
7c673cae 3112 Dentry *olddn = in->get_first_parent();
11fdf7f2 3113 ceph_assert(olddn->dir != dir || olddn->name != name);
7c673cae 3114 Inode *old_diri = olddn->dir->parent_inode;
7c673cae
FG
3115 clear_dir_complete_and_ordered(old_diri, true);
3116 unlink(olddn, true, true); // keep dir, dentry
3117 }
3118
11fdf7f2
TL
3119 dn->link(in);
3120 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3121 }
3122
3123 return dn;
3124}
3125
3126void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3127{
11fdf7f2 3128 InodeRef in(dn->inode);
7c673cae
FG
3129 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3130 << " inode " << dn->inode << dendl;
3131
3132 // unlink from inode
11fdf7f2
TL
3133 if (dn->inode) {
3134 dn->unlink();
3135 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3136 }
3137
3138 if (keepdentry) {
3139 dn->lease_mds = -1;
3140 } else {
3141 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3142
3143 // unlink from dir
11fdf7f2
TL
3144 Dir *dir = dn->dir;
3145 dn->detach();
7c673cae
FG
3146
3147 // delete den
3148 lru.lru_remove(dn);
3149 dn->put();
11fdf7f2
TL
3150
3151 if (dir->is_empty() && !keepdir)
3152 close_dir(dir);
7c673cae
FG
3153 }
3154}
3155
3156/**
3157 * For asynchronous flushes, check for errors from the IO and
3158 * update the inode if necessary
3159 */
3160class C_Client_FlushComplete : public Context {
3161private:
3162 Client *client;
3163 InodeRef inode;
3164public:
3165 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3166 void finish(int r) override {
9f95a23c 3167 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
7c673cae
FG
3168 if (r != 0) {
3169 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3170 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3171 << " 0x" << std::hex << inode->ino << std::dec
3172 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3173 inode->set_async_err(r);
3174 }
3175 }
3176};
3177
3178
3179/****
3180 * caps
3181 */
3182
3183void Client::get_cap_ref(Inode *in, int cap)
3184{
3185 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3186 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3187 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3188 in->get();
3189 }
3190 if ((cap & CEPH_CAP_FILE_CACHE) &&
3191 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3192 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3193 in->get();
3194 }
3195 in->get_cap_ref(cap);
3196}
3197
3198void Client::put_cap_ref(Inode *in, int cap)
3199{
3200 int last = in->put_cap_ref(cap);
3201 if (last) {
3202 int put_nref = 0;
3203 int drop = last & ~in->caps_issued();
3204 if (in->snapid == CEPH_NOSNAP) {
7f7e6c64 3205 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
7c673cae
FG
3206 !in->cap_snaps.empty() &&
3207 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3208 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
7c673cae
FG
3209 in->cap_snaps.rbegin()->second.writing = 0;
3210 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3211 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3212 }
3213 if (last & CEPH_CAP_FILE_BUFFER) {
3214 for (auto &p : in->cap_snaps)
3215 p.second.dirty_data = 0;
3216 signal_cond_list(in->waitfor_commit);
11fdf7f2 3217 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3218 ++put_nref;
3219 }
3220 }
3221 if (last & CEPH_CAP_FILE_CACHE) {
11fdf7f2 3222 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3223 ++put_nref;
3224 }
3225 if (drop)
3226 check_caps(in, 0);
3227 if (put_nref)
3228 put_inode(in, put_nref);
3229 }
3230}
3231
f6b5b4d7 3232int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
7c673cae 3233{
f6b5b4d7
TL
3234 Inode *in = fh->inode.get();
3235
7c673cae
FG
3236 int r = check_pool_perm(in, need);
3237 if (r < 0)
3238 return r;
3239
3240 while (1) {
3241 int file_wanted = in->caps_file_wanted();
3242 if ((file_wanted & need) != need) {
3243 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3244 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3245 << dendl;
3246 return -EBADF;
3247 }
3248
f6b5b4d7
TL
3249 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3250 return -EBADF;
3251
3252 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3253 return -EIO;
3254
7c673cae
FG
3255 int implemented;
3256 int have = in->caps_issued(&implemented);
3257
3258 bool waitfor_caps = false;
3259 bool waitfor_commit = false;
3260
3261 if (have & need & CEPH_CAP_FILE_WR) {
1911f103
TL
3262 if (endoff > 0) {
3263 if ((endoff >= (loff_t)in->max_size ||
3264 endoff > (loff_t)(in->size << 1)) &&
3265 endoff > (loff_t)in->wanted_max_size) {
3266 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3267 in->wanted_max_size = endoff;
3268 }
3269 if (in->wanted_max_size > in->max_size &&
3270 in->wanted_max_size > in->requested_max_size)
3271 check_caps(in, 0);
7c673cae
FG
3272 }
3273
3274 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3275 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3276 waitfor_caps = true;
3277 }
3278 if (!in->cap_snaps.empty()) {
3279 if (in->cap_snaps.rbegin()->second.writing) {
3280 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3281 waitfor_caps = true;
3282 }
3283 for (auto &p : in->cap_snaps) {
3284 if (p.second.dirty_data) {
3285 waitfor_commit = true;
3286 break;
3287 }
3288 }
3289 if (waitfor_commit) {
3290 _flush(in, new C_Client_FlushComplete(this, in));
3291 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3292 }
3293 }
3294 }
3295
3296 if (!waitfor_caps && !waitfor_commit) {
3297 if ((have & need) == need) {
7c673cae
FG
3298 int revoking = implemented & ~have;
3299 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3300 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3301 << " revoking " << ccap_string(revoking)
7c673cae 3302 << dendl;
c07f9fc5 3303 if ((revoking & want) == 0) {
7c673cae
FG
3304 *phave = need | (have & want);
3305 in->get_cap_ref(need);
3306 return 0;
3307 }
3308 }
3309 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3310 waitfor_caps = true;
3311 }
3312
3313 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3314 in->auth_cap->session->readonly)
3315 return -EROFS;
3316
3317 if (in->flags & I_CAP_DROPPED) {
3318 int mds_wanted = in->caps_mds_wanted();
3319 if ((mds_wanted & need) != need) {
3320 int ret = _renew_caps(in);
3321 if (ret < 0)
3322 return ret;
3323 continue;
3324 }
a8e16298 3325 if (!(file_wanted & ~mds_wanted))
7c673cae 3326 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3327 }
3328
3329 if (waitfor_caps)
3330 wait_on_list(in->waitfor_caps);
3331 else if (waitfor_commit)
3332 wait_on_list(in->waitfor_commit);
3333 }
3334}
3335
3336int Client::get_caps_used(Inode *in)
3337{
3338 unsigned used = in->caps_used();
3339 if (!(used & CEPH_CAP_FILE_CACHE) &&
3340 !objectcacher->set_is_empty(&in->oset))
3341 used |= CEPH_CAP_FILE_CACHE;
3342 return used;
3343}
3344
3345void Client::cap_delay_requeue(Inode *in)
3346{
11fdf7f2 3347 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3348 in->hold_caps_until = ceph_clock_now();
3349 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3350 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3351}
3352
3353void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
eafe8130 3354 int flags, int used, int want, int retain,
7c673cae
FG
3355 int flush, ceph_tid_t flush_tid)
3356{
3357 int held = cap->issued | cap->implemented;
3358 int revoking = cap->implemented & ~cap->issued;
3359 retain &= ~revoking;
3360 int dropping = cap->issued & ~retain;
3361 int op = CEPH_CAP_OP_UPDATE;
3362
11fdf7f2 3363 ldout(cct, 10) << __func__ << " " << *in
7c673cae 3364 << " mds." << session->mds_num << " seq " << cap->seq
7c673cae
FG
3365 << " used " << ccap_string(used)
3366 << " want " << ccap_string(want)
3367 << " flush " << ccap_string(flush)
3368 << " retain " << ccap_string(retain)
3369 << " held "<< ccap_string(held)
3370 << " revoking " << ccap_string(revoking)
3371 << " dropping " << ccap_string(dropping)
3372 << dendl;
3373
3374 if (cct->_conf->client_inject_release_failure && revoking) {
3375 const int would_have_issued = cap->issued & retain;
3376 const int would_have_implemented = cap->implemented & (cap->issued | used);
3377 // Simulated bug:
3378 // - tell the server we think issued is whatever they issued plus whatever we implemented
3379 // - leave what we have implemented in place
3380 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3381 cap->issued = cap->issued | cap->implemented;
3382
3383 // Make an exception for revoking xattr caps: we are injecting
3384 // failure to release other caps, but allow xattr because client
3385 // will block on xattr ops if it can't release these to MDS (#9800)
3386 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3387 cap->issued ^= xattr_mask & revoking;
3388 cap->implemented ^= xattr_mask & revoking;
3389
3390 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3391 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3392 } else {
3393 // Normal behaviour
3394 cap->issued &= retain;
3395 cap->implemented &= cap->issued | used;
3396 }
3397
3398 snapid_t follows = 0;
3399
3400 if (flush)
3401 follows = in->snaprealm->get_snap_context().seq;
3402
9f95a23c 3403 auto m = make_message<MClientCaps>(op,
7c673cae
FG
3404 in->ino,
3405 0,
3406 cap->cap_id, cap->seq,
3407 cap->implemented,
3408 want,
3409 flush,
3410 cap->mseq,
3411 cap_epoch_barrier);
3412 m->caller_uid = in->cap_dirtier_uid;
3413 m->caller_gid = in->cap_dirtier_gid;
3414
3415 m->head.issue_seq = cap->issue_seq;
3416 m->set_tid(flush_tid);
3417
3418 m->head.uid = in->uid;
3419 m->head.gid = in->gid;
3420 m->head.mode = in->mode;
3421
3422 m->head.nlink = in->nlink;
3423
3424 if (flush & CEPH_CAP_XATTR_EXCL) {
11fdf7f2 3425 encode(in->xattrs, m->xattrbl);
7c673cae
FG
3426 m->head.xattr_version = in->xattr_version;
3427 }
3428
3429 m->size = in->size;
3430 m->max_size = in->max_size;
3431 m->truncate_seq = in->truncate_seq;
3432 m->truncate_size = in->truncate_size;
3433 m->mtime = in->mtime;
3434 m->atime = in->atime;
3435 m->ctime = in->ctime;
3436 m->btime = in->btime;
3437 m->time_warp_seq = in->time_warp_seq;
3438 m->change_attr = in->change_attr;
eafe8130
TL
3439
3440 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3441 !in->cap_snaps.empty() &&
3442 in->cap_snaps.rbegin()->second.flush_tid == 0)
3443 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3444 m->flags = flags;
3445
7c673cae
FG
3446 if (flush & CEPH_CAP_FILE_WR) {
3447 m->inline_version = in->inline_version;
3448 m->inline_data = in->inline_data;
3449 }
3450
3451 in->reported_size = in->size;
3452 m->set_snap_follows(follows);
3453 cap->wanted = want;
3454 if (cap == in->auth_cap) {
1911f103
TL
3455 if (want & CEPH_CAP_ANY_FILE_WR) {
3456 m->set_max_size(in->wanted_max_size);
3457 in->requested_max_size = in->wanted_max_size;
3458 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3459 } else {
3460 in->requested_max_size = 0;
3461 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3462 }
7c673cae
FG
3463 }
3464
3465 if (!session->flushing_caps_tids.empty())
3466 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3467
11fdf7f2 3468 session->con->send_message2(std::move(m));
7c673cae
FG
3469}
3470
31f18b77
FG
3471static bool is_max_size_approaching(Inode *in)
3472{
3473 /* mds will adjust max size according to the reported size */
3474 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3475 return false;
3476 if (in->size >= in->max_size)
3477 return true;
3478 /* half of previous max_size increment has been used */
3479 if (in->max_size > in->reported_size &&
3480 (in->size << 1) >= in->max_size + in->reported_size)
3481 return true;
3482 return false;
3483}
7c673cae 3484
11fdf7f2
TL
3485static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3486{
3487 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3488 return used;
3489 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3490 return used;
3491
3492 if (issued & CEPH_CAP_FILE_LAZYIO) {
3493 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3494 used &= ~CEPH_CAP_FILE_CACHE;
3495 used |= CEPH_CAP_FILE_LAZYIO;
3496 }
3497 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3498 used &= ~CEPH_CAP_FILE_BUFFER;
3499 used |= CEPH_CAP_FILE_LAZYIO;
3500 }
3501 } else {
3502 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3503 used &= ~CEPH_CAP_FILE_CACHE;
3504 used |= CEPH_CAP_FILE_LAZYIO;
3505 }
3506 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3507 used &= ~CEPH_CAP_FILE_BUFFER;
3508 used |= CEPH_CAP_FILE_LAZYIO;
3509 }
3510 }
3511 return used;
3512}
3513
7c673cae
FG
3514/**
3515 * check_caps
3516 *
3517 * Examine currently used and wanted versus held caps. Release, flush or ack
3518 * revoked caps to the MDS as appropriate.
3519 *
3520 * @param in the inode to check
3521 * @param flags flags to apply to cap check
3522 */
3523void Client::check_caps(Inode *in, unsigned flags)
3524{
3525 unsigned wanted = in->caps_wanted();
3526 unsigned used = get_caps_used(in);
3527 unsigned cap_used;
3528
7c673cae
FG
3529 int implemented;
3530 int issued = in->caps_issued(&implemented);
3531 int revoking = implemented & ~issued;
3532
11fdf7f2
TL
3533 int orig_used = used;
3534 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3535
7c673cae 3536 int retain = wanted | used | CEPH_CAP_PIN;
a8e16298
TL
3537 if (!unmounting && in->nlink > 0) {
3538 if (wanted) {
7c673cae 3539 retain |= CEPH_CAP_ANY;
a8e16298
TL
3540 } else if (in->is_dir() &&
3541 (issued & CEPH_CAP_FILE_SHARED) &&
3542 (in->flags & I_COMPLETE)) {
3543 // we do this here because we don't want to drop to Fs (and then
3544 // drop the Fs if we do a create!) if that alone makes us send lookups
3545 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3546 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3547 retain |= wanted;
3548 } else {
7c673cae 3549 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3550 // keep RD only if we didn't have the file open RW,
3551 // because then the mds would revoke it anyway to
3552 // journal max_size=0.
3553 if (in->max_size == 0)
3554 retain |= CEPH_CAP_ANY_RD;
3555 }
7c673cae
FG
3556 }
3557
11fdf7f2 3558 ldout(cct, 10) << __func__ << " on " << *in
7c673cae
FG
3559 << " wanted " << ccap_string(wanted)
3560 << " used " << ccap_string(used)
3561 << " issued " << ccap_string(issued)
3562 << " revoking " << ccap_string(revoking)
3563 << " flags=" << flags
3564 << dendl;
3565
3566 if (in->snapid != CEPH_NOSNAP)
3567 return; //snap caps last forever, can't write
3568
3569 if (in->caps.empty())
3570 return; // guard if at end of func
3571
11fdf7f2
TL
3572 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3573 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
94b18763 3574 if (_release(in))
11fdf7f2 3575 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
94b18763 3576 }
7c673cae 3577
7c673cae 3578
11fdf7f2
TL
3579 for (auto &p : in->caps) {
3580 mds_rank_t mds = p.first;
3581 Cap &cap = p.second;
7c673cae 3582
11fdf7f2 3583 MetaSession *session = &mds_sessions.at(mds);
7c673cae
FG
3584
3585 cap_used = used;
11fdf7f2 3586 if (in->auth_cap && &cap != in->auth_cap)
7c673cae
FG
3587 cap_used &= ~in->auth_cap->issued;
3588
11fdf7f2 3589 revoking = cap.implemented & ~cap.issued;
7c673cae
FG
3590
3591 ldout(cct, 10) << " cap mds." << mds
11fdf7f2
TL
3592 << " issued " << ccap_string(cap.issued)
3593 << " implemented " << ccap_string(cap.implemented)
7c673cae
FG
3594 << " revoking " << ccap_string(revoking) << dendl;
3595
3596 if (in->wanted_max_size > in->max_size &&
3597 in->wanted_max_size > in->requested_max_size &&
11fdf7f2 3598 &cap == in->auth_cap)
7c673cae
FG
3599 goto ack;
3600
3601 /* approaching file_max? */
11fdf7f2
TL
3602 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3603 &cap == in->auth_cap &&
31f18b77 3604 is_max_size_approaching(in)) {
7c673cae 3605 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3606 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3607 goto ack;
3608 }
3609
3610 /* completed revocation? */
3611 if (revoking && (revoking & cap_used) == 0) {
11fdf7f2 3612 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
7c673cae
FG
3613 goto ack;
3614 }
3615
3616 /* want more caps from mds? */
11fdf7f2 3617 if (wanted & ~(cap.wanted | cap.issued))
7c673cae
FG
3618 goto ack;
3619
3620 if (!revoking && unmounting && (cap_used == 0))
3621 goto ack;
3622
11fdf7f2 3623 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
a8e16298 3624 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3625 continue;
3626
11fdf7f2 3627 if (!(flags & CHECK_CAPS_NODELAY)) {
7c673cae 3628 ldout(cct, 10) << "delaying cap release" << dendl;
11fdf7f2 3629 cap_delay_requeue(in);
7c673cae
FG
3630 continue;
3631 }
3632
3633 ack:
eafe8130
TL
3634 if (&cap == in->auth_cap) {
3635 if (in->flags & I_KICK_FLUSH) {
3636 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3637 << " to mds." << mds << dendl;
3638 kick_flushing_caps(in, session);
3639 }
3640 if (!in->cap_snaps.empty() &&
3641 in->cap_snaps.rbegin()->second.flush_tid == 0)
3642 flush_snaps(in);
7c673cae
FG
3643 }
3644
3645 int flushing;
e306af50 3646 int msg_flags = 0;
7c673cae 3647 ceph_tid_t flush_tid;
11fdf7f2 3648 if (in->auth_cap == &cap && in->dirty_caps) {
7c673cae 3649 flushing = mark_caps_flushing(in, &flush_tid);
e306af50
TL
3650 if (flags & CHECK_CAPS_SYNCHRONOUS)
3651 msg_flags |= MClientCaps::FLAG_SYNC;
7c673cae
FG
3652 } else {
3653 flushing = 0;
3654 flush_tid = 0;
3655 }
3656
eafe8130
TL
3657 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3658 flushing, flush_tid);
7c673cae
FG
3659 }
3660}
3661
3662
3663void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3664{
3665 int used = get_caps_used(in);
3666 int dirty = in->caps_dirty();
11fdf7f2 3667 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3668
3669 if (in->cap_snaps.size() &&
3670 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3671 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3672 return;
3673 } else if (in->caps_dirty() ||
3674 (used & CEPH_CAP_FILE_WR) ||
3675 (dirty & CEPH_CAP_ANY_WR)) {
3676 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3677 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3678 CapSnap &capsnap = capsnapem.first->second;
3679 capsnap.context = old_snapc;
3680 capsnap.issued = in->caps_issued();
3681 capsnap.dirty = in->caps_dirty();
7f7e6c64 3682
7c673cae 3683 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
7f7e6c64 3684
7c673cae
FG
3685 capsnap.uid = in->uid;
3686 capsnap.gid = in->gid;
3687 capsnap.mode = in->mode;
3688 capsnap.btime = in->btime;
3689 capsnap.xattrs = in->xattrs;
3690 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3691 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3692 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
7f7e6c64 3693
7c673cae 3694 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3695 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3696 capsnap.writing = 1;
3697 } else {
3698 finish_cap_snap(in, capsnap, used);
3699 }
3700 } else {
11fdf7f2 3701 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3702 }
3703}
3704
3705void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3706{
11fdf7f2 3707 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3708 capsnap.size = in->size;
3709 capsnap.mtime = in->mtime;
3710 capsnap.atime = in->atime;
3711 capsnap.ctime = in->ctime;
3712 capsnap.time_warp_seq = in->time_warp_seq;
3713 capsnap.change_attr = in->change_attr;
7c673cae
FG
3714 capsnap.dirty |= in->caps_dirty();
3715
11fdf7f2
TL
3716 /* Only reset it if it wasn't set before */
3717 if (capsnap.cap_dirtier_uid == -1) {
3718 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3719 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3720 }
3721
7c673cae
FG
3722 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3723 capsnap.inline_data = in->inline_data;
3724 capsnap.inline_version = in->inline_version;
3725 }
3726
3727 if (used & CEPH_CAP_FILE_BUFFER) {
7f7e6c64 3728 capsnap.writing = 1;
11fdf7f2 3729 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
3730 << " WRBUFFER, delaying" << dendl;
3731 } else {
3732 capsnap.dirty_data = 0;
3733 flush_snaps(in);
3734 }
3735}
3736
eafe8130
TL
3737void Client::send_flush_snap(Inode *in, MetaSession *session,
3738 snapid_t follows, CapSnap& capsnap)
3739{
9f95a23c
TL
3740 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3741 in->ino, in->snaprealm->ino, 0,
3742 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
3743 m->caller_uid = capsnap.cap_dirtier_uid;
3744 m->caller_gid = capsnap.cap_dirtier_gid;
3745
3746 m->set_client_tid(capsnap.flush_tid);
3747 m->head.snap_follows = follows;
3748
3749 m->head.caps = capsnap.issued;
3750 m->head.dirty = capsnap.dirty;
3751
3752 m->head.uid = capsnap.uid;
3753 m->head.gid = capsnap.gid;
3754 m->head.mode = capsnap.mode;
3755 m->btime = capsnap.btime;
3756
3757 m->size = capsnap.size;
3758
3759 m->head.xattr_version = capsnap.xattr_version;
3760 encode(capsnap.xattrs, m->xattrbl);
3761
3762 m->ctime = capsnap.ctime;
3763 m->btime = capsnap.btime;
3764 m->mtime = capsnap.mtime;
3765 m->atime = capsnap.atime;
3766 m->time_warp_seq = capsnap.time_warp_seq;
3767 m->change_attr = capsnap.change_attr;
3768
3769 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3770 m->inline_version = in->inline_version;
3771 m->inline_data = in->inline_data;
3772 }
3773
3774 ceph_assert(!session->flushing_caps_tids.empty());
3775 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3776
3777 session->con->send_message2(std::move(m));
3778}
3779
3780void Client::flush_snaps(Inode *in)
7c673cae 3781{
eafe8130 3782 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
11fdf7f2 3783 ceph_assert(in->cap_snaps.size());
7c673cae
FG
3784
3785 // pick auth mds
11fdf7f2 3786 ceph_assert(in->auth_cap);
7c673cae 3787 MetaSession *session = in->auth_cap->session;
7c673cae
FG
3788
3789 for (auto &p : in->cap_snaps) {
3790 CapSnap &capsnap = p.second;
eafe8130
TL
3791 // only do new flush
3792 if (capsnap.flush_tid > 0)
3793 continue;
7c673cae
FG
3794
3795 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3796 << " follows " << p.first
3797 << " size " << capsnap.size
3798 << " mtime " << capsnap.mtime
3799 << " dirty_data=" << capsnap.dirty_data
3800 << " writing=" << capsnap.writing
3801 << " on " << *in << dendl;
3802 if (capsnap.dirty_data || capsnap.writing)
eafe8130 3803 break;
7f7e6c64 3804
eafe8130
TL
3805 capsnap.flush_tid = ++last_flush_tid;
3806 session->flushing_caps_tids.insert(capsnap.flush_tid);
3807 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3808 if (!in->flushing_cap_item.is_on_list())
3809 session->flushing_caps.push_back(&in->flushing_cap_item);
7c673cae 3810
eafe8130 3811 send_flush_snap(in, session, p.first, capsnap);
7c673cae
FG
3812 }
3813}
3814
9f95a23c 3815void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 3816{
9f95a23c 3817 ceph::condition_variable cond;
7c673cae 3818 ls.push_back(&cond);
9f95a23c
TL
3819 std::unique_lock l{client_lock, std::adopt_lock};
3820 cond.wait(l);
3821 l.release();
7c673cae
FG
3822 ls.remove(&cond);
3823}
3824
9f95a23c 3825void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 3826{
9f95a23c
TL
3827 for (auto cond : ls) {
3828 cond->notify_all();
3829 }
7c673cae
FG
3830}
3831
// Block until someone completes the Context we queue on ls (see
// signal_context_list).  The C_Cond wrapper flips 'done' and signals our
// condition variable.  client_lock is held by the caller; we adopt it for
// the wait and release() it back unlocked-state-unchanged.
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3842
3843void Client::signal_context_list(list<Context*>& ls)
3844{
3845 while (!ls.empty()) {
3846 ls.front()->complete(0);
3847 ls.pop_front();
3848 }
3849}
3850
a8e16298 3851void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 3852{
11fdf7f2
TL
3853 for (const auto &cap : s->caps) {
3854 auto &in = cap->inode;
a8e16298 3855 if (reconnect) {
11fdf7f2
TL
3856 in.requested_max_size = 0;
3857 in.wanted_max_size = 0;
a8e16298
TL
3858 } else {
3859 if (cap->gen < s->cap_gen) {
3860 // mds did not re-issue stale cap.
3861 cap->issued = cap->implemented = CEPH_CAP_PIN;
3862 // make sure mds knows what we want.
11fdf7f2
TL
3863 if (in.caps_file_wanted() & ~cap->wanted)
3864 in.flags |= I_CAP_DROPPED;
a8e16298
TL
3865 }
3866 }
11fdf7f2 3867 signal_cond_list(in.waitfor_caps);
7c673cae
FG
3868 }
3869}
3870
3871
3872// flush dirty data (from objectcache)
3873
3874class C_Client_CacheInvalidate : public Context {
3875private:
3876 Client *client;
3877 vinodeno_t ino;
3878 int64_t offset, length;
3879public:
3880 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3881 client(c), offset(off), length(len) {
3882 if (client->use_faked_inos())
3883 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3884 else
3885 ino = in->vino();
3886 }
3887 void finish(int r) override {
3888 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
9f95a23c 3889 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
3890 client->_async_invalidate(ino, offset, length);
3891 }
3892};
3893
// Deliver one cache-invalidate notification to the registered callback.
// Runs on the finisher thread, without client_lock held; skipped once an
// unmount is in progress.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3901
3902void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3903
3904 if (ino_invalidate_cb)
3905 // we queue the invalidate, which calls the callback and decrements the ref
3906 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3907}
3908
3909void Client::_invalidate_inode_cache(Inode *in)
3910{
11fdf7f2 3911 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
3912
3913 // invalidate our userspace inode cache
94b18763 3914 if (cct->_conf->client_oc) {
7c673cae 3915 objectcacher->release_set(&in->oset);
94b18763
FG
3916 if (!objectcacher->set_is_empty(&in->oset))
3917 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3918 }
7c673cae
FG
3919
3920 _schedule_invalidate_callback(in, 0, 0);
3921}
3922
3923void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3924{
11fdf7f2 3925 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
3926
3927 // invalidate our userspace inode cache
3928 if (cct->_conf->client_oc) {
3929 vector<ObjectExtent> ls;
3930 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 3931 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
3932 }
3933
3934 _schedule_invalidate_callback(in, off, len);
3935}
3936
3937bool Client::_release(Inode *in)
3938{
3939 ldout(cct, 20) << "_release " << *in << dendl;
3940 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3941 _invalidate_inode_cache(in);
3942 return true;
3943 }
3944 return false;
3945}
3946
// Flush the inode's dirty buffered data.  Returns true when the flush is
// already finished (onfinish has been completed synchronously), false when
// it is in progress and onfinish will fire later.
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // pool is full: writing back would block forever, so drop the dirty
    // data and report ENOSPC instead
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3968
3969void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3970{
9f95a23c 3971 ceph_assert(ceph_mutex_is_locked(client_lock));
7c673cae
FG
3972 if (!in->oset.dirty_or_tx) {
3973 ldout(cct, 10) << " nothing to flush" << dendl;
3974 return;
3975 }
3976
11fdf7f2 3977 C_SaferCond onflush("Client::_flush_range flock");
7c673cae 3978 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
11fdf7f2 3979 offset, size, &onflush);
7c673cae
FG
3980 if (!ret) {
3981 // wait for flush
9f95a23c 3982 client_lock.unlock();
11fdf7f2 3983 onflush.wait();
9f95a23c 3984 client_lock.lock();
7c673cae
FG
3985 }
3986}
3987
// ObjectCacher callback fired when a flush of an inode's object set has
// completed; forwards to _flushed() to drop the cap references pinned for
// the writeback.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(ceph_mutex_is_locked(client_lock));   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3996
// Buffered writeback for this inode finished: release the CACHE|BUFFER
// cap references that were pinned while data was dirty/in flight.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4003
4004
4005
// checks common to add_update_cap, handle_cap_grant
// Bump the relevant generation counters when FILE_CACHE or FILE_SHARED
// transitions from not-issued to issued, invalidating stale cached state.
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  // newly gaining FILE_CACHE: cached file data may be stale
  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  // FILE_SHARED changed either way: cached dentries/metadata for a dir
  // can no longer be trusted as complete/ordered
  if ((issued & CEPH_CAP_FILE_SHARED) !=
      (had & CEPH_CAP_FILE_SHARED)) {
    if (issued & CEPH_CAP_FILE_SHARED)
      in->shared_gen++;
    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
4023
// Add a new cap for (inode, mds) or update the existing one from an MDS
// grant/import.  Keeps the inode's snaprealm membership, auth-cap choice
// and per-cap seq/mseq state consistent, and wakes any cap waiters when
// new bits are issued.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: join its snaprealm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // auth cap may move the inode into a different realm
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // existing cap; a stale generation means it was effectively dropped
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
	ldout(cct, 0) << "WARNING: " <<  "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      // keep the newer local state rather than the stale message's
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if it is newer (by migration seq)
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migration seq replaces wanted outright; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // new bits were granted; unblock anyone waiting for caps
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4120
// Remove one cap from its inode and session.  When queue_release is set,
// a cap-release record is queued for the MDS.  Erasing the map entry
// destroys *cap, so the pointer must not be used afterwards.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // losing the auth cap: the inode can no longer be on the session's
    // flushing list
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;  // erased above; guard against accidental reuse

  if (!in.is_any_caps()) {
    // last cap gone: leave the snaprealm
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4156
4157void Client::remove_all_caps(Inode *in)
4158{
4159 while (!in->caps.empty())
11fdf7f2 4160 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4161}
4162
f6b5b4d7 4163void Client::remove_session_caps(MetaSession *s, int err)
7c673cae 4164{
11fdf7f2 4165 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
7c673cae
FG
4166
4167 while (s->caps.size()) {
4168 Cap *cap = *s->caps.begin();
11fdf7f2 4169 InodeRef in(&cap->inode);
eafe8130 4170 bool dirty_caps = false;
7c673cae 4171 if (in->auth_cap == cap) {
7c673cae
FG
4172 dirty_caps = in->dirty_caps | in->flushing_caps;
4173 in->wanted_max_size = 0;
4174 in->requested_max_size = 0;
f6b5b4d7
TL
4175 if (in->has_any_filelocks())
4176 in->flags |= I_ERROR_FILELOCK;
7c673cae 4177 }
f6b5b4d7 4178 auto caps = cap->implemented;
a8e16298
TL
4179 if (cap->wanted | cap->issued)
4180 in->flags |= I_CAP_DROPPED;
7c673cae 4181 remove_cap(cap, false);
eafe8130 4182 in->cap_snaps.clear();
7c673cae 4183 if (dirty_caps) {
11fdf7f2 4184 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
7c673cae
FG
4185 if (in->flushing_caps) {
4186 num_flushing_caps--;
4187 in->flushing_cap_tids.clear();
4188 }
4189 in->flushing_caps = 0;
28e407b8 4190 in->mark_caps_clean();
11fdf7f2 4191 put_inode(in.get());
7c673cae 4192 }
f6b5b4d7
TL
4193 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4194 if (caps && !in->caps_issued_mask(caps, true)) {
4195 if (err == -EBLACKLISTED) {
4196 if (in->oset.dirty_or_tx) {
4197 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4198 in->set_async_err(err);
4199 }
4200 objectcacher->purge_set(&in->oset);
4201 } else {
4202 objectcacher->release_set(&in->oset);
4203 }
4204 _schedule_invalidate_callback(in.get(), 0, 0);
4205 }
4206
a8e16298 4207 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4208 }
4209 s->flushing_caps_tids.clear();
9f95a23c 4210 sync_cond.notify_all();
7c673cae
FG
4211}
4212
91327a77 4213int Client::_do_remount(bool retry_on_error)
b32b8144 4214{
adb31ebb 4215 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4216
b32b8144
FG
4217 errno = 0;
4218 int r = remount_cb(callback_handle);
91327a77
AA
4219 if (r == 0) {
4220 retries_on_invalidate = 0;
4221 } else {
b32b8144
FG
4222 int e = errno;
4223 client_t whoami = get_nodeid();
4224 if (r == -1) {
4225 lderr(cct) <<
4226 "failed to remount (to trim kernel dentries): "
4227 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4228 } else {
4229 lderr(cct) <<
4230 "failed to remount (to trim kernel dentries): "
4231 "return code = " << r << dendl;
4232 }
91327a77 4233 bool should_abort =
11fdf7f2
TL
4234 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4235 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4236 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4237 if (should_abort && !unmounting) {
4238 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4239 ceph_abort();
4240 }
4241 }
4242 return r;
4243}
4244
7c673cae
FG
// Finisher context that triggers a remount (with retry-on-error enabled)
// from outside client_lock; queued by _invalidate_kernel_dcache().
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};
4255
4256void Client::_invalidate_kernel_dcache()
4257{
4258 if (unmounting)
4259 return;
94b18763
FG
4260 if (can_invalidate_dentries) {
4261 if (dentry_invalidate_cb && root->dir) {
4262 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4263 p != root->dir->dentries.end();
4264 ++p) {
4265 if (p->second->inode)
4266 _schedule_invalidate_dentry_callback(p->second, false);
4267 }
7c673cae
FG
4268 }
4269 } else if (remount_cb) {
4270 // Hacky:
4271 // when remounting a file system, linux kernel trims all unused dentries in the fs
4272 remount_finisher.queue(new C_Client_Remount(this));
4273 }
4274}
4275
91327a77
AA
// If every dentry under this directory is a null (negative) dentry, unlink
// the expireable ones and close the dir when it becomes empty; then recurse
// into the snapdir if one is open.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // only act when the directory consists solely of null dentries
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink(): unlink removes dn from this map
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4300
e306af50
TL
4301class C_Client_CacheRelease : public Context {
4302private:
4303 Client *client;
4304 vinodeno_t ino;
4305public:
4306 C_Client_CacheRelease(Client *c, Inode *in) :
4307 client(c) {
4308 if (client->use_faked_inos())
4309 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4310 else
4311 ino = in->vino();
4312 }
4313 void finish(int r) override {
4314 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4315 client->_async_inode_release(ino);
4316 }
4317};
4318
// Deliver one inode-release notification to the registered callback.
// Runs on the finisher thread, without client_lock held; skipped once an
// unmount is in progress.
void Client::_async_inode_release(vinodeno_t ino)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << dendl;
  ino_release_cb(callback_handle, ino);
}
4326
4327void Client::_schedule_ino_release_callback(Inode *in) {
4328
4329 if (ino_release_cb)
4330 // we queue the invalidate, which calls the callback and decrements the ref
4331 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4332}
4333
28e407b8 4334void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4335{
4336 mds_rank_t mds = s->mds_num;
28e407b8 4337 size_t caps_size = s->caps.size();
11fdf7f2 4338 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4339 << " caps " << caps_size << dendl;
4340
28e407b8
AA
4341 uint64_t trimmed = 0;
4342 auto p = s->caps.begin();
4343 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4344 * looking at from getting deleted during traversal. */
7c673cae
FG
4345 while ((caps_size - trimmed) > max && !p.end()) {
4346 Cap *cap = *p;
11fdf7f2 4347 InodeRef in(&cap->inode);
7c673cae
FG
4348
4349 // Increment p early because it will be invalidated if cap
4350 // is deleted inside remove_cap
4351 ++p;
4352
4353 if (in->caps.size() > 1 && cap != in->auth_cap) {
4354 int mine = cap->issued | cap->implemented;
4355 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4356 // disposable non-auth cap
b32b8144 4357 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4358 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4359 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4360 trimmed++;
4361 }
4362 } else {
4363 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4364 _trim_negative_child_dentries(in);
7c673cae 4365 bool all = true;
11fdf7f2
TL
4366 auto q = in->dentries.begin();
4367 while (q != in->dentries.end()) {
4368 Dentry *dn = *q;
4369 ++q;
7c673cae
FG
4370 if (dn->lru_is_expireable()) {
4371 if (can_invalidate_dentries &&
4372 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4373 // Only issue one of these per DN for inodes in root: handle
4374 // others more efficiently by calling for root-child DNs at
4375 // the end of this function.
4376 _schedule_invalidate_dentry_callback(dn, true);
4377 }
28e407b8
AA
4378 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4379 to_trim.insert(dn);
7c673cae
FG
4380 } else {
4381 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4382 all = false;
4383 }
4384 }
f91f0fd5
TL
4385 if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
4386 _schedule_ino_release_callback(in.get());
4387 }
7c673cae
FG
4388 if (all && in->ino != MDS_INO_ROOT) {
4389 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4390 trimmed++;
4391 }
4392 }
4393 }
28e407b8
AA
4394 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4395 for (const auto &dn : to_trim) {
4396 trim_dentry(dn);
4397 }
4398 to_trim.clear();
7c673cae 4399
b32b8144 4400 caps_size = s->caps.size();
11fdf7f2 4401 if (caps_size > (size_t)max)
7c673cae
FG
4402 _invalidate_kernel_dcache();
4403}
4404
4405void Client::force_session_readonly(MetaSession *s)
4406{
4407 s->readonly = true;
4408 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4409 auto &in = (*p)->inode;
4410 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4411 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4412 }
4413}
4414
7c673cae
FG
// Move the inode's dirty cap bits into the "flushing" state under a new
// flush tid (returned via *ptid), registering the inode and tid with the
// auth session.  Returns the cap bits being flushed.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing transition
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4442
4443void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4444{
4445 for (auto &p : in->cap_snaps) {
4446 CapSnap &capsnap = p.second;
4447 if (capsnap.flush_tid > 0) {
4448 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4449 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4450 }
4451 }
4452 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4453 it != in->flushing_cap_tids.end();
4454 ++it) {
4455 old_s->flushing_caps_tids.erase(it->first);
4456 new_s->flushing_caps_tids.insert(it->first);
4457 }
4458 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4459}
4460
4461/*
4462 * Flush all caps back to the MDS. Because the callers generally wait on the
4463 * result of this function (syncfs and umount cases), we set
4464 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4465 */
4466void Client::flush_caps_sync()
4467{
4468 ldout(cct, 10) << __func__ << dendl;
28e407b8 4469 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4470 while (!p.end()) {
4471 unsigned flags = CHECK_CAPS_NODELAY;
4472 Inode *in = *p;
4473
4474 ++p;
28e407b8
AA
4475 delayed_list.pop_front();
4476 if (p.end() && dirty_list.empty())
7c673cae
FG
4477 flags |= CHECK_CAPS_SYNCHRONOUS;
4478 check_caps(in, flags);
4479 }
4480
4481 // other caps, too
28e407b8 4482 p = dirty_list.begin();
7c673cae
FG
4483 while (!p.end()) {
4484 unsigned flags = CHECK_CAPS_NODELAY;
4485 Inode *in = *p;
4486
4487 ++p;
4488 if (p.end())
4489 flags |= CHECK_CAPS_SYNCHRONOUS;
4490 check_caps(in, flags);
4491 }
4492}
4493
7c673cae
FG
// Wait until every cap flush on this inode with tid <= want has been
// acknowledged.  flushing_cap_tids is ordered, so its first entry is the
// oldest outstanding flush; we sleep on the inode's cap wait list until it
// advances past 'want'.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4507
// Wait until every session has acknowledged all cap flushes with tid <=
// want.  Each sleep releases (adopted) client_lock; after waking we start
// over from the first session because the session map may have changed.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want  << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // adopt client_lock for the wait, then hand it back still locked
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4528
eafe8130
TL
// Re-send every outstanding cap flush (and capsnap flush) for one inode to
// its auth session, preserving the original flush order.  Entries in
// flushing_cap_tids with a zero value are capsnap flushes; they pair up,
// in order, with the entries of cap_snaps.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest pending capsnap flush tid; regular flushes older than
  // it must tell the MDS a capsnap is still pending
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // zero value: this tid is a capsnap flush; consume the next capsnap
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4562
7c673cae
FG
4563void Client::kick_flushing_caps(MetaSession *session)
4564{
4565 mds_rank_t mds = session->mds_num;
11fdf7f2 4566 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4567
4568 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4569 Inode *in = *p;
eafe8130
TL
4570 if (in->flags & I_KICK_FLUSH) {
4571 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4572 kick_flushing_caps(in, session);
4573 }
7c673cae 4574 }
7c673cae
FG
4575}
4576
// During reconnect, decide per inode whether its cap flushes must be
// re-sent before the reconnect message (when the flushing caps were
// revoked) or can wait until after (I_KICK_FLUSH, handled by
// kick_flushing_caps(session)).
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      // nothing revoked: defer the kick to after reconnect
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4604
7c673cae
FG
4605void SnapRealm::build_snap_context()
4606{
4607 set<snapid_t> snaps;
4608 snapid_t max_seq = seq;
4609
4610 // start with prior_parents?
4611 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4612 snaps.insert(prior_parent_snaps[i]);
4613
4614 // current parent's snaps
4615 if (pparent) {
4616 const SnapContext& psnapc = pparent->get_snap_context();
4617 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4618 if (psnapc.snaps[i] >= parent_since)
4619 snaps.insert(psnapc.snaps[i]);
4620 if (psnapc.seq > max_seq)
4621 max_seq = psnapc.seq;
4622 }
4623
4624 // my snaps
4625 for (unsigned i=0; i<my_snaps.size(); i++)
4626 snaps.insert(my_snaps[i]);
4627
4628 // ok!
4629 cached_snap_context.seq = max_seq;
4630 cached_snap_context.snaps.resize(0);
4631 cached_snap_context.snaps.reserve(snaps.size());
4632 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4633 cached_snap_context.snaps.push_back(*p);
4634}
4635
4636void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4637{
4638 list<SnapRealm*> q;
4639 q.push_back(realm);
4640
4641 while (!q.empty()) {
4642 realm = q.front();
4643 q.pop_front();
4644
11fdf7f2 4645 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4646 realm->invalidate_cache();
4647
4648 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4649 p != realm->pchildren.end();
4650 ++p)
4651 q.push_back(*p);
4652 }
4653}
4654
4655SnapRealm *Client::get_snap_realm(inodeno_t r)
4656{
4657 SnapRealm *realm = snap_realms[r];
4658 if (!realm)
4659 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4660 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4661 realm->nref++;
4662 return realm;
4663}
4664
4665SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4666{
4667 if (snap_realms.count(r) == 0) {
11fdf7f2 4668 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4669 return NULL;
4670 }
4671 SnapRealm *realm = snap_realms[r];
11fdf7f2 4672 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4673 realm->nref++;
4674 return realm;
4675}
4676
// Drop one reference on a realm; on the last reference, remove it from the
// map, detach from (and unreference) its parent, and delete it.  The parent
// put may recurse up the chain.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4690
4691bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4692{
4693 if (realm->parent != parent) {
11fdf7f2 4694 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4695 << " " << realm->parent << " -> " << parent << dendl;
4696 realm->parent = parent;
4697 if (realm->pparent) {
4698 realm->pparent->pchildren.erase(realm);
4699 put_snap_realm(realm->pparent);
4700 }
4701 realm->pparent = get_snap_realm(parent);
4702 realm->pparent->pchildren.insert(realm);
4703 return true;
4704 }
4705 return false;
4706}
4707
4708static bool has_new_snaps(const SnapContext& old_snapc,
4709 const SnapContext& new_snapc)
4710{
4711 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4712}
4713
4714
11fdf7f2 4715void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4716{
4717 SnapRealm *first_realm = NULL;
11fdf7f2 4718 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4719
4720 map<SnapRealm*, SnapContext> dirty_realms;
4721
11fdf7f2 4722 auto p = bl.cbegin();
7c673cae
FG
4723 while (!p.end()) {
4724 SnapRealmInfo info;
11fdf7f2 4725 decode(info, p);
7c673cae
FG
4726 SnapRealm *realm = get_snap_realm(info.ino());
4727
4728 bool invalidate = false;
4729
4730 if (info.seq() > realm->seq) {
11fdf7f2 4731 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4732 << dendl;
4733
4734 if (flush) {
4735 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4736 // flush me + children
4737 list<SnapRealm*> q;
4738 q.push_back(realm);
4739 while (!q.empty()) {
4740 SnapRealm *realm = q.front();
4741 q.pop_front();
4742
4743 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4744 p != realm->pchildren.end();
4745 ++p)
4746 q.push_back(*p);
4747
4748 if (dirty_realms.count(realm) == 0) {
4749 realm->nref++;
4750 dirty_realms[realm] = realm->get_snap_context();
4751 }
4752 }
4753 }
4754
4755 // update
4756 realm->seq = info.seq();
4757 realm->created = info.created();
4758 realm->parent_since = info.parent_since();
4759 realm->prior_parent_snaps = info.prior_parent_snaps;
4760 realm->my_snaps = info.my_snaps;
4761 invalidate = true;
4762 }
4763
4764 // _always_ verify parent
4765 if (adjust_realm_parent(realm, info.parent()))
4766 invalidate = true;
4767
4768 if (invalidate) {
4769 invalidate_snaprealm_and_children(realm);
11fdf7f2 4770 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4771 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4772 } else {
11fdf7f2 4773 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4774 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4775 }
7f7e6c64 4776
7c673cae
FG
4777 if (!first_realm)
4778 first_realm = realm;
4779 else
4780 put_snap_realm(realm);
4781 }
4782
7f7e6c64 4783 for (auto &[realm, snapc] : dirty_realms) {
7c673cae 4784 // if there are new snaps ?
7f7e6c64 4785 if (has_new_snaps(snapc, realm->get_snap_context())) {
7c673cae 4786 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
7f7e6c64
TL
4787 for (auto&& in : realm->inodes_with_caps) {
4788 queue_cap_snap(in, snapc);
7c673cae
FG
4789 }
4790 } else {
4791 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4792 }
4793 put_snap_realm(realm);
4794 }
4795
4796 if (realm_ret)
4797 *realm_ret = first_realm;
4798 else
4799 put_snap_realm(first_realm);
4800}
4801
// Handle an MClientSnap message from an MDS.  For a SPLIT op, inodes and
// child realms named in the message are detached from their current snap
// realm (after recording the old snap context so dirty data can be flushed
// against it) and re-attached to the newly split-off realm; then the snap
// trace carried in the message is applied.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // message from an MDS we have no session with; ignore it
    return;
  }

  got_mds_push(session);

  // inodes to re-parent into the split realm, keyed to the snap context they
  // had under their OLD realm (needed to queue cap snaps correctly below)
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// an inode already tracked by a realm newer than the split point
	// must stay where it is
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the realm updates; for DESTROY, skip flushing since the realm
  // (and its snaps) are going away
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the split realm and queue cap snaps for
    // any snaps they gained relative to their old snap context
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4872
11fdf7f2 4873void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4874{
4875 mds_rank_t mds = mds_rank_t(m->get_source().num());
4876 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4877 if (!session) {
7c673cae
FG
4878 return;
4879 }
4880
4881 got_mds_push(session);
4882
11fdf7f2 4883 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4884
4885 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4886 if (inode_map.count(vino)) {
4887 Inode *in = NULL;
4888 in = inode_map[vino];
4889
4890 if (in) {
4891 in->quota = m->quota;
4892 in->rstat = m->rstat;
4893 }
4894 }
7c673cae
FG
4895}
4896
11fdf7f2 4897void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
4898{
4899 mds_rank_t mds = mds_rank_t(m->get_source().num());
4900 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4901 if (!session) {
7c673cae
FG
4902 return;
4903 }
4904
4905 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4906 // Pause RADOS operations until we see the required epoch
4907 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4908 }
4909
4910 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4911 // Record the barrier so that we will transmit it to MDS when releasing
4912 set_cap_epoch_barrier(m->osd_epoch_barrier);
4913 }
4914
4915 got_mds_push(session);
4916
11fdf7f2 4917 Inode *in;
7c673cae 4918 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
4919 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4920 in = it->second;
4921 } else {
7c673cae 4922 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 4923 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
4924 session->enqueue_cap_release(
4925 m->get_ino(),
4926 m->get_cap_id(),
4927 m->get_seq(),
4928 m->get_mseq(),
4929 cap_epoch_barrier);
4930 } else {
11fdf7f2 4931 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 4932 }
7c673cae
FG
4933
4934 // in case the mds is waiting on e.g. a revocation
4935 flush_cap_releases();
4936 return;
4937 }
4938
4939 switch (m->get_op()) {
11fdf7f2
TL
4940 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4941 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4942 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
7c673cae
FG
4943 }
4944
11fdf7f2
TL
4945 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4946 Cap &cap = in->caps.at(mds);
7c673cae 4947
11fdf7f2
TL
4948 switch (m->get_op()) {
4949 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4950 case CEPH_CAP_OP_IMPORT:
4951 case CEPH_CAP_OP_REVOKE:
4952 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4953 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4954 }
4955 } else {
4956 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4957 return;
7c673cae
FG
4958 }
4959}
4960
// Handle a cap IMPORT: an MDS is taking over a cap for this inode (usually
// due to subtree migration).  Install/refresh our cap on the importing
// session, then drop the stale cap still held on the exporting peer.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;  // pre-existing cap on the exporting (peer) MDS, if any
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    // carry the peer cap's credentials over to the imported cap
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the peer cap only if it is still the one the message refers to
  // (add_update_cap above may have changed the caps map)
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // we are (now) the auth cap holder on this session
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      // any outstanding max_size request predates the import; start over
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5003
// Handle a cap EXPORT: our cap on this inode is migrating away from 'mds'.
// If a migration peer is named, fold our issued caps into the peer's cap (or
// create one there); otherwise the caps are simply dropped and we remember
// that via I_CAP_DROPPED.  In all handled cases the local cap is removed.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // only act if the message refers to the cap we actually hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// migration target is known
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // merge into the peer cap only if it matches the migration and is
	  // older than the migrated state (seq comparison)
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    // keep the flushing bookkeeping attached to the auth session
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // no cap on the peer yet: create one carrying our issued bits
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: caps are being dropped; note it so we can re-request later
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5048
11fdf7f2 5049void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5050{
5051 mds_rank_t mds = session->mds_num;
11fdf7f2 5052 ceph_assert(in->caps.count(mds));
7c673cae 5053
11fdf7f2 5054 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
5055 << " size " << in->size << " -> " << m->get_size()
5056 << dendl;
5057
1adf2230
AA
5058 int issued;
5059 in->caps_issued(&issued);
5060 issued |= in->caps_dirty();
5061 update_inode_file_size(in, issued, m->get_size(),
5062 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
5063}
5064
11fdf7f2 5065void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5066{
5067 ceph_tid_t flush_ack_tid = m->get_client_tid();
5068 int dirty = m->get_dirty();
5069 int cleaned = 0;
5070 int flushed = 0;
5071
11fdf7f2
TL
5072 auto it = in->flushing_cap_tids.begin();
5073 if (it->first < flush_ack_tid) {
5074 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5075 << " got unexpected flush ack tid " << flush_ack_tid
5076 << " expected is " << it->first << dendl;
5077 }
5078 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5079 if (!it->second) {
5080 // cap snap
5081 ++it;
5082 continue;
5083 }
7c673cae
FG
5084 if (it->first == flush_ack_tid)
5085 cleaned = it->second;
5086 if (it->first <= flush_ack_tid) {
5087 session->flushing_caps_tids.erase(it->first);
5088 in->flushing_cap_tids.erase(it++);
5089 ++flushed;
5090 continue;
5091 }
5092 cleaned &= ~it->second;
5093 if (!cleaned)
5094 break;
5095 ++it;
5096 }
5097
11fdf7f2 5098 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5099 << " cleaned " << ccap_string(cleaned) << " on " << *in
5100 << " with " << ccap_string(dirty) << dendl;
5101
5102 if (flushed) {
5103 signal_cond_list(in->waitfor_caps);
5104 if (session->flushing_caps_tids.empty() ||
5105 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5106 sync_cond.notify_all();
7c673cae
FG
5107 }
5108
5109 if (!dirty) {
5110 in->cap_dirtier_uid = -1;
5111 in->cap_dirtier_gid = -1;
5112 }
5113
5114 if (!cleaned) {
5115 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5116 } else {
5117 if (in->flushing_caps) {
5118 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5119 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5120 in->flushing_caps &= ~cleaned;
5121 if (in->flushing_caps == 0) {
5122 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5123 num_flushing_caps--;
eafe8130 5124 if (in->flushing_cap_tids.empty())
7c673cae
FG
5125 in->flushing_cap_item.remove_myself();
5126 }
5127 if (!in->caps_dirty())
5128 put_inode(in);
5129 }
5130 }
7c673cae
FG
5131}
5132
5133
// Handle a FLUSHSNAP_ACK: the MDS has persisted the cap snap whose snapid
// follows 'follows'.  Retire the matching CapSnap and its flush tid, and wake
// waiters.  Duplicate acks (e.g. after resend) are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older flush attempt of this capsnap; ignore
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive: erasing the capsnap below may drop its last ref
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake syncfs waiters once this session has nothing older in flight
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5166
5167class C_Client_DentryInvalidate : public Context {
5168private:
5169 Client *client;
5170 vinodeno_t dirino;
5171 vinodeno_t ino;
5172 string name;
5173public:
5174 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5175 client(c), name(dn->name) {
5176 if (client->use_faked_inos()) {
5177 dirino.ino = dn->dir->parent_inode->faked_ino;
5178 if (del)
5179 ino.ino = dn->inode->faked_ino;
5180 } else {
5181 dirino = dn->dir->parent_inode->vino();
5182 if (del)
5183 ino = dn->inode->vino();
5184 }
5185 if (!del)
5186 ino.ino = inodeno_t();
5187 }
5188 void finish(int r) override {
5189 // _async_dentry_invalidate is responsible for its own locking
9f95a23c 5190 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
5191 client->_async_dentry_invalidate(dirino, ino, name);
5192 }
5193};
5194
5195void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5196{
5197 if (unmounting)
5198 return;
11fdf7f2 5199 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae 5200 << " in dir " << dirino << dendl;
e306af50 5201 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
7c673cae
FG
5202}
5203
5204void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5205{
5206 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5207 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5208}
5209
// Try to drop every reference pinning 'in' so it can be trimmed from cache:
// expire child dentries (recursing into snapshot subtrees), close an empty
// Dir, trim an open snapdir, and finally unlink the inode's own dentries,
// optionally scheduling kernel dcache invalidations for them.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink(): unlink may erase the current map entry
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released one ref on 'in'
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // trim the .snap directory hanging off this inode as well
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // still pinned: drop the dentries pointing AT this inode
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      // advance before unlink(): unlink removes dn from in->dentries
      ++q;
      if( in->ll_ref > 0 && sched_inval) {
        // FIXME: we play lots of unlink/link tricks when handling MDS replies,
        // so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5256
// Handle a cap GRANT/REVOKE (also reached for IMPORT after the cap is
// installed).  Applies the MDS-supplied metadata to the inode according to
// which caps we hold, then reconciles our issued cap bits with the new set:
// flushing/releasing cached data on revocation and deciding whether a
// check_caps() round-trip to the MDS is needed.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // stale: the session's cap generation moved past this cap's (e.g. after a
  // session timeout/renew), so whatever we thought was issued is suspect
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only accept MDS-side values for fields we do not hold the EXCL cap on
  // (otherwise our local, dirty copy is authoritative)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      // the grant already covers what we asked for; clear the request state
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // losing the page cache cap: drop cached data, then respond
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5412
7c673cae
FG
5413int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5414{
5415 if (perms.uid() == 0)
5416 return 0;
5417
5418 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5419 int ret = _posix_acl_permission(in, perms, want);
5420 if (ret != -EAGAIN)
5421 return ret;
5422 }
5423
5424 // check permissions before doing anything else
5425 if (!in->check_mode(perms, want))
5426 return -EACCES;
5427 return 0;
5428}
5429
5430int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5431 const UserPerm& perms)
5432{
5433 int r = _getattr_for_perm(in, perms);
5434 if (r < 0)
5435 goto out;
5436
5437 r = 0;
5438 if (strncmp(name, "system.", 7) == 0) {
5439 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5440 r = -EPERM;
5441 } else {
5442 r = inode_permission(in, perms, want);
5443 }
5444out:
1adf2230 5445 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5446 return r;
5447}
5448
5449ostream& operator<<(ostream &out, const UserPerm& perm) {
5450 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5451 return out;
5452}
5453
// Permission check for a setattr: mirrors POSIX chown/chmod/utimes rules.
// Returns 0 if every attribute change named in 'mask' is allowed for
// 'perms', negative errno otherwise.  May clear S_ISGID in stx->stx_mode
// (setgid is silently dropped for non-root callers outside the group).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may actually change the owner; the owner may "chown" to self
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may change group only to a group they belong to (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers outside the (target) group lose the setgid bit
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owner: explicit timestamps are forbidden, but setting
      // mtime/atime to "now" only needs write permission (like touch(1))
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5510
5511int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5512{
181888fb 5513 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5514 unsigned want = 0;
5515
5516 if ((flags & O_ACCMODE) == O_WRONLY)
5517 want = MAY_WRITE;
5518 else if ((flags & O_ACCMODE) == O_RDWR)
5519 want = MAY_READ | MAY_WRITE;
5520 else if ((flags & O_ACCMODE) == O_RDONLY)
5521 want = MAY_READ;
5522 if (flags & O_TRUNC)
5523 want |= MAY_WRITE;
5524
5525 int r = 0;
5526 switch (in->mode & S_IFMT) {
5527 case S_IFLNK:
5528 r = -ELOOP;
5529 goto out;
5530 case S_IFDIR:
5531 if (want & MAY_WRITE) {
5532 r = -EISDIR;
5533 goto out;
5534 }
5535 break;
5536 }
5537
5538 r = _getattr_for_perm(in, perms);
5539 if (r < 0)
5540 goto out;
5541
5542 r = inode_permission(in, perms, want);
5543out:
5544 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5545 return r;
5546}
5547
5548int Client::may_lookup(Inode *dir, const UserPerm& perms)
5549{
181888fb 5550 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5551 int r = _getattr_for_perm(dir, perms);
5552 if (r < 0)
5553 goto out;
5554
5555 r = inode_permission(dir, perms, MAY_EXEC);
5556out:
5557 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5558 return r;
5559}
5560
5561int Client::may_create(Inode *dir, const UserPerm& perms)
5562{
181888fb 5563 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5564 int r = _getattr_for_perm(dir, perms);
5565 if (r < 0)
5566 goto out;
5567
5568 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5569out:
5570 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5571 return r;
5572}
5573
5574int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5575{
181888fb 5576 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5577 int r = _getattr_for_perm(dir, perms);
5578 if (r < 0)
5579 goto out;
5580
5581 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5582 if (r < 0)
5583 goto out;
5584
5585 /* 'name == NULL' means rmsnap */
5586 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5587 InodeRef otherin;
5588 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5589 if (r < 0)
5590 goto out;
5591 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5592 r = -EPERM;
5593 }
5594out:
5595 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5596 return r;
5597}
5598
5599int Client::may_hardlink(Inode *in, const UserPerm& perms)
5600{
181888fb 5601 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5602 int r = _getattr_for_perm(in, perms);
5603 if (r < 0)
5604 goto out;
5605
5606 if (perms.uid() == 0 || perms.uid() == in->uid) {
5607 r = 0;
5608 goto out;
5609 }
5610
5611 r = -EPERM;
5612 if (!S_ISREG(in->mode))
5613 goto out;
5614
5615 if (in->mode & S_ISUID)
5616 goto out;
5617
5618 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5619 goto out;
5620
5621 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5622out:
5623 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5624 return r;
5625}
5626
5627int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5628{
5629 int mask = CEPH_STAT_CAP_MODE;
5630 bool force = false;
5631 if (acl_type != NO_ACL) {
5632 mask |= CEPH_STAT_CAP_XATTR;
5633 force = in->xattr_version == 0;
5634 }
5635 return _getattr(in, mask, perms, force);
5636}
5637
5638vinodeno_t Client::_get_vino(Inode *in)
5639{
5640 /* The caller must hold the client lock */
5641 return vinodeno_t(in->ino, in->snapid);
5642}
5643
7c673cae
FG
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 *
 * Resolution order: role (rank / fs:rank) first, then numeric GID, then the
 * '*' wildcard, and finally a daemon name lookup.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    // NOTE(review): relies on parse_role() only succeeding for roles present
    // in the FSMap, so get_filesystem()/get_info() cannot miss here --
    // confirm against FSMap::parse_role's contract.
    targets->push_back(
	fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5715
5716
/**
 * Authenticate with mon and establish global ID
 *
 * Must be called with client_lock held.  The lock is dropped around the
 * blocking monitor round-trip and re-taken afterwards.  On success the
 * monitor-assigned global id becomes our client entity name.
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on monitor I/O; release the client lock
  // so other client threads can make progress while we wait
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5740
// Fetch the latest FSMap (full map, or the trimmed "fsmap.user" variant when
// 'user' is true) from the monitors.  Blocks — dropping client_lock around
// the monitor round-trip — until our cached map is at least as new as the
// monitors' latest version.  Returns 0 on success or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the lock while waiting on the monitor; -EAGAIN means retry
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until handle_fsmap_user catches us up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full FSMap, delivered via handle_fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5784
5785/**
5786 *
5787 * @mds_spec one of ID, rank, GID, "*"
5788 *
5789 */
5790int Client::mds_command(
5791 const std::string &mds_spec,
5792 const vector<string>& cmd,
5793 const bufferlist& inbl,
5794 bufferlist *outbl,
5795 string *outs,
5796 Context *onfinish)
5797{
11fdf7f2 5798 std::lock_guard lock(client_lock);
7c673cae 5799
181888fb
FG
5800 if (!initialized)
5801 return -ENOTCONN;
7c673cae
FG
5802
5803 int r;
5804 r = authenticate();
5805 if (r < 0) {
5806 return r;
5807 }
5808
5809 r = fetch_fsmap(false);
5810 if (r < 0) {
5811 return r;
5812 }
5813
5814 // Look up MDS target(s) of the command
5815 std::vector<mds_gid_t> targets;
5816 r = resolve_mds(mds_spec, &targets);
5817 if (r < 0) {
5818 return r;
5819 }
5820
5821 // If daemons are laggy, we won't send them commands. If all
5822 // are laggy then we fail.
5823 std::vector<mds_gid_t> non_laggy;
5824 for (const auto gid : targets) {
5825 const auto info = fsmap->get_info_gid(gid);
5826 if (!info.laggy()) {
5827 non_laggy.push_back(gid);
5828 }
5829 }
5830 if (non_laggy.size() == 0) {
5831 *outs = "All targeted MDS daemons are laggy";
5832 return -ENOENT;
5833 }
5834
5835 if (metadata.empty()) {
5836 // We are called on an unmounted client, so metadata
5837 // won't be initialized yet.
5838 populate_metadata("");
5839 }
5840
5841 // Send commands to targets
5842 C_GatherBuilder gather(cct, onfinish);
5843 for (const auto target_gid : non_laggy) {
5844 const auto info = fsmap->get_info_gid(target_gid);
5845
5846 // Open a connection to the target MDS
11fdf7f2 5847 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
7c673cae
FG
5848
5849 // Generate MDSCommandOp state
5850 auto &op = command_table.start_command();
5851
5852 op.on_finish = gather.new_sub();
5853 op.cmd = cmd;
5854 op.outbl = outbl;
5855 op.outs = outs;
5856 op.inbl = inbl;
5857 op.mds_gid = target_gid;
5858 op.con = conn;
5859
5860 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5861 << " tid=" << op.tid << cmd << dendl;
5862
5863 // Construct and send MCommand
11fdf7f2
TL
5864 auto m = op.get_message(monclient->get_fsid());
5865 conn->send_message2(std::move(m));
7c673cae
FG
5866 }
5867 gather.activate();
5868
5869 return 0;
5870}
5871
11fdf7f2 5872void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5873{
5874 ceph_tid_t const tid = m->get_tid();
5875
5876 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5877
5878 if (!command_table.exists(tid)) {
5879 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5880 return;
5881 }
5882
5883 auto &op = command_table.get_command(tid);
5884 if (op.outbl) {
11fdf7f2 5885 *op.outbl = m->get_data();
7c673cae
FG
5886 }
5887 if (op.outs) {
5888 *op.outs = m->rs;
5889 }
5890
5891 if (op.on_finish) {
5892 op.on_finish->complete(m->r);
5893 }
5894
5895 command_table.erase(tid);
7c673cae
FG
5896}
5897
5898// -------------------
5899// MOUNT
5900
11fdf7f2 5901int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 5902{
7c673cae
FG
5903 int r = authenticate();
5904 if (r < 0) {
5905 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5906 return r;
5907 }
5908
11fdf7f2
TL
5909 std::string resolved_fs_name;
5910 if (fs_name.empty()) {
9f95a23c
TL
5911 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5912 if (resolved_fs_name.empty())
5913 // Try the backwards compatibility fs name option
5914 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
5915 } else {
5916 resolved_fs_name = fs_name;
5917 }
5918
7c673cae 5919 std::string want = "mdsmap";
11fdf7f2 5920 if (!resolved_fs_name.empty()) {
7c673cae
FG
5921 r = fetch_fsmap(true);
5922 if (r < 0)
5923 return r;
11fdf7f2
TL
5924 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5925 if (fscid == FS_CLUSTER_ID_NONE) {
7c673cae 5926 return -ENOENT;
11fdf7f2 5927 }
7c673cae
FG
5928
5929 std::ostringstream oss;
11fdf7f2 5930 oss << want << "." << fscid;
7c673cae
FG
5931 want = oss.str();
5932 }
5933 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5934
5935 monclient->sub_want(want, 0, 0);
5936 monclient->renew_subs();
5937
11fdf7f2
TL
5938 return 0;
5939}
5940
5941int Client::mount(const std::string &mount_root, const UserPerm& perms,
5942 bool require_mds, const std::string &fs_name)
5943{
5944 std::lock_guard lock(client_lock);
5945
5946 if (mounted) {
5947 ldout(cct, 5) << "already mounted" << dendl;
5948 return 0;
5949 }
5950
5951 unmounting = false;
5952
5953 int r = subscribe_mdsmap(fs_name);
5954 if (r < 0) {
5955 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
5956 return r;
5957 }
5958
7c673cae
FG
5959 tick(); // start tick
5960
5961 if (require_mds) {
5962 while (1) {
5963 auto availability = mdsmap->is_cluster_available();
5964 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5965 // Error out
5966 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5967 return CEPH_FUSE_NO_MDS_UP;
5968 } else if (availability == MDSMap::AVAILABLE) {
5969 // Continue to mount
5970 break;
5971 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5972 // Else, wait. MDSMonitor will update the map to bring
5973 // us to a conclusion eventually.
5974 wait_on_list(waiting_for_mdsmap);
5975 } else {
5976 // Unexpected value!
5977 ceph_abort();
5978 }
5979 }
5980 }
5981
5982 populate_metadata(mount_root.empty() ? "/" : mount_root);
5983
5984 filepath fp(CEPH_INO_ROOT);
5985 if (!mount_root.empty()) {
5986 fp = filepath(mount_root.c_str());
5987 }
5988 while (true) {
5989 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5990 req->set_filepath(fp);
5991 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5992 int res = make_request(req, perms);
5993 if (res < 0) {
5994 if (res == -EACCES && root) {
5995 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5996 break;
5997 }
5998 return res;
5999 }
6000
6001 if (fp.depth())
6002 fp.pop_dentry();
6003 else
6004 break;
6005 }
6006
11fdf7f2 6007 ceph_assert(root);
7c673cae
FG
6008 _ll_get(root);
6009
6010 mounted = true;
6011
6012 // trace?
6013 if (!cct->_conf->client_trace.empty()) {
6014 traceout.open(cct->_conf->client_trace.c_str());
6015 if (traceout.is_open()) {
6016 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6017 } else {
6018 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6019 }
6020 }
6021
6022 /*
6023 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6024 ldout(cct, 3) << "op: struct stat st;" << dendl;
6025 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6026 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6027 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6028 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6029 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6030 ldout(cct, 3) << "op: int fd;" << dendl;
6031 */
6032 return 0;
6033}
6034
6035// UNMOUNT
6036
6037void Client::_close_sessions()
6038{
f6b5b4d7
TL
6039 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6040 if (it->second.state == MetaSession::STATE_REJECTED)
6041 mds_sessions.erase(it++);
6042 else
6043 ++it;
6044 }
6045
7c673cae
FG
6046 while (!mds_sessions.empty()) {
6047 // send session closes!
11fdf7f2
TL
6048 for (auto &p : mds_sessions) {
6049 if (p.second.state != MetaSession::STATE_CLOSING) {
6050 _close_mds_session(&p.second);
f6b5b4d7 6051 mds_ranks_closing.insert(p.first);
7c673cae
FG
6052 }
6053 }
6054
6055 // wait for sessions to close
f6b5b4d7
TL
6056 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6057 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6058 << timo << "s)" << dendl;
9f95a23c 6059 std::unique_lock l{client_lock, std::adopt_lock};
f6b5b4d7
TL
6060 if (!timo) {
6061 mount_cond.wait(l);
6062 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6063 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6064 while (!mds_ranks_closing.empty()) {
6065 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6066 // this prunes entry from mds_sessions and mds_ranks_closing
6067 _closed_mds_session(&session, -ETIMEDOUT);
6068 }
6069 }
6070
6071 mds_ranks_closing.clear();
9f95a23c 6072 l.release();
7c673cae
FG
6073 }
6074}
6075
31f18b77
FG
6076void Client::flush_mdlog_sync()
6077{
6078 if (mds_requests.empty())
6079 return;
11fdf7f2
TL
6080 for (auto &p : mds_sessions) {
6081 flush_mdlog(&p.second);
31f18b77
FG
6082 }
6083}
6084
6085void Client::flush_mdlog(MetaSession *session)
6086{
6087 // Only send this to Luminous or newer MDS daemons, older daemons
6088 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6089 const uint64_t features = session->con->get_features();
6090 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
9f95a23c 6091 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
11fdf7f2 6092 session->con->send_message2(std::move(m));
31f18b77
FG
6093 }
6094}
6095
6096
11fdf7f2
TL
6097void Client::_abort_mds_sessions(int err)
6098{
6099 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6100 auto req = p->second;
6101 ++p;
6102 // unsafe requests will be removed during close session below.
6103 if (req->got_unsafe)
6104 continue;
6105
6106 req->abort(err);
6107 if (req->caller_cond) {
6108 req->kick = true;
9f95a23c 6109 req->caller_cond->notify_all();
11fdf7f2
TL
6110 }
6111 }
6112
6113 // Process aborts on any requests that were on this waitlist.
6114 // Any requests that were on a waiting_for_open session waitlist
6115 // will get kicked during close session below.
6116 signal_cond_list(waiting_for_mdsmap);
6117
6118 // Force-close all sessions
6119 while(!mds_sessions.empty()) {
6120 auto& session = mds_sessions.begin()->second;
f6b5b4d7 6121 _closed_mds_session(&session, err);
11fdf7f2
TL
6122 }
6123}
6124
6125void Client::_unmount(bool abort)
7c673cae 6126{
9f95a23c 6127 std::unique_lock lock{client_lock, std::adopt_lock};
181888fb
FG
6128 if (unmounting)
6129 return;
7c673cae 6130
11fdf7f2
TL
6131 if (abort || blacklisted) {
6132 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
6133 } else {
6134 ldout(cct, 2) << "unmounting" << dendl;
6135 }
7c673cae
FG
6136 unmounting = true;
6137
b32b8144
FG
6138 deleg_timeout = 0;
6139
11fdf7f2
TL
6140 if (abort) {
6141 // Abort all mds sessions
6142 _abort_mds_sessions(-ENOTCONN);
6143
6144 objecter->op_cancel_writes(-ENOTCONN);
6145 } else {
6146 // flush the mdlog for pending requests, if any
6147 flush_mdlog_sync();
6148 }
6149
9f95a23c
TL
6150 mount_cond.wait(lock, [this] {
6151 if (!mds_requests.empty()) {
6152 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6153 << dendl;
6154 }
6155 return mds_requests.empty();
6156 });
7c673cae
FG
6157 if (tick_event)
6158 timer.cancel_event(tick_event);
6159 tick_event = 0;
6160
6161 cwd.reset();
6162
6163 // clean up any unclosed files
6164 while (!fd_map.empty()) {
6165 Fh *fh = fd_map.begin()->second;
6166 fd_map.erase(fd_map.begin());
6167 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6168 _release_fh(fh);
6169 }
6170
6171 while (!ll_unclosed_fh_set.empty()) {
6172 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6173 Fh *fh = *it;
6174 ll_unclosed_fh_set.erase(fh);
6175 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6176 _release_fh(fh);
6177 }
6178
6179 while (!opened_dirs.empty()) {
6180 dir_result_t *dirp = *opened_dirs.begin();
6181 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6182 _closedir(dirp);
6183 }
6184
6185 _ll_drop_pins();
6186
9f95a23c
TL
6187 mount_cond.wait(lock, [this] {
6188 if (unsafe_sync_write > 0) {
6189 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
6190 << dendl;
6191 }
6192 return unsafe_sync_write <= 0;
6193 });
7c673cae
FG
6194
6195 if (cct->_conf->client_oc) {
6196 // flush/release all buffered data
11fdf7f2
TL
6197 std::list<InodeRef> anchor;
6198 for (auto& p : inode_map) {
6199 Inode *in = p.second;
7c673cae 6200 if (!in) {
11fdf7f2
TL
6201 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6202 ceph_assert(in);
7c673cae 6203 }
11fdf7f2
TL
6204
6205 // prevent inode from getting freed
6206 anchor.emplace_back(in);
6207
6208 if (abort || blacklisted) {
6209 objectcacher->purge_set(&in->oset);
6210 } else if (!in->caps.empty()) {
7c673cae
FG
6211 _release(in);
6212 _flush(in, new C_Client_FlushComplete(this, in));
6213 }
6214 }
6215 }
6216
11fdf7f2
TL
6217 if (abort || blacklisted) {
6218 for (auto p = dirty_list.begin(); !p.end(); ) {
6219 Inode *in = *p;
6220 ++p;
6221 if (in->dirty_caps) {
6222 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6223 in->mark_caps_clean();
6224 put_inode(in);
6225 }
6226 }
6227 } else {
6228 flush_caps_sync();
6229 wait_sync_caps(last_flush_tid);
6230 }
7c673cae
FG
6231
6232 // empty lru cache
7c673cae
FG
6233 trim_cache();
6234
6235 while (lru.lru_get_size() > 0 ||
6236 !inode_map.empty()) {
6237 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6238 << "+" << inode_map.size() << " items"
6239 << ", waiting (for caps to release?)"
6240 << dendl;
9f95a23c
TL
6241 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6242 r == std::cv_status::timeout) {
7c673cae
FG
6243 dump_cache(NULL);
6244 }
6245 }
11fdf7f2
TL
6246 ceph_assert(lru.lru_get_size() == 0);
6247 ceph_assert(inode_map.empty());
7c673cae
FG
6248
6249 // stop tracing
6250 if (!cct->_conf->client_trace.empty()) {
6251 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6252 traceout.close();
6253 }
6254
6255 _close_sessions();
6256
6257 mounted = false;
6258
9f95a23c 6259 lock.release();
7c673cae
FG
6260 ldout(cct, 2) << "unmounted." << dendl;
6261}
6262
b32b8144
FG
6263void Client::unmount()
6264{
11fdf7f2
TL
6265 std::lock_guard lock(client_lock);
6266 _unmount(false);
6267}
6268
6269void Client::abort_conn()
6270{
6271 std::lock_guard lock(client_lock);
6272 _unmount(true);
b32b8144
FG
6273}
6274
7c673cae
FG
6275void Client::flush_cap_releases()
6276{
6277 // send any cap releases
11fdf7f2
TL
6278 for (auto &p : mds_sessions) {
6279 auto &session = p.second;
6280 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6281 p.first)) {
7c673cae
FG
6282 if (cct->_conf->client_inject_release_failure) {
6283 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6284 } else {
11fdf7f2 6285 session.con->send_message2(std::move(session.release));
7c673cae 6286 }
11fdf7f2 6287 session.release.reset();
7c673cae
FG
6288 }
6289 }
6290}
6291
6292void Client::tick()
6293{
6294 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6295 sleep(cct->_conf->client_debug_inject_tick_delay);
11fdf7f2
TL
6296 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6297 cct->_conf.apply_changes(nullptr);
7c673cae
FG
6298 }
6299
6300 ldout(cct, 21) << "tick" << dendl;
3efd9988
FG
6301 tick_event = timer.add_event_after(
6302 cct->_conf->client_tick_interval,
9f95a23c 6303 new LambdaContext([this](int) {
3efd9988 6304 // Called back via Timer, which takes client_lock for us
9f95a23c 6305 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3efd9988
FG
6306 tick();
6307 }));
7c673cae
FG
6308 utime_t now = ceph_clock_now();
6309
6310 if (!mounted && !mds_requests.empty()) {
6311 MetaRequest *req = mds_requests.begin()->second;
6312 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6313 req->abort(-ETIMEDOUT);
6314 if (req->caller_cond) {
6315 req->kick = true;
9f95a23c 6316 req->caller_cond->notify_all();
7c673cae
FG
6317 }
6318 signal_cond_list(waiting_for_mdsmap);
11fdf7f2
TL
6319 for (auto &p : mds_sessions) {
6320 signal_context_list(p.second.waiting_for_open);
6321 }
7c673cae
FG
6322 }
6323 }
6324
6325 if (mdsmap->get_epoch()) {
6326 // renew caps?
6327 utime_t el = now - last_cap_renew;
6328 if (el > mdsmap->get_session_timeout() / 3.0)
6329 renew_caps();
6330
6331 flush_cap_releases();
6332 }
6333
6334 // delayed caps
28e407b8 6335 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
6336 while (!p.end()) {
6337 Inode *in = *p;
6338 ++p;
6339 if (in->hold_caps_until > now)
6340 break;
28e407b8 6341 delayed_list.pop_front();
7c673cae
FG
6342 check_caps(in, CHECK_CAPS_NODELAY);
6343 }
6344
6345 trim_cache(true);
f6b5b4d7
TL
6346
6347 if (blacklisted && mounted &&
6348 last_auto_reconnect + 30 * 60 < now &&
6349 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6350 messenger->client_reset();
6351 fd_gen++; // invalidate open files
6352 blacklisted = false;
6353 _kick_stale_sessions();
6354 last_auto_reconnect = now;
6355 }
7c673cae
FG
6356}
6357
6358void Client::renew_caps()
6359{
6360 ldout(cct, 10) << "renew_caps()" << dendl;
6361 last_cap_renew = ceph_clock_now();
6362
11fdf7f2
TL
6363 for (auto &p : mds_sessions) {
6364 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6365 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6366 renew_caps(&p.second);
7c673cae
FG
6367 }
6368}
6369
6370void Client::renew_caps(MetaSession *session)
6371{
6372 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6373 session->last_cap_renew_request = ceph_clock_now();
6374 uint64_t seq = ++session->cap_renew_seq;
9f95a23c 6375 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6376}
6377
6378
6379// ===============================================================
6380// high level (POSIXy) interface
6381
6382int Client::_do_lookup(Inode *dir, const string& name, int mask,
6383 InodeRef *target, const UserPerm& perms)
6384{
6385 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6386 MetaRequest *req = new MetaRequest(op);
6387 filepath path;
6388 dir->make_nosnap_relative_path(path);
6389 path.push_dentry(name);
6390 req->set_filepath(path);
6391 req->set_inode(dir);
6392 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6393 mask |= DEBUG_GETATTR_CAPS;
6394 req->head.args.getattr.mask = mask;
6395
11fdf7f2 6396 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6397
6398 int r = make_request(req, perms, target);
11fdf7f2 6399 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6400 return r;
6401}
6402
6403int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6404 const UserPerm& perms)
6405{
6406 int r = 0;
6407 Dentry *dn = NULL;
f91f0fd5
TL
6408 // can only request shared caps
6409 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
7c673cae 6410
7c673cae 6411 if (dname == "..") {
11fdf7f2
TL
6412 if (dir->dentries.empty()) {
6413 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6414 filepath path(dir->ino);
6415 req->set_filepath(path);
6416
6417 InodeRef tmptarget;
6418 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6419
6420 if (r == 0) {
f91f0fd5 6421 *target = std::move(tmptarget);
11fdf7f2
TL
6422 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6423 } else {
6424 *target = dir;
6425 }
6426 }
7c673cae
FG
6427 else
6428 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6429 goto done;
6430 }
6431
6432 if (dname == ".") {
6433 *target = dir;
6434 goto done;
6435 }
6436
11fdf7f2
TL
6437 if (!dir->is_dir()) {
6438 r = -ENOTDIR;
6439 goto done;
6440 }
6441
7c673cae
FG
6442 if (dname.length() > NAME_MAX) {
6443 r = -ENAMETOOLONG;
6444 goto done;
6445 }
6446
6447 if (dname == cct->_conf->client_snapdir &&
6448 dir->snapid == CEPH_NOSNAP) {
6449 *target = open_snapdir(dir);
6450 goto done;
6451 }
6452
6453 if (dir->dir &&
6454 dir->dir->dentries.count(dname)) {
6455 dn = dir->dir->dentries[dname];
6456
11fdf7f2 6457 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
7c673cae
FG
6458 << " seq " << dn->lease_seq
6459 << dendl;
6460
94b18763 6461 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
6462 // is dn lease valid?
6463 utime_t now = ceph_clock_now();
6464 if (dn->lease_mds >= 0 &&
6465 dn->lease_ttl > now &&
6466 mds_sessions.count(dn->lease_mds)) {
11fdf7f2
TL
6467 MetaSession &s = mds_sessions.at(dn->lease_mds);
6468 if (s.cap_ttl > now &&
6469 s.cap_gen == dn->lease_gen) {
7c673cae
FG
6470 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6471 // make trim_caps() behave.
6472 dir->try_touch_cap(dn->lease_mds);
6473 goto hit_dn;
6474 }
11fdf7f2 6475 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
7c673cae
FG
6476 << " vs lease_gen " << dn->lease_gen << dendl;
6477 }
92f5a8d4 6478 // dir shared caps?
94b18763 6479 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7c673cae 6480 if (dn->cap_shared_gen == dir->shared_gen &&
94b18763 6481 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
7c673cae
FG
6482 goto hit_dn;
6483 if (!dn->inode && (dir->flags & I_COMPLETE)) {
11fdf7f2 6484 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
7c673cae
FG
6485 << *dir << " dn '" << dname << "'" << dendl;
6486 return -ENOENT;
6487 }
6488 }
6489 } else {
6490 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6491 }
6492 } else {
6493 // can we conclude ENOENT locally?
94b18763 6494 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
7c673cae 6495 (dir->flags & I_COMPLETE)) {
11fdf7f2 6496 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
7c673cae
FG
6497 return -ENOENT;
6498 }
6499 }
6500
6501 r = _do_lookup(dir, dname, mask, target, perms);
6502 goto done;
6503
6504 hit_dn:
6505 if (dn->inode) {
6506 *target = dn->inode;
6507 } else {
6508 r = -ENOENT;
6509 }
6510 touch_dn(dn);
6511
6512 done:
6513 if (r < 0)
11fdf7f2 6514 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
7c673cae 6515 else
11fdf7f2 6516 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
7c673cae
FG
6517 return r;
6518}
6519
6520int Client::get_or_create(Inode *dir, const char* name,
6521 Dentry **pdn, bool expect_null)
6522{
6523 // lookup
11fdf7f2 6524 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6525 dir->open_dir();
6526 if (dir->dir->dentries.count(name)) {
6527 Dentry *dn = dir->dir->dentries[name];
6528
6529 // is dn lease valid?
6530 utime_t now = ceph_clock_now();
6531 if (dn->inode &&
6532 dn->lease_mds >= 0 &&
6533 dn->lease_ttl > now &&
6534 mds_sessions.count(dn->lease_mds)) {
11fdf7f2
TL
6535 MetaSession &s = mds_sessions.at(dn->lease_mds);
6536 if (s.cap_ttl > now &&
6537 s.cap_gen == dn->lease_gen) {
7c673cae
FG
6538 if (expect_null)
6539 return -EEXIST;
6540 }
6541 }
6542 *pdn = dn;
6543 } else {
6544 // otherwise link up a new one
6545 *pdn = link(dir->dir, name, NULL, NULL);
6546 }
6547
6548 // success
6549 return 0;
6550}
6551
6552int Client::path_walk(const filepath& origpath, InodeRef *end,
6553 const UserPerm& perms, bool followsym, int mask)
6554{
6555 filepath path = origpath;
6556 InodeRef cur;
6557 if (origpath.absolute())
6558 cur = root;
6559 else
6560 cur = cwd;
11fdf7f2 6561 ceph_assert(cur);
7c673cae 6562
11fdf7f2 6563 ldout(cct, 10) << __func__ << " " << path << dendl;
7c673cae
FG
6564
6565 int symlinks = 0;
6566
6567 unsigned i=0;
6568 while (i < path.depth() && cur) {
6569 int caps = 0;
6570 const string &dname = path[i];
6571 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6572 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6573 InodeRef next;
6574 if (cct->_conf->client_permissions) {
6575 int r = may_lookup(cur.get(), perms);
6576 if (r < 0)
6577 return r;
6578 caps = CEPH_CAP_AUTH_SHARED;
6579 }
6580
6581 /* Get extra requested caps on the last component */
6582 if (i == (path.depth() - 1))
6583 caps |= mask;
6584 int r = _lookup(cur.get(), dname, caps, &next, perms);
6585 if (r < 0)
6586 return r;
6587 // only follow trailing symlink if followsym. always follow
6588 // 'directory' symlinks.
6589 if (next && next->is_symlink()) {
6590 symlinks++;
6591 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6592 if (symlinks > MAXSYMLINKS) {
6593 return -ELOOP;
6594 }
6595
6596 if (i < path.depth() - 1) {
6597 // dir symlink
6598 // replace consumed components of path with symlink dir target
6599 filepath resolved(next->symlink.c_str());
6600 resolved.append(path.postfixpath(i + 1));
6601 path = resolved;
6602 i = 0;
6603 if (next->symlink[0] == '/') {
6604 cur = root;
6605 }
6606 continue;
6607 } else if (followsym) {
6608 if (next->symlink[0] == '/') {
6609 path = next->symlink.c_str();
6610 i = 0;
6611 // reset position
6612 cur = root;
6613 } else {
6614 filepath more(next->symlink.c_str());
6615 // we need to remove the symlink component from off of the path
6616 // before adding the target that the symlink points to. remain
6617 // at the same position in the path.
6618 path.pop_dentry();
6619 path.append(more);
6620 }
6621 continue;
6622 }
6623 }
6624 cur.swap(next);
6625 i++;
6626 }
6627 if (!cur)
6628 return -ENOENT;
6629 if (end)
6630 end->swap(cur);
6631 return 0;
6632}
6633
6634
6635// namespace ops
6636
6637int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6638{
11fdf7f2 6639 std::lock_guard lock(client_lock);
7c673cae
FG
6640 tout(cct) << "link" << std::endl;
6641 tout(cct) << relexisting << std::endl;
6642 tout(cct) << relpath << std::endl;
6643
181888fb
FG
6644 if (unmounting)
6645 return -ENOTCONN;
6646
7c673cae
FG
6647 filepath existing(relexisting);
6648
6649 InodeRef in, dir;
6650 int r = path_walk(existing, &in, perm, true);
6651 if (r < 0)
6652 return r;
6653 if (std::string(relpath) == "/") {
6654 r = -EEXIST;
6655 return r;
6656 }
6657 filepath path(relpath);
6658 string name = path.last_dentry();
6659 path.pop_dentry();
6660
6661 r = path_walk(path, &dir, perm, true);
6662 if (r < 0)
6663 return r;
6664 if (cct->_conf->client_permissions) {
6665 if (S_ISDIR(in->mode)) {
6666 r = -EPERM;
6667 return r;
6668 }
6669 r = may_hardlink(in.get(), perm);
6670 if (r < 0)
6671 return r;
6672 r = may_create(dir.get(), perm);
6673 if (r < 0)
6674 return r;
6675 }
6676 r = _link(in.get(), dir.get(), name.c_str(), perm);
6677 return r;
6678}
6679
6680int Client::unlink(const char *relpath, const UserPerm& perm)
6681{
11fdf7f2
TL
6682 std::lock_guard lock(client_lock);
6683 tout(cct) << __func__ << std::endl;
7c673cae
FG
6684 tout(cct) << relpath << std::endl;
6685
181888fb
FG
6686 if (unmounting)
6687 return -ENOTCONN;
6688
7c673cae
FG
6689 if (std::string(relpath) == "/")
6690 return -EISDIR;
6691
6692 filepath path(relpath);
6693 string name = path.last_dentry();
6694 path.pop_dentry();
6695 InodeRef dir;
6696 int r = path_walk(path, &dir, perm);
6697 if (r < 0)
6698 return r;
6699 if (cct->_conf->client_permissions) {
6700 r = may_delete(dir.get(), name.c_str(), perm);
6701 if (r < 0)
6702 return r;
6703 }
6704 return _unlink(dir.get(), name.c_str(), perm);
6705}
6706
6707int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6708{
11fdf7f2
TL
6709 std::lock_guard lock(client_lock);
6710 tout(cct) << __func__ << std::endl;
7c673cae
FG
6711 tout(cct) << relfrom << std::endl;
6712 tout(cct) << relto << std::endl;
6713
181888fb
FG
6714 if (unmounting)
6715 return -ENOTCONN;
6716
7c673cae
FG
6717 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6718 return -EBUSY;
6719
6720 filepath from(relfrom);
6721 filepath to(relto);
6722 string fromname = from.last_dentry();
6723 from.pop_dentry();
6724 string toname = to.last_dentry();
6725 to.pop_dentry();
6726
6727 InodeRef fromdir, todir;
6728 int r = path_walk(from, &fromdir, perm);
6729 if (r < 0)
6730 goto out;
6731 r = path_walk(to, &todir, perm);
6732 if (r < 0)
6733 goto out;
6734
6735 if (cct->_conf->client_permissions) {
6736 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6737 if (r < 0)
6738 return r;
6739 r = may_delete(todir.get(), toname.c_str(), perm);
6740 if (r < 0 && r != -ENOENT)
6741 return r;
6742 }
6743 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6744out:
6745 return r;
6746}
6747
6748// dirs
6749
6750int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6751{
11fdf7f2
TL
6752 std::lock_guard lock(client_lock);
6753 tout(cct) << __func__ << std::endl;
7c673cae
FG
6754 tout(cct) << relpath << std::endl;
6755 tout(cct) << mode << std::endl;
11fdf7f2 6756 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6757
181888fb
FG
6758 if (unmounting)
6759 return -ENOTCONN;
6760
7c673cae
FG
6761 if (std::string(relpath) == "/")
6762 return -EEXIST;
6763
6764 filepath path(relpath);
6765 string name = path.last_dentry();
6766 path.pop_dentry();
6767 InodeRef dir;
6768 int r = path_walk(path, &dir, perm);
6769 if (r < 0)
6770 return r;
6771 if (cct->_conf->client_permissions) {
6772 r = may_create(dir.get(), perm);
6773 if (r < 0)
6774 return r;
6775 }
6776 return _mkdir(dir.get(), name.c_str(), mode, perm);
6777}
6778
6779int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6780{
11fdf7f2 6781 std::lock_guard lock(client_lock);
7c673cae 6782 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
11fdf7f2 6783 tout(cct) << __func__ << std::endl;
7c673cae
FG
6784 tout(cct) << relpath << std::endl;
6785 tout(cct) << mode << std::endl;
6786
181888fb
FG
6787 if (unmounting)
6788 return -ENOTCONN;
6789
7c673cae
FG
6790 //get through existing parts of path
6791 filepath path(relpath);
6792 unsigned int i;
6793 int r = 0, caps = 0;
6794 InodeRef cur, next;
6795 cur = cwd;
6796 for (i=0; i<path.depth(); ++i) {
6797 if (cct->_conf->client_permissions) {
6798 r = may_lookup(cur.get(), perms);
6799 if (r < 0)
6800 break;
6801 caps = CEPH_CAP_AUTH_SHARED;
6802 }
6803 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6804 if (r < 0)
6805 break;
6806 cur.swap(next);
6807 }
7c673cae 6808 if (r!=-ENOENT) return r;
11fdf7f2 6809 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7c673cae
FG
6810 //make new directory at each level
6811 for (; i<path.depth(); ++i) {
6812 if (cct->_conf->client_permissions) {
6813 r = may_create(cur.get(), perms);
6814 if (r < 0)
6815 return r;
6816 }
6817 //make new dir
6818 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
c07f9fc5 6819
7c673cae 6820 //check proper creation/existence
c07f9fc5
FG
6821 if(-EEXIST == r && i < path.depth() - 1) {
6822 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6823 }
6824 if (r < 0)
6825 return r;
7c673cae
FG
6826 //move to new dir and continue
6827 cur.swap(next);
11fdf7f2 6828 ldout(cct, 20) << __func__ << ": successfully created directory "
7c673cae
FG
6829 << filepath(cur->ino).get_path() << dendl;
6830 }
6831 return 0;
6832}
6833
6834int Client::rmdir(const char *relpath, const UserPerm& perms)
6835{
11fdf7f2
TL
6836 std::lock_guard lock(client_lock);
6837 tout(cct) << __func__ << std::endl;
7c673cae
FG
6838 tout(cct) << relpath << std::endl;
6839
181888fb
FG
6840 if (unmounting)
6841 return -ENOTCONN;
6842
7c673cae
FG
6843 if (std::string(relpath) == "/")
6844 return -EBUSY;
6845
6846 filepath path(relpath);
6847 string name = path.last_dentry();
6848 path.pop_dentry();
6849 InodeRef dir;
6850 int r = path_walk(path, &dir, perms);
6851 if (r < 0)
6852 return r;
6853 if (cct->_conf->client_permissions) {
6854 int r = may_delete(dir.get(), name.c_str(), perms);
6855 if (r < 0)
6856 return r;
6857 }
6858 return _rmdir(dir.get(), name.c_str(), perms);
6859}
6860
6861int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6862{
11fdf7f2
TL
6863 std::lock_guard lock(client_lock);
6864 tout(cct) << __func__ << std::endl;
7c673cae
FG
6865 tout(cct) << relpath << std::endl;
6866 tout(cct) << mode << std::endl;
6867 tout(cct) << rdev << std::endl;
6868
181888fb
FG
6869 if (unmounting)
6870 return -ENOTCONN;
6871
7c673cae
FG
6872 if (std::string(relpath) == "/")
6873 return -EEXIST;
6874
6875 filepath path(relpath);
6876 string name = path.last_dentry();
6877 path.pop_dentry();
6878 InodeRef dir;
6879 int r = path_walk(path, &dir, perms);
6880 if (r < 0)
6881 return r;
6882 if (cct->_conf->client_permissions) {
6883 int r = may_create(dir.get(), perms);
6884 if (r < 0)
6885 return r;
6886 }
6887 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6888}
6889
6890// symlinks
6891
6892int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6893{
11fdf7f2
TL
6894 std::lock_guard lock(client_lock);
6895 tout(cct) << __func__ << std::endl;
7c673cae
FG
6896 tout(cct) << target << std::endl;
6897 tout(cct) << relpath << std::endl;
6898
181888fb
FG
6899 if (unmounting)
6900 return -ENOTCONN;
6901
7c673cae
FG
6902 if (std::string(relpath) == "/")
6903 return -EEXIST;
6904
6905 filepath path(relpath);
6906 string name = path.last_dentry();
6907 path.pop_dentry();
6908 InodeRef dir;
6909 int r = path_walk(path, &dir, perms);
6910 if (r < 0)
6911 return r;
6912 if (cct->_conf->client_permissions) {
6913 int r = may_create(dir.get(), perms);
6914 if (r < 0)
6915 return r;
6916 }
6917 return _symlink(dir.get(), name.c_str(), target, perms);
6918}
6919
6920int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6921{
11fdf7f2
TL
6922 std::lock_guard lock(client_lock);
6923 tout(cct) << __func__ << std::endl;
7c673cae
FG
6924 tout(cct) << relpath << std::endl;
6925
181888fb
FG
6926 if (unmounting)
6927 return -ENOTCONN;
6928
7c673cae
FG
6929 filepath path(relpath);
6930 InodeRef in;
6931 int r = path_walk(path, &in, perms, false);
6932 if (r < 0)
6933 return r;
6934
6935 return _readlink(in.get(), buf, size);
6936}
6937
6938int Client::_readlink(Inode *in, char *buf, size_t size)
6939{
6940 if (!in->is_symlink())
6941 return -EINVAL;
6942
6943 // copy into buf (at most size bytes)
6944 int r = in->symlink.length();
6945 if (r > (int)size)
6946 r = size;
6947 memcpy(buf, in->symlink.c_str(), r);
6948 return r;
6949}
6950
6951
6952// inode stuff
6953
6954int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6955{
94b18763 6956 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6957
11fdf7f2 6958 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6959 if (yes && !force)
6960 return 0;
6961
6962 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6963 filepath path;
6964 in->make_nosnap_relative_path(path);
6965 req->set_filepath(path);
6966 req->set_inode(in);
6967 req->head.args.getattr.mask = mask;
6968
6969 int res = make_request(req, perms);
11fdf7f2 6970 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6971 return res;
6972}
6973
6974int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6975 const UserPerm& perms, InodeRef *inp)
6976{
6977 int issued = in->caps_issued();
6978
11fdf7f2 6979 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6980 ccap_string(issued) << dendl;
6981
6982 if (in->snapid != CEPH_NOSNAP) {
6983 return -EROFS;
6984 }
6985 if ((mask & CEPH_SETATTR_SIZE) &&
6986 (unsigned long)stx->stx_size > in->size &&
6987 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6988 perms)) {
6989 return -EDQUOT;
6990 }
6991
6992 // make the change locally?
6993 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6994 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6995 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6996 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6997 << in->cap_dirtier_gid << ", forcing sync setattr"
6998 << dendl;
6999 /*
7000 * This works because we implicitly flush the caps as part of the
7001 * request, so the cap update check will happen with the writeback
7002 * cap context, and then the setattr check will happen with the
7003 * caller's context.
7004 *
7005 * In reality this pattern is likely pretty rare (different users
7006 * setattr'ing the same file). If that turns out not to be the
7007 * case later, we can build a more complex pipelined cap writeback
7008 * infrastructure...
7009 */
7010 if (!mask)
7011 mask |= CEPH_SETATTR_CTIME;
7012 goto force_request;
7013 }
7014
7015 if (!mask) {
7016 // caller just needs us to bump the ctime
7017 in->ctime = ceph_clock_now();
7018 in->cap_dirtier_uid = perms.uid();
7019 in->cap_dirtier_gid = perms.gid();
7020 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 7021 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 7022 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 7023 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 7024 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 7025 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
7026 else
7027 mask |= CEPH_SETATTR_CTIME;
7028 }
7029
7030 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7031 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7032
7033 mask &= ~CEPH_SETATTR_KILL_SGUID;
7034
7035 if (mask & CEPH_SETATTR_UID) {
7036 in->ctime = ceph_clock_now();
7037 in->cap_dirtier_uid = perms.uid();
7038 in->cap_dirtier_gid = perms.gid();
7039 in->uid = stx->stx_uid;
28e407b8 7040 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7041 mask &= ~CEPH_SETATTR_UID;
7042 kill_sguid = true;
7043 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7044 }
7045 if (mask & CEPH_SETATTR_GID) {
7046 in->ctime = ceph_clock_now();
7047 in->cap_dirtier_uid = perms.uid();
7048 in->cap_dirtier_gid = perms.gid();
7049 in->gid = stx->stx_gid;
28e407b8 7050 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7051 mask &= ~CEPH_SETATTR_GID;
7052 kill_sguid = true;
7053 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7054 }
7055
7056 if (mask & CEPH_SETATTR_MODE) {
7057 in->ctime = ceph_clock_now();
7058 in->cap_dirtier_uid = perms.uid();
7059 in->cap_dirtier_gid = perms.gid();
7060 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 7061 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7062 mask &= ~CEPH_SETATTR_MODE;
7063 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 7064 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 7065 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 7066 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 7067 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7068 }
7069
7070 if (mask & CEPH_SETATTR_BTIME) {
7071 in->ctime = ceph_clock_now();
7072 in->cap_dirtier_uid = perms.uid();
7073 in->cap_dirtier_gid = perms.gid();
7074 in->btime = utime_t(stx->stx_btime);
28e407b8 7075 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
7076 mask &= ~CEPH_SETATTR_BTIME;
7077 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7078 }
7079 } else if (mask & CEPH_SETATTR_SIZE) {
7080 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7081 mask |= CEPH_SETATTR_KILL_SGUID;
7082 }
7083
7084 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7085 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7086 if (mask & CEPH_SETATTR_MTIME)
7087 in->mtime = utime_t(stx->stx_mtime);
7088 if (mask & CEPH_SETATTR_ATIME)
7089 in->atime = utime_t(stx->stx_atime);
7090 in->ctime = ceph_clock_now();
7091 in->cap_dirtier_uid = perms.uid();
7092 in->cap_dirtier_gid = perms.gid();
7093 in->time_warp_seq++;
28e407b8 7094 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
7095 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7096 }
7097 }
7098 if (!mask) {
7099 in->change_attr++;
7100 return 0;
7101 }
7102
7103force_request:
7104 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7105
7106 filepath path;
7107
7108 in->make_nosnap_relative_path(path);
7109 req->set_filepath(path);
7110 req->set_inode(in);
7111
7112 if (mask & CEPH_SETATTR_KILL_SGUID) {
7113 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7114 }
7115 if (mask & CEPH_SETATTR_MODE) {
7116 req->head.args.setattr.mode = stx->stx_mode;
7117 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7118 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7119 }
7120 if (mask & CEPH_SETATTR_UID) {
7121 req->head.args.setattr.uid = stx->stx_uid;
7122 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7123 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7124 }
7125 if (mask & CEPH_SETATTR_GID) {
7126 req->head.args.setattr.gid = stx->stx_gid;
7127 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7128 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7129 }
7130 if (mask & CEPH_SETATTR_BTIME) {
7131 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7132 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7133 }
7134 if (mask & CEPH_SETATTR_MTIME) {
7135 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 7136 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7137 CEPH_CAP_FILE_WR;
7138 }
7139 if (mask & CEPH_SETATTR_ATIME) {
7140 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7141 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7142 CEPH_CAP_FILE_WR;
7143 }
7144 if (mask & CEPH_SETATTR_SIZE) {
7145 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7146 req->head.args.setattr.size = stx->stx_size;
7147 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7148 } else { //too big!
7149 put_request(req);
7150 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7151 return -EFBIG;
7152 }
94b18763 7153 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7154 CEPH_CAP_FILE_WR;
7155 }
7156 req->head.args.setattr.mask = mask;
7157
7158 req->regetattr_mask = mask;
7159
7160 int res = make_request(req, perms, inp);
7161 ldout(cct, 10) << "_setattr result=" << res << dendl;
7162 return res;
7163}
7164
7165/* Note that we only care about attrs that setattr cares about */
7166void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7167{
7168 stx->stx_size = st->st_size;
7169 stx->stx_mode = st->st_mode;
7170 stx->stx_uid = st->st_uid;
7171 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7172#ifdef __APPLE__
7173 stx->stx_mtime = st->st_mtimespec;
7174 stx->stx_atime = st->st_atimespec;
7175#else
7c673cae
FG
7176 stx->stx_mtime = st->st_mtim;
7177 stx->stx_atime = st->st_atim;
11fdf7f2 7178#endif
7c673cae
FG
7179}
7180
7181int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7182 const UserPerm& perms, InodeRef *inp)
7183{
7184 int ret = _do_setattr(in, stx, mask, perms, inp);
7185 if (ret < 0)
7186 return ret;
7187 if (mask & CEPH_SETATTR_MODE)
7188 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7189 return ret;
7190}
7191
7192int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7193 const UserPerm& perms)
7194{
7195 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7196 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7197 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7198 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7199 if (cct->_conf->client_permissions) {
7200 int r = may_setattr(in.get(), stx, mask, perms);
7201 if (r < 0)
7202 return r;
7203 }
7204 return __setattrx(in.get(), stx, mask, perms);
7205}
7206
7207int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7208 const UserPerm& perms)
7209{
7210 struct ceph_statx stx;
7211
7212 stat_to_statx(attr, &stx);
7213 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7214
7215 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7216 mask &= ~CEPH_SETATTR_UID;
7217 }
7218 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7219 mask &= ~CEPH_SETATTR_GID;
7220 }
7221
7c673cae
FG
7222 return _setattrx(in, &stx, mask, perms);
7223}
7224
7225int Client::setattr(const char *relpath, struct stat *attr, int mask,
7226 const UserPerm& perms)
7227{
11fdf7f2
TL
7228 std::lock_guard lock(client_lock);
7229 tout(cct) << __func__ << std::endl;
7c673cae
FG
7230 tout(cct) << relpath << std::endl;
7231 tout(cct) << mask << std::endl;
7232
181888fb
FG
7233 if (unmounting)
7234 return -ENOTCONN;
7235
7c673cae
FG
7236 filepath path(relpath);
7237 InodeRef in;
7238 int r = path_walk(path, &in, perms);
7239 if (r < 0)
7240 return r;
7241 return _setattr(in, attr, mask, perms);
7242}
7243
7244int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7245 const UserPerm& perms, int flags)
7246{
11fdf7f2
TL
7247 std::lock_guard lock(client_lock);
7248 tout(cct) << __func__ << std::endl;
7c673cae
FG
7249 tout(cct) << relpath << std::endl;
7250 tout(cct) << mask << std::endl;
7251
181888fb
FG
7252 if (unmounting)
7253 return -ENOTCONN;
7254
7c673cae
FG
7255 filepath path(relpath);
7256 InodeRef in;
7257 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7258 if (r < 0)
7259 return r;
7260 return _setattrx(in, stx, mask, perms);
7261}
7262
7263int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7264{
11fdf7f2
TL
7265 std::lock_guard lock(client_lock);
7266 tout(cct) << __func__ << std::endl;
7c673cae
FG
7267 tout(cct) << fd << std::endl;
7268 tout(cct) << mask << std::endl;
7269
181888fb
FG
7270 if (unmounting)
7271 return -ENOTCONN;
7272
7c673cae
FG
7273 Fh *f = get_filehandle(fd);
7274 if (!f)
7275 return -EBADF;
7276#if defined(__linux__) && defined(O_PATH)
7277 if (f->flags & O_PATH)
7278 return -EBADF;
7279#endif
7280 return _setattr(f->inode, attr, mask, perms);
7281}
7282
7283int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7284{
11fdf7f2
TL
7285 std::lock_guard lock(client_lock);
7286 tout(cct) << __func__ << std::endl;
7c673cae
FG
7287 tout(cct) << fd << std::endl;
7288 tout(cct) << mask << std::endl;
7289
181888fb
FG
7290 if (unmounting)
7291 return -ENOTCONN;
7292
7c673cae
FG
7293 Fh *f = get_filehandle(fd);
7294 if (!f)
7295 return -EBADF;
7296#if defined(__linux__) && defined(O_PATH)
7297 if (f->flags & O_PATH)
7298 return -EBADF;
7299#endif
7300 return _setattrx(f->inode, stx, mask, perms);
7301}
7302
7303int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7304 frag_info_t *dirstat, int mask)
7305{
11fdf7f2
TL
7306 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7307 std::lock_guard lock(client_lock);
7c673cae
FG
7308 tout(cct) << "stat" << std::endl;
7309 tout(cct) << relpath << std::endl;
181888fb
FG
7310
7311 if (unmounting)
7312 return -ENOTCONN;
7313
7c673cae
FG
7314 filepath path(relpath);
7315 InodeRef in;
7316 int r = path_walk(path, &in, perms, true, mask);
7317 if (r < 0)
7318 return r;
7319 r = _getattr(in, mask, perms);
7320 if (r < 0) {
11fdf7f2 7321 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7322 return r;
7323 }
7324 fill_stat(in, stbuf, dirstat);
11fdf7f2 7325 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7326 return r;
7327}
7328
7329unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7330{
7331 unsigned mask = 0;
7332
7333 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7334 if (flags & AT_NO_ATTR_SYNC)
7335 goto out;
7336
7337 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7338 mask |= CEPH_CAP_PIN;
7339 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7340 mask |= CEPH_CAP_AUTH_SHARED;
7341 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7342 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 7343 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
7344 mask |= CEPH_CAP_FILE_SHARED;
7345 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7346 mask |= CEPH_CAP_XATTR_SHARED;
7347out:
7348 return mask;
7349}
7350
7351int Client::statx(const char *relpath, struct ceph_statx *stx,
7352 const UserPerm& perms,
7353 unsigned int want, unsigned int flags)
7354{
11fdf7f2
TL
7355 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7356 std::lock_guard lock(client_lock);
7c673cae
FG
7357 tout(cct) << "statx" << std::endl;
7358 tout(cct) << relpath << std::endl;
181888fb
FG
7359
7360 if (unmounting)
7361 return -ENOTCONN;
7362
7c673cae
FG
7363 filepath path(relpath);
7364 InodeRef in;
7365
7366 unsigned mask = statx_to_mask(flags, want);
7367
7368 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7369 if (r < 0)
7370 return r;
7371
7372 r = _getattr(in, mask, perms);
7373 if (r < 0) {
11fdf7f2 7374 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7375 return r;
7376 }
7377
7378 fill_statx(in, mask, stx);
11fdf7f2 7379 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7380 return r;
7381}
7382
7383int Client::lstat(const char *relpath, struct stat *stbuf,
7384 const UserPerm& perms, frag_info_t *dirstat, int mask)
7385{
11fdf7f2
TL
7386 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7387 std::lock_guard lock(client_lock);
7388 tout(cct) << __func__ << std::endl;
7c673cae 7389 tout(cct) << relpath << std::endl;
181888fb
FG
7390
7391 if (unmounting)
7392 return -ENOTCONN;
7393
7c673cae
FG
7394 filepath path(relpath);
7395 InodeRef in;
7396 // don't follow symlinks
7397 int r = path_walk(path, &in, perms, false, mask);
7398 if (r < 0)
7399 return r;
7400 r = _getattr(in, mask, perms);
7401 if (r < 0) {
11fdf7f2 7402 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7403 return r;
7404 }
7405 fill_stat(in, stbuf, dirstat);
11fdf7f2 7406 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7407 return r;
7408}
7409
7410int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7411{
11fdf7f2 7412 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7413 << " mode 0" << oct << in->mode << dec
7414 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7415 memset(st, 0, sizeof(struct stat));
7416 if (use_faked_inos())
7417 st->st_ino = in->faked_ino;
7418 else
7419 st->st_ino = in->ino;
7420 st->st_dev = in->snapid;
7421 st->st_mode = in->mode;
7422 st->st_rdev = in->rdev;
28e407b8
AA
7423 if (in->is_dir()) {
7424 switch (in->nlink) {
7425 case 0:
7426 st->st_nlink = 0; /* dir is unlinked */
7427 break;
7428 case 1:
7429 st->st_nlink = 1 /* parent dentry */
7430 + 1 /* <dir>/. */
7431 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7432 break;
7433 default:
7434 ceph_abort();
7435 }
7436 } else {
7437 st->st_nlink = in->nlink;
7438 }
7c673cae
FG
7439 st->st_uid = in->uid;
7440 st->st_gid = in->gid;
7441 if (in->ctime > in->mtime) {
7442 stat_set_ctime_sec(st, in->ctime.sec());
7443 stat_set_ctime_nsec(st, in->ctime.nsec());
7444 } else {
7445 stat_set_ctime_sec(st, in->mtime.sec());
7446 stat_set_ctime_nsec(st, in->mtime.nsec());
7447 }
7448 stat_set_atime_sec(st, in->atime.sec());
7449 stat_set_atime_nsec(st, in->atime.nsec());
7450 stat_set_mtime_sec(st, in->mtime.sec());
7451 stat_set_mtime_nsec(st, in->mtime.nsec());
7452 if (in->is_dir()) {
7453 if (cct->_conf->client_dirsize_rbytes)
7454 st->st_size = in->rstat.rbytes;
7455 else
7456 st->st_size = in->dirstat.size();
7457 st->st_blocks = 1;
7458 } else {
7459 st->st_size = in->size;
7460 st->st_blocks = (in->size + 511) >> 9;
7461 }
11fdf7f2 7462 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7c673cae
FG
7463
7464 if (dirstat)
7465 *dirstat = in->dirstat;
7466 if (rstat)
7467 *rstat = in->rstat;
7468
7469 return in->caps_issued();
7470}
7471
7472void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7473{
11fdf7f2 7474 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7475 << " mode 0" << oct << in->mode << dec
7476 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7477 memset(stx, 0, sizeof(struct ceph_statx));
7478
7479 /*
7480 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7481 * so that all bits are set.
7482 */
7483 if (!mask)
7484 mask = ~0;
7485
7486 /* These are always considered to be available */
7487 stx->stx_dev = in->snapid;
11fdf7f2 7488 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7c673cae
FG
7489
7490 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7491 stx->stx_mode = S_IFMT & in->mode;
7492 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7493 stx->stx_rdev = in->rdev;
7494 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7495
7496 if (mask & CEPH_CAP_AUTH_SHARED) {
7497 stx->stx_uid = in->uid;
7498 stx->stx_gid = in->gid;
7499 stx->stx_mode = in->mode;
7500 in->btime.to_timespec(&stx->stx_btime);
7501 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7502 }
7503
7504 if (mask & CEPH_CAP_LINK_SHARED) {
28e407b8
AA
7505 if (in->is_dir()) {
7506 switch (in->nlink) {
7507 case 0:
7508 stx->stx_nlink = 0; /* dir is unlinked */
7509 break;
7510 case 1:
7511 stx->stx_nlink = 1 /* parent dentry */
7512 + 1 /* <dir>/. */
7513 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7514 break;
7515 default:
7516 ceph_abort();
7517 }
7518 } else {
7519 stx->stx_nlink = in->nlink;
7520 }
7c673cae
FG
7521 stx->stx_mask |= CEPH_STATX_NLINK;
7522 }
7523
7524 if (mask & CEPH_CAP_FILE_SHARED) {
7525
7526 in->atime.to_timespec(&stx->stx_atime);
7527 in->mtime.to_timespec(&stx->stx_mtime);
7528
7529 if (in->is_dir()) {
7530 if (cct->_conf->client_dirsize_rbytes)
7531 stx->stx_size = in->rstat.rbytes;
7532 else
7533 stx->stx_size = in->dirstat.size();
7534 stx->stx_blocks = 1;
7535 } else {
7536 stx->stx_size = in->size;
7537 stx->stx_blocks = (in->size + 511) >> 9;
7538 }
7539 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7540 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7541 }
7542
7543 /* Change time and change_attr both require all shared caps to view */
7544 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7545 stx->stx_version = in->change_attr;
7546 if (in->ctime > in->mtime)
7547 in->ctime.to_timespec(&stx->stx_ctime);
7548 else
7549 in->mtime.to_timespec(&stx->stx_ctime);
7550 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7551 }
7552
7553}
7554
7555void Client::touch_dn(Dentry *dn)
7556{
7557 lru.lru_touch(dn);
7558}
7559
7560int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7561{
11fdf7f2
TL
7562 std::lock_guard lock(client_lock);
7563 tout(cct) << __func__ << std::endl;
7c673cae
FG
7564 tout(cct) << relpath << std::endl;
7565 tout(cct) << mode << std::endl;
181888fb
FG
7566
7567 if (unmounting)
7568 return -ENOTCONN;
7569
7c673cae
FG
7570 filepath path(relpath);
7571 InodeRef in;
7572 int r = path_walk(path, &in, perms);
7573 if (r < 0)
7574 return r;
7575 struct stat attr;
7576 attr.st_mode = mode;
7577 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7578}
7579
7580int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7581{
11fdf7f2
TL
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7c673cae
FG
7584 tout(cct) << fd << std::endl;
7585 tout(cct) << mode << std::endl;
181888fb
FG
7586
7587 if (unmounting)
7588 return -ENOTCONN;
7589
7c673cae
FG
7590 Fh *f = get_filehandle(fd);
7591 if (!f)
7592 return -EBADF;
7593#if defined(__linux__) && defined(O_PATH)
7594 if (f->flags & O_PATH)
7595 return -EBADF;
7596#endif
7597 struct stat attr;
7598 attr.st_mode = mode;
7599 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7600}
7601
7602int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7603{
11fdf7f2
TL
7604 std::lock_guard lock(client_lock);
7605 tout(cct) << __func__ << std::endl;
7c673cae
FG
7606 tout(cct) << relpath << std::endl;
7607 tout(cct) << mode << std::endl;
181888fb
FG
7608
7609 if (unmounting)
7610 return -ENOTCONN;
7611
7c673cae
FG
7612 filepath path(relpath);
7613 InodeRef in;
7614 // don't follow symlinks
7615 int r = path_walk(path, &in, perms, false);
7616 if (r < 0)
7617 return r;
7618 struct stat attr;
7619 attr.st_mode = mode;
7620 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7621}
7622
7623int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7624 const UserPerm& perms)
7625{
11fdf7f2
TL
7626 std::lock_guard lock(client_lock);
7627 tout(cct) << __func__ << std::endl;
7c673cae
FG
7628 tout(cct) << relpath << std::endl;
7629 tout(cct) << new_uid << std::endl;
7630 tout(cct) << new_gid << std::endl;
181888fb
FG
7631
7632 if (unmounting)
7633 return -ENOTCONN;
7634
7c673cae
FG
7635 filepath path(relpath);
7636 InodeRef in;
7637 int r = path_walk(path, &in, perms);
7638 if (r < 0)
7639 return r;
7640 struct stat attr;
7641 attr.st_uid = new_uid;
7642 attr.st_gid = new_gid;
181888fb 7643 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7644}
7645
7646int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7647{
11fdf7f2
TL
7648 std::lock_guard lock(client_lock);
7649 tout(cct) << __func__ << std::endl;
7c673cae
FG
7650 tout(cct) << fd << std::endl;
7651 tout(cct) << new_uid << std::endl;
7652 tout(cct) << new_gid << std::endl;
181888fb
FG
7653
7654 if (unmounting)
7655 return -ENOTCONN;
7656
7c673cae
FG
7657 Fh *f = get_filehandle(fd);
7658 if (!f)
7659 return -EBADF;
7660#if defined(__linux__) && defined(O_PATH)
7661 if (f->flags & O_PATH)
7662 return -EBADF;
7663#endif
7664 struct stat attr;
7665 attr.st_uid = new_uid;
7666 attr.st_gid = new_gid;
7667 int mask = 0;
7668 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7669 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7670 return _setattr(f->inode, &attr, mask, perms);
7671}
7672
7673int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7674 const UserPerm& perms)
7675{
11fdf7f2
TL
7676 std::lock_guard lock(client_lock);
7677 tout(cct) << __func__ << std::endl;
7c673cae
FG
7678 tout(cct) << relpath << std::endl;
7679 tout(cct) << new_uid << std::endl;
7680 tout(cct) << new_gid << std::endl;
181888fb
FG
7681
7682 if (unmounting)
7683 return -ENOTCONN;
7684
7c673cae
FG
7685 filepath path(relpath);
7686 InodeRef in;
7687 // don't follow symlinks
7688 int r = path_walk(path, &in, perms, false);
7689 if (r < 0)
7690 return r;
7691 struct stat attr;
7692 attr.st_uid = new_uid;
7693 attr.st_gid = new_gid;
7694 int mask = 0;
7695 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7696 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7697 return _setattr(in, &attr, mask, perms);
7698}
7699
11fdf7f2
TL
7700static void attr_set_atime_and_mtime(struct stat *attr,
7701 const utime_t &atime,
7702 const utime_t &mtime)
7703{
7704 stat_set_atime_sec(attr, atime.tv.tv_sec);
7705 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7706 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7707 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7708}
7709
7710// for [l]utime() invoke the timeval variant as the timespec
7711// variant are not yet implemented. for futime[s](), invoke
7712// the timespec variant.
7c673cae
FG
7713int Client::utime(const char *relpath, struct utimbuf *buf,
7714 const UserPerm& perms)
7715{
11fdf7f2
TL
7716 struct timeval tv[2];
7717 tv[0].tv_sec = buf->actime;
7718 tv[0].tv_usec = 0;
7719 tv[1].tv_sec = buf->modtime;
7720 tv[1].tv_usec = 0;
7721
7722 return utimes(relpath, tv, perms);
7723}
7724
7725int Client::lutime(const char *relpath, struct utimbuf *buf,
7726 const UserPerm& perms)
7727{
7728 struct timeval tv[2];
7729 tv[0].tv_sec = buf->actime;
7730 tv[0].tv_usec = 0;
7731 tv[1].tv_sec = buf->modtime;
7732 tv[1].tv_usec = 0;
7733
7734 return lutimes(relpath, tv, perms);
7735}
7736
7737int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7738{
7739 struct timespec ts[2];
7740 ts[0].tv_sec = buf->actime;
7741 ts[0].tv_nsec = 0;
7742 ts[1].tv_sec = buf->modtime;
7743 ts[1].tv_nsec = 0;
7744
7745 return futimens(fd, ts, perms);
7746}
7747
7748int Client::utimes(const char *relpath, struct timeval times[2],
7749 const UserPerm& perms)
7750{
7751 std::lock_guard lock(client_lock);
7752 tout(cct) << __func__ << std::endl;
7c673cae 7753 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7754 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7755 << std::endl;
7756 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7757 << std::endl;
181888fb
FG
7758
7759 if (unmounting)
7760 return -ENOTCONN;
7761
7c673cae
FG
7762 filepath path(relpath);
7763 InodeRef in;
7764 int r = path_walk(path, &in, perms);
7765 if (r < 0)
7766 return r;
7767 struct stat attr;
11fdf7f2
TL
7768 utime_t atime(times[0]);
7769 utime_t mtime(times[1]);
7770
7771 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7772 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7773}
7774
11fdf7f2
TL
7775int Client::lutimes(const char *relpath, struct timeval times[2],
7776 const UserPerm& perms)
7c673cae 7777{
11fdf7f2
TL
7778 std::lock_guard lock(client_lock);
7779 tout(cct) << __func__ << std::endl;
7c673cae 7780 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7781 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7782 << std::endl;
7783 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7784 << std::endl;
181888fb
FG
7785
7786 if (unmounting)
7787 return -ENOTCONN;
7788
7c673cae
FG
7789 filepath path(relpath);
7790 InodeRef in;
7c673cae
FG
7791 int r = path_walk(path, &in, perms, false);
7792 if (r < 0)
7793 return r;
7794 struct stat attr;
11fdf7f2
TL
7795 utime_t atime(times[0]);
7796 utime_t mtime(times[1]);
7797
7798 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7799 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7800}
7801
11fdf7f2
TL
7802int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7803{
7804 struct timespec ts[2];
7805 ts[0].tv_sec = times[0].tv_sec;
7806 ts[0].tv_nsec = times[0].tv_usec * 1000;
7807 ts[1].tv_sec = times[1].tv_sec;
7808 ts[1].tv_nsec = times[1].tv_usec * 1000;
7809
7810 return futimens(fd, ts, perms);
7811}
7812
7813int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7814{
7815 std::lock_guard lock(client_lock);
7816 tout(cct) << __func__ << std::endl;
7817 tout(cct) << fd << std::endl;
7818 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7819 << std::endl;
7820 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7821 << std::endl;
7822
7823 if (unmounting)
7824 return -ENOTCONN;
7825
7826 Fh *f = get_filehandle(fd);
7827 if (!f)
7828 return -EBADF;
7829#if defined(__linux__) && defined(O_PATH)
7830 if (f->flags & O_PATH)
7831 return -EBADF;
7832#endif
7833 struct stat attr;
7834 utime_t atime(times[0]);
7835 utime_t mtime(times[1]);
7836
7837 attr_set_atime_and_mtime(&attr, atime, mtime);
7838 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7839}
7840
7c673cae
FG
7841int Client::flock(int fd, int operation, uint64_t owner)
7842{
11fdf7f2
TL
7843 std::lock_guard lock(client_lock);
7844 tout(cct) << __func__ << std::endl;
7c673cae
FG
7845 tout(cct) << fd << std::endl;
7846 tout(cct) << operation << std::endl;
7847 tout(cct) << owner << std::endl;
181888fb
FG
7848
7849 if (unmounting)
7850 return -ENOTCONN;
7851
7c673cae
FG
7852 Fh *f = get_filehandle(fd);
7853 if (!f)
7854 return -EBADF;
7855
7856 return _flock(f, operation, owner);
7857}
7858
7859int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7860{
11fdf7f2
TL
7861 std::lock_guard lock(client_lock);
7862 tout(cct) << __func__ << std::endl;
7c673cae 7863 tout(cct) << relpath << std::endl;
181888fb
FG
7864
7865 if (unmounting)
7866 return -ENOTCONN;
7867
7c673cae
FG
7868 filepath path(relpath);
7869 InodeRef in;
7870 int r = path_walk(path, &in, perms, true);
7871 if (r < 0)
7872 return r;
7873 if (cct->_conf->client_permissions) {
7874 int r = may_open(in.get(), O_RDONLY, perms);
7875 if (r < 0)
7876 return r;
7877 }
7878 r = _opendir(in.get(), dirpp, perms);
7879 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7880 if (r != -ENOTDIR)
7881 tout(cct) << (unsigned long)*dirpp << std::endl;
7882 return r;
7883}
7884
7885int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7886{
7887 if (!in->is_dir())
7888 return -ENOTDIR;
7889 *dirpp = new dir_result_t(in, perms);
7890 opened_dirs.insert(*dirpp);
11fdf7f2 7891 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7892 return 0;
7893}
7894
7895
// Close a directory stream previously returned by opendir().
// Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7906
// Tear down a dir_result_t: release the directory inode reference, drop
// any buffered readdir entries, unregister the handle, and free it.
// Called with client_lock held (see closedir()).
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();  // drop our ref; may trigger inode cleanup
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7918
// rewinddir(3) equivalent: discard buffered entries and reset the
// stream position back to the beginning of the directory.
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7931
// telldir(3) equivalent: report the current stream offset.
// NOTE(review): unlike seekdir()/rewinddir(), this reads d->offset
// without taking client_lock — confirm callers serialize access to the
// handle, or consider taking the lock here for consistency.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7938
// seekdir(3) equivalent: move the stream to `offset`.  Depending on the
// seek direction and target, the buffered fragment may be dropped and
// the stream state reset so the next readdir refetches from the MDS.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;  // no-op seek

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash order: only a backward seek invalidates the buffer
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: drop the buffer when seeking to the start, into a
    // different fragment, or backwards within the current fragment
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7972
7973
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Populate a struct dirent from Ceph metadata.  `name` is truncated to
// 255 bytes and always NUL-terminated; `type` is an S_IF* mode value
// converted to a DT_* value via IFTODT.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';   // strncpy does not terminate at max length
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;     // d_off absent on macOS/FreeBSD
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7996
// Advance the stream past the fragment currently in dirp->buffer_frag.
// At the rightmost fragment this marks the stream at end; otherwise the
// offset is moved to the start of the next fragment.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);  // our dirfragtree may have changed
  }
}
8022
// Re-map the stream's current fragment through the (possibly updated)
// dirfragtree.  If the fragment has been split/merged, reposition the
// stream at the start of the fragment that now covers it.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;  // hash-order streams do not track fragments this way

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;   // 0 and 1 are reserved for "." and ".."
  }
}
8039
// Discard the buffered dentries for this stream (forces a refetch from
// the MDS or cache on the next readdir).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8045
// Fetch one directory fragment from the MDS into dirp->buffer.
// Uses CEPH_MDS_OP_LSSNAP for the .snap pseudo-directory, READDIR
// otherwise.  On -EAGAIN (fragment moved) the fragment is rechosen and
// the fetch retried; any other error marks the stream at end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // resume listing after the last entry we already returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // fragment boundary changed under us; remap and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8100
// Comparator for std::lower_bound over the readdir cache: orders cached
// dentries by readdir offset (via the wrap-aware fpos_cmp).
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8106
// Serve readdir entries out of the locally cached dentry list, invoking
// `cb` for each entry with client_lock temporarily dropped.  Returns
// -EAGAIN when the cache can no longer be trusted (caller falls back to
// fetching from the MDS), the callback's positive/negative value to stop
// early, or 0 at end of directory.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first dentry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;  // cache invalidated while we were iterating
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // remember position by index: _getattr() may reallocate the cache
    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);   // caller-visible reference on the returned inode
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock across the user callback; dn/pd must be
    // revalidated (via the is_complete_and_ordered check) afterwards
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8200
// Core readdir driver: synthesizes "." and "..", then serves entries
// from the local cache when complete/ordered, otherwise fetches
// fragments from the MDS.  Invokes `cb` per entry with client_lock
// dropped.  Returns 0 at end of directory, the callback's value when it
// stops iteration (>0 or <0), or a negative errno.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 is the synthetic "." entry
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 is the synthetic ".." entry (parent, or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable; fall through to MDS fetch
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // reached the end: if nothing changed while we listed, mark the
    // directory complete so future readdirs can be served from cache
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // unreachable: the loop always returns
  return 0;
}
8397
8398
// POSIX-style readdir_r: fill *de with the next entry.
// Returns 1 on success, 0 at end of directory, <0 on error.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8403
/*
 * readdirplus_r
 *
 * returns
 *  1 if we got a dirent
 *  0 for end of directory
 * <0 on error
 */

// Callback state for fetching exactly one directory entry.
struct single_readdir {
  struct dirent *de;       // destination dirent (always filled)
  struct ceph_statx *stx;  // optional destination statx (may be NULL)
  Inode *inode;            // inode passed to the callback, if any
  bool full;               // set once an entry has been captured
};
8419
8420static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8421 struct ceph_statx *stx, off_t off,
8422 Inode *in)
8423{
8424 single_readdir *c = static_cast<single_readdir *>(p);
8425
8426 if (c->full)
8427 return -1; // already filled this dirent
8428
8429 *c->de = *de;
8430 if (c->stx)
8431 *c->stx = *stx;
8432 c->inode = in;
8433 c->full = true;
8434 return 1;
8435}
8436
8437struct dirent *Client::readdir(dir_result_t *d)
8438{
8439 int ret;
f91f0fd5 8440 auto& de = d->de;
7c673cae
FG
8441 single_readdir sr;
8442 sr.de = &de;
8443 sr.stx = NULL;
8444 sr.inode = NULL;
8445 sr.full = false;
8446
8447 // our callback fills the dirent and sets sr.full=true on first
8448 // call, and returns -1 the second time around.
8449 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8450 if (ret < -1) {
8451 errno = -ret; // this sucks.
8452 return (dirent *) NULL;
8453 }
8454 if (sr.full) {
8455 return &de;
8456 }
8457 return (dirent *) NULL;
8458}
8459
8460int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8461 struct ceph_statx *stx, unsigned want,
8462 unsigned flags, Inode **out)
8463{
8464 single_readdir sr;
8465 sr.de = de;
8466 sr.stx = stx;
8467 sr.inode = NULL;
8468 sr.full = false;
8469
8470 // our callback fills the dirent and sets sr.full=true on first
8471 // call, and returns -1 the second time around.
8472 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8473 if (r < -1)
8474 return r;
8475 if (out)
8476 *out = sr.inode;
8477 if (sr.full)
8478 return 1;
8479 return 0;
8480}
8481
8482
/* getdents */
// Callback state for packing entries into a caller-supplied buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: just NUL-terminated names
};
8490
8491static int _readdir_getdent_cb(void *p, struct dirent *de,
8492 struct ceph_statx *stx, off_t off, Inode *in)
8493{
8494 struct getdents_result *c = static_cast<getdents_result *>(p);
8495
8496 int dlen;
8497 if (c->fullent)
8498 dlen = sizeof(*de);
8499 else
8500 dlen = strlen(de->d_name) + 1;
8501
8502 if (c->pos + dlen > c->buflen)
8503 return -1; // doesn't fit
8504
8505 if (c->fullent) {
8506 memcpy(c->buf + c->pos, de, sizeof(*de));
8507 } else {
8508 memcpy(c->buf + c->pos, de->d_name, dlen);
8509 }
8510 c->pos += dlen;
8511 return 0;
8512}
8513
8514int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8515{
8516 getdents_result gr;
8517 gr.buf = buf;
8518 gr.buflen = buflen;
8519 gr.fullent = fullent;
8520 gr.pos = 0;
8521
8522 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8523
8524 if (r < 0) { // some error
8525 if (r == -1) { // buffer ran out of space
8526 if (gr.pos) { // but we got some entries already!
8527 return gr.pos;
8528 } // or we need a larger buffer
8529 return -ERANGE;
8530 } else { // actual error, return it
8531 return r;
8532 }
8533 }
8534 return gr.pos;
8535}
8536
8537
/* getdir */
// Callback state for collecting all entry names into a list.
struct getdir_result {
  list<string> *contents;  // destination list of names
  int num;                 // entries collected
};
8543
8544static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8545{
8546 getdir_result *r = static_cast<getdir_result *>(p);
8547
8548 r->contents->push_back(de->d_name);
8549 r->num++;
8550 return 0;
8551}
8552
8553int Client::getdir(const char *relpath, list<string>& contents,
8554 const UserPerm& perms)
8555{
8556 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8557 {
11fdf7f2 8558 std::lock_guard lock(client_lock);
7c673cae
FG
8559 tout(cct) << "getdir" << std::endl;
8560 tout(cct) << relpath << std::endl;
8561 }
8562
8563 dir_result_t *d;
8564 int r = opendir(relpath, &d, perms);
8565 if (r < 0)
8566 return r;
8567
8568 getdir_result gr;
8569 gr.contents = &contents;
8570 gr.num = 0;
8571 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8572
8573 closedir(d);
8574
8575 if (r < 0)
8576 return r;
8577 return gr.num;
8578}
8579
8580
8581/****** file i/o **********/
// open(2) equivalent with optional file layout parameters
// (stripe_unit/stripe_count/object_size/data_pool apply only on create).
// Returns a new integer fd on success, or a negative errno.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  int cflags = ceph_flags_sys2wire(flags);

  ldout(cct, 3) << "open enter(" << relpath << ", " << cflags << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << cflags << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  int r = path_walk(path, &in, perms, followsym, mask);

  // exclusive create of an existing file
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // target missing and O_CREAT requested: create it in the parent dir
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may have produced the Fh already; otherwise open now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << cflags << ") = " << r << dendl;
  return r;
}
8670
// Convenience overload: open using the default file striping layout.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8676
8677int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8678 const UserPerm& perms)
8679{
11fdf7f2
TL
8680 std::lock_guard lock(client_lock);
8681 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8682
181888fb
FG
8683 if (unmounting)
8684 return -ENOTCONN;
8685
7c673cae
FG
8686 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8687 filepath path(ino);
8688 req->set_filepath(path);
8689
8690 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8691 char f[30];
8692 sprintf(f, "%u", h);
8693 filepath path2(dirino);
8694 path2.push_dentry(string(f));
8695 req->set_filepath2(path2);
8696
8697 int r = make_request(req, perms, NULL, NULL,
8698 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8699 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8700 return r;
8701}
8702
8703
/**
 * Load inode into local cache.
 *
 * If the `inode` pointer is non-NULL, also take a reference on the
 * resulting Inode object in the same operation, so the caller can
 * safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // any in-MDS rank can service a LOOKUPINO; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // the reply populated inode_map; hand out a referenced pointer
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8733
1adf2230
AA
// Public wrapper for _lookup_ino(): takes client_lock.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8739
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 *
 * NOTE(review): unlike _lookup_ino()/_lookup_name() there is no
 * `unmounting` check here — confirm all callers perform it first.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8768
7c673cae
FG
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8790
1adf2230
AA
// Public wrapper for _lookup_name(): takes client_lock.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae 8796
// Allocate and initialize an Fh (file handle) for an already-opened
// inode: bumps snap refs for snapshot inodes and configures readahead
// limits and alignments from the client configuration and file layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes are immutable; track open handles via snap_cap_refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // max readahead is the tighter of the byte cap and the period cap
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the layout's period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8828
// Release a file handle: drop the open ref (flushing dirty data and
// re-evaluating caps when it was the last one), release file locks, and
// surface any asynchronous write-back error to the caller.
// Returns 0, or the async error captured on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref for this mode: flush dirty data and update caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: just drop the snap handle refcount
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8863
8864void Client::_put_fh(Fh *f)
8865{
8866 int left = f->put();
8867 if (!left) {
8868 delete f;
8869 }
8870}
8871
// Core open path for an already-resolved inode.  Rejects write-ish opens
// of snapshots, normalizes POSIX flags to wire flags, records a pending
// open ref, and either satisfies the open from already-issued caps or
// issues a CEPH_MDS_OP_OPEN to the MDS.  On success optionally returns a
// new Fh via *fhp; on failure the open ref is undone.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; just update wanted
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    // go to the MDS (O_TRUNC always does, so truncation happens there)
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary stack Fh only used as the get_caps() actor
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the pending-open accounting taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8952
// Re-acquire file caps for an inode, e.g. after they were dropped or a
// session was reestablished.  If we still hold some caps (and either
// don't want write caps or still have an auth cap) a check_caps() is
// enough; otherwise replay an MDS OPEN matching the wanted RD/WR modes.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted caps back into open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8990
// POSIX-style close(2): release the Fh bound to `fd`, remove the fd
// mapping and recycle the fd number.  Returns any async error that was
// pending on the handle (from _release_fh()).
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
9010
9011
9012// ------------
9013// read, write
9014
// POSIX-style lseek(2) wrapper: validate the fd (rejecting O_PATH
// handles on Linux), then delegate to _lseek() under client_lock.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
9035
// Compute and store a new file position for `f`.  For whence values
// that depend on the current file size (SEEK_END, SEEK_DATA, SEEK_HOLE)
// the size is revalidated via _getattr() first.  SEEK_DATA/SEEK_HOLE are
// implemented naively (no layout/hole probing): data is assumed to span
// [0, size), so SEEK_DATA returns the offset itself and SEEK_HOLE the
// file size, per the Linux fallback semantics.  Returns the new position
// or a negative errno (-EINVAL for bad whence or negative result,
// -ENXIO for out-of-range SEEK_DATA/SEEK_HOLE offsets).
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // these modes are relative to the (possibly stale) file size; refresh it
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  // a computed negative position (e.g. SEEK_CUR past the start) is invalid
  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9109
9110
// Acquire the per-handle file-position lock.  Must be called with
// client_lock held.  If the position is already locked (or others are
// queued) we append our condvar to pos_waiters and block on client_lock
// (adopted into a unique_lock, then released un-owned so the outer
// lock_guard semantics are preserved).  Waiters are served FIFO: we only
// proceed once the lock is free AND we are the front of the queue.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // adopt the already-held client_lock for the condvar wait, then
    // release() so this scope doesn't unlock it on exit
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9131
// Release the per-handle file-position lock taken by lock_fh_pos() and
// hand it to the oldest queued waiter, if any.  client_lock must be held.
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
9144
// Migrate MDS-inlined file data out to the first RADOS object of the
// file (<ino>.00000000).  Issues two OSD mutations: an idempotent
// object create, then a guarded write — the cmpxattr on
// "inline_version" (GT, u64) makes the write a no-op if a newer
// uninline already landed.  `onfinish` is completed with the result of
// the second mutation (immediately with 0 if there is no inline data).
// Returns 0; the real outcome is delivered asynchronously.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object name of the file's first stripe object
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard: only write if our inline_version is newer than what's stored
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9189
9190//
9191
9192// blocking osd interface
9193
// POSIX-style read(2): read up to `size` bytes at `offset` (or the
// current position if offset < 0) into `buf`.  The heavy lifting is in
// _read(); the bufferlist is copied out to the caller's buffer after
// client_lock is dropped, so the (possibly slow) memcpy doesn't hold
// the big lock.  Returns bytes read or negative errno.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::unique_lock lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9224
9225int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9226{
9227 if (iovcnt < 0)
9228 return -EINVAL;
9229 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9230}
9231
// Core read path.  Resolves an implicit offset (offset < 0 => current
// position, with the fh position lock held for the duration), acquires
// FILE_RD caps, then serves the read from inline data, the object
// cacher (async), or a synchronous OSD read, retrying on a short read
// if the file turns out to be longer than we thought.  Control flow is
// goto-based so the done: label can release caps / position lock /
// wait out a pending uninline regardless of how we got there.
// Returns bytes read (>= 0) or negative errno.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // read from (and later advance) the shared file position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // the client doesn't know the inline state yet; fetch it
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the cache even if we hold cache caps
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without cache caps: push it to RADOS
      // first, then fall through to a normal object read
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve directly from the inline blob, zero-filling past its end
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	rc = endoff - offset;
      } else {
	rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // short read: maybe the file grew; drop caps, refresh the size,
      // and retry the remainder if there is more data
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
	auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
	if (r < 0) {
	  rc = r;
	  goto done;
	}
      }

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!
  if (onuninline) {
    // wait (outside client_lock) for the uninline to land, then retire
    // the local inline copy on success
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
9379
// Completion for background readahead: pins the Fh and marks a pending
// readahead so the window accounting stays correct until finish() runs.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9385
// Undo the constructor's accounting: readahead no longer pending, and
// drop our reference on the Fh.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9390
// Readahead I/O completed: release the RD|CACHE cap refs taken when the
// readahead was issued in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9395
// Read through the object cacher.  The requested range is first trimmed
// to the known file size.  If the data isn't cached, file_read() returns
// 0 and we block (dropping client_lock) on the completion while holding
// a CACHE cap ref.  Afterwards, kick off a fire-and-forget readahead of
// the window suggested by the per-handle Readahead state; its cap refs
// are released by C_Readahead::finish().  Returns bytes read or errno.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the OSD read without holding client_lock
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// already fully cached: file_read() completed inline and did not
	// take ownership of the context
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9450
// Synchronous (cache-bypassing) read straight from the OSDs via the
// Filer, looping until the request is satisfied.  ENOENT from an OSD is
// treated as a hole (0 bytes).  On a short read inside the known file
// size we zero-fill up to EOF; if we still come up short, *checkeof is
// set so the caller (_read) can revalidate the size and retry.
// Returns total bytes placed in *bl, or negative errno.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // wait for the OSD round-trip without holding client_lock
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (apparent) EOF; let the caller re-check the file size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9510
9511
9512/*
9513 * we keep count of uncommitted sync writes on the inode, so that
9514 * fsync can DDRT.
9515 */
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Called when a synchronous write completes: decrement the outstanding
// sync-write count, drop the BUFFER cap ref taken at submit time, and
// wake an in-progress unmount once the last unsafe write drains.
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9529
// POSIX-style write(2): write `size` bytes from `buf` at `offset` (or
// the current position if offset < 0).  Clamps size to INT_MAX to match
// the int return type, then delegates to _write() in buffer mode
// (NULL iov; the final argument is the iovec count, 0 here).
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9554
9555int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9556{
9557 if (iovcnt < 0)
9558 return -EINVAL;
9559 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9560}
9561
11fdf7f2
TL
// Shared lock-held worker for preadv/pwritev.  Sums the iovec lengths,
// optionally clamps the total to INT_MAX (for callers whose public API
// returns int), then dispatches to _write (which consumes the iov
// directly) or _read (into a bufferlist that is then scattered back
// into the iov).  Returns bytes transferred or negative errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset, bool write,
				       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the contiguous read result back into the caller's iovecs
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}
9608
11fdf7f2
TL
// fd-based entry for preadv/pwritev: take client_lock, resolve the fd,
// and call the lock-held worker with INT_MAX clamping enabled (this
// path returns int to callers).
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
9623
// Core write path.  Accepts either a flat buffer (`buf`) or an iovec
// array; exactly one should be non-NULL.  Handles: max-file-size and
// pool-full checks, O_APPEND positioning (racy w.r.t. later blocking,
// see FIXME below), quota enforcement, setuid/setgid stripping,
// inline-data updates or forced uninlining, and finally either a
// buffered write through the object cacher or a synchronous OSD write.
// On success updates size/mtime/ctime/change_attr and dirties caps.
// goto-based cleanup mirrors _read().  Returns bytes written or errno.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;   // position to install after a successful write
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses buffering even if we hold the caps
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
	endoff > CEPH_INLINE_MAX_SIZE ||
	!(have & CEPH_CAP_FILE_BUFFER)) {
      // result won't fit inline (or we lack BUFFER caps): push inline
      // data out to RADOS, then do a normal object write below
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // splice the new bytes into the local inline blob
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // wait for the OSD round-trip without holding client_lock
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    // install the new implicit file position computed above
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // wait (outside client_lock) for the uninline to land, then retire
    // the local inline copy on success
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9856
9857int Client::_flush(Fh *f)
9858{
9859 Inode *in = f->inode.get();
9860 int err = f->take_async_err();
9861 if (err != 0) {
9862 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9863 << cpp_strerror(err) << dendl;
9864 } else {
9865 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9866 }
9867
9868 return err;
9869}
9870
9871int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9872{
9873 struct ceph_statx stx;
9874 stx.stx_size = length;
9875 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9876}
9877
// fd-based truncate: validate the fd (rejecting O_PATH handles and
// handles not opened for write), then issue a size-only _setattr on the
// file's inode.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  // POSIX: ftruncate requires a handle opened for writing
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;
  struct stat attr;
  attr.st_size = length;
  // only st_size is read; CEPH_SETATTR_SIZE masks out the rest
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9901
9902int Client::fsync(int fd, bool syncdataonly)
9903{
11fdf7f2 9904 std::lock_guard lock(client_lock);
7c673cae
FG
9905 tout(cct) << "fsync" << std::endl;
9906 tout(cct) << fd << std::endl;
9907 tout(cct) << syncdataonly << std::endl;
9908
181888fb
FG
9909 if (unmounting)
9910 return -ENOTCONN;
9911
7c673cae
FG
9912 Fh *f = get_filehandle(fd);
9913 if (!f)
9914 return -EBADF;
9915#if defined(__linux__) && defined(O_PATH)
9916 if (f->flags & O_PATH)
9917 return -EBADF;
9918#endif
9919 int r = _fsync(f, syncdataonly);
9920 if (r == 0) {
9921 // The IOs in this fsync were okay, but maybe something happened
9922 // in the background that we shoudl be reporting?
9923 r = f->take_async_err();
1adf2230 9924 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
7c673cae
FG
9925 << ") = 0, async_err = " << r << dendl;
9926 } else {
9927 // Assume that an error we encountered during fsync, even reported
9928 // synchronously, would also have applied the error to the Fh, and we
9929 // should clear it here to avoid returning the same error again on next
9930 // call.
1adf2230 9931 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
7c673cae
FG
9932 << r << dendl;
9933 f->take_async_err();
9934 }
9935 return r;
9936}
9937
9938int Client::_fsync(Inode *in, bool syncdataonly)
9939{
9940 int r = 0;
11fdf7f2 9941 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
7c673cae
FG
9942 ceph_tid_t flush_tid = 0;
9943 InodeRef tmp_ref;
11fdf7f2
TL
9944 utime_t lat;
9945 utime_t start = ceph_clock_now();
7c673cae 9946
1adf2230 9947 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
7c673cae
FG
9948
9949 if (cct->_conf->client_oc) {
11fdf7f2
TL
9950 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
9951 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
9952 _flush(in, object_cacher_completion.get());
7c673cae
FG
9953 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9954 }
9955
9956 if (!syncdataonly && in->dirty_caps) {
9957 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9958 if (in->flushing_caps)
9959 flush_tid = last_flush_tid;
9960 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9961
9962 if (!syncdataonly && !in->unsafe_ops.empty()) {
28e407b8
AA
9963 flush_mdlog_sync();
9964
7c673cae
FG
9965 MetaRequest *req = in->unsafe_ops.back();
9966 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9967
9968 req->get();
9969 wait_on_list(req->waitfor_safe);
9970 put_request(req);
9971 }
9972
11fdf7f2 9973 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
9f95a23c 9974 client_lock.unlock();
7c673cae 9975 ldout(cct, 15) << "waiting on data to flush" << dendl;
11fdf7f2 9976 r = object_cacher_completion->wait();
9f95a23c 9977 client_lock.lock();
7c673cae
FG
9978 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9979 } else {
9980 // FIXME: this can starve
9981 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9982 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9983 << " uncommitted, waiting" << dendl;
9984 wait_on_list(in->waitfor_commit);
9985 }
9986 }
9987
9988 if (!r) {
9989 if (flush_tid > 0)
9990 wait_sync_caps(in, flush_tid);
9991
9992 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9993 } else {
1adf2230 9994 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
7c673cae
FG
9995 << cpp_strerror(-r) << dendl;
9996 }
11fdf7f2
TL
9997
9998 lat = ceph_clock_now();
9999 lat -= start;
10000 logger->tinc(l_c_fsync, lat);
7c673cae
FG
10001
10002 return r;
10003}
10004
10005int Client::_fsync(Fh *f, bool syncdataonly)
10006{
1adf2230 10007 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
10008 return _fsync(f->inode.get(), syncdataonly);
10009}
10010
10011int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10012{
11fdf7f2 10013 std::lock_guard lock(client_lock);
7c673cae
FG
10014 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10015 tout(cct) << fd << std::endl;
10016
181888fb
FG
10017 if (unmounting)
10018 return -ENOTCONN;
10019
7c673cae
FG
10020 Fh *f = get_filehandle(fd);
10021 if (!f)
10022 return -EBADF;
10023 int r = _getattr(f->inode, mask, perms);
10024 if (r < 0)
10025 return r;
10026 fill_stat(f->inode, stbuf, NULL);
1adf2230 10027 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
10028 return r;
10029}
10030
10031int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10032 unsigned int want, unsigned int flags)
10033{
11fdf7f2 10034 std::lock_guard lock(client_lock);
7c673cae
FG
10035 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10036 tout(cct) << fd << std::endl;
10037
181888fb
FG
10038 if (unmounting)
10039 return -ENOTCONN;
10040
7c673cae
FG
10041 Fh *f = get_filehandle(fd);
10042 if (!f)
10043 return -EBADF;
10044
10045 unsigned mask = statx_to_mask(flags, want);
10046
10047 int r = 0;
94b18763 10048 if (mask && !f->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
10049 r = _getattr(f->inode, mask, perms);
10050 if (r < 0) {
10051 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10052 return r;
10053 }
10054 }
10055
10056 fill_statx(f->inode, mask, stx);
10057 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10058 return r;
10059}
10060
10061// not written yet, but i want to link!
10062
10063int Client::chdir(const char *relpath, std::string &new_cwd,
10064 const UserPerm& perms)
10065{
11fdf7f2 10066 std::lock_guard lock(client_lock);
7c673cae
FG
10067 tout(cct) << "chdir" << std::endl;
10068 tout(cct) << relpath << std::endl;
181888fb
FG
10069
10070 if (unmounting)
10071 return -ENOTCONN;
10072
7c673cae
FG
10073 filepath path(relpath);
10074 InodeRef in;
10075 int r = path_walk(path, &in, perms);
10076 if (r < 0)
10077 return r;
92f5a8d4
TL
10078
10079 if (!(in.get()->is_dir()))
10080 return -ENOTDIR;
10081
7c673cae
FG
10082 if (cwd != in)
10083 cwd.swap(in);
10084 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10085
b5b8bbf5 10086 _getcwd(new_cwd, perms);
7c673cae
FG
10087 return 0;
10088}
10089
b5b8bbf5 10090void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
10091{
10092 filepath path;
11fdf7f2 10093 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
7c673cae
FG
10094
10095 Inode *in = cwd.get();
10096 while (in != root) {
11fdf7f2 10097 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
7c673cae
FG
10098
10099 // A cwd or ancester is unlinked
11fdf7f2 10100 if (in->dentries.empty()) {
7c673cae
FG
10101 return;
10102 }
10103
10104 Dentry *dn = in->get_first_parent();
10105
10106
10107 if (!dn) {
10108 // look it up
11fdf7f2 10109 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
7c673cae
FG
10110 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10111 filepath path(in->ino);
10112 req->set_filepath(path);
10113 req->set_inode(in);
10114 int res = make_request(req, perms);
10115 if (res < 0)
10116 break;
10117
10118 // start over
10119 path = filepath();
10120 in = cwd.get();
10121 continue;
10122 }
10123 path.push_front_dentry(dn->name);
10124 in = dn->dir->parent_inode;
10125 }
10126 dir = "/";
10127 dir += path.get_path();
10128}
10129
b5b8bbf5
FG
10130void Client::getcwd(string& dir, const UserPerm& perms)
10131{
11fdf7f2 10132 std::lock_guard l(client_lock);
181888fb
FG
10133 if (!unmounting)
10134 _getcwd(dir, perms);
b5b8bbf5
FG
10135}
10136
7c673cae
FG
10137int Client::statfs(const char *path, struct statvfs *stbuf,
10138 const UserPerm& perms)
10139{
11fdf7f2
TL
10140 std::lock_guard l(client_lock);
10141 tout(cct) << __func__ << std::endl;
91327a77 10142 unsigned long int total_files_on_fs;
7c673cae 10143
181888fb
FG
10144 if (unmounting)
10145 return -ENOTCONN;
10146
7c673cae
FG
10147 ceph_statfs stats;
10148 C_SaferCond cond;
d2e6a577
FG
10149
10150 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10151 if (data_pools.size() == 1) {
10152 objecter->get_fs_stats(stats, data_pools[0], &cond);
10153 } else {
10154 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10155 }
7c673cae 10156
9f95a23c 10157 client_lock.unlock();
7c673cae 10158 int rval = cond.wait();
91327a77
AA
10159 assert(root);
10160 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
9f95a23c 10161 client_lock.lock();
7c673cae
FG
10162
10163 if (rval < 0) {
10164 ldout(cct, 1) << "underlying call to statfs returned error: "
10165 << cpp_strerror(rval)
10166 << dendl;
10167 return rval;
10168 }
10169
10170 memset(stbuf, 0, sizeof(*stbuf));
10171
10172 /*
10173 * we're going to set a block size of 4MB so we can represent larger
10174 * FSes without overflowing. Additionally convert the space
10175 * measurements from KB to bytes while making them in terms of
10176 * blocks. We use 4MB only because it is big enough, and because it
10177 * actually *is* the (ceph) default block size.
10178 */
10179 const int CEPH_BLOCK_SHIFT = 22;
10180 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10181 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
10182 stbuf->f_files = total_files_on_fs;
10183 stbuf->f_ffree = 0;
7c673cae
FG
10184 stbuf->f_favail = -1;
10185 stbuf->f_fsid = -1; // ??
10186 stbuf->f_flag = 0; // ??
10187 stbuf->f_namemax = NAME_MAX;
10188
10189 // Usually quota_root will == root_ancestor, but if the mount root has no
10190 // quota but we can see a parent of it that does have a quota, we'll
10191 // respect that one instead.
11fdf7f2 10192 ceph_assert(root != nullptr);
7c673cae
FG
10193 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10194
10195 // get_quota_root should always give us something
10196 // because client quotas are always enabled
11fdf7f2 10197 ceph_assert(quota_root != nullptr);
7c673cae
FG
10198
10199 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10200
10201 // Skip the getattr if any sessions are stale, as we don't want to
10202 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10203 // is unhealthy.
10204 if (!_any_stale_sessions()) {
10205 int r = _getattr(quota_root, 0, perms, true);
10206 if (r != 0) {
10207 // Ignore return value: error getting latest inode metadata is not a good
10208 // reason to break "df".
10209 lderr(cct) << "Error in getattr on quota root 0x"
10210 << std::hex << quota_root->ino << std::dec
10211 << " statfs result may be outdated" << dendl;
10212 }
10213 }
10214
10215 // Special case: if there is a size quota set on the Inode acting
10216 // as the root for this client mount, then report the quota status
10217 // as the filesystem statistics.
10218 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10219 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10220 // It is possible for a quota to be exceeded: arithmetic here must
10221 // handle case where used > total.
10222 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10223
10224 stbuf->f_blocks = total;
10225 stbuf->f_bfree = free;
10226 stbuf->f_bavail = free;
10227 } else {
d2e6a577 10228 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10229 // multiple pools may be used without one filesystem namespace via
10230 // layouts, this is the most correct thing we can do.
10231 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10232 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10233 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10234 }
10235
10236 return rval;
10237}
10238
10239int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10240 struct flock *fl, uint64_t owner, bool removing)
10241{
11fdf7f2 10242 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10243 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10244 << " type " << fl->l_type << " owner " << owner
10245 << " " << fl->l_start << "~" << fl->l_len << dendl;
10246
f6b5b4d7
TL
10247 if (in->flags & I_ERROR_FILELOCK)
10248 return -EIO;
10249
7c673cae
FG
10250 int lock_cmd;
10251 if (F_RDLCK == fl->l_type)
10252 lock_cmd = CEPH_LOCK_SHARED;
10253 else if (F_WRLCK == fl->l_type)
10254 lock_cmd = CEPH_LOCK_EXCL;
10255 else if (F_UNLCK == fl->l_type)
10256 lock_cmd = CEPH_LOCK_UNLOCK;
10257 else
10258 return -EIO;
10259
10260 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10261 sleep = 0;
10262
10263 /*
10264 * Set the most significant bit, so that MDS knows the 'owner'
10265 * is sufficient to identify the owner of lock. (old code uses
10266 * both 'owner' and 'pid')
10267 */
10268 owner |= (1ULL << 63);
10269
10270 MetaRequest *req = new MetaRequest(op);
10271 filepath path;
10272 in->make_nosnap_relative_path(path);
10273 req->set_filepath(path);
10274 req->set_inode(in);
10275
10276 req->head.args.filelock_change.rule = lock_type;
10277 req->head.args.filelock_change.type = lock_cmd;
10278 req->head.args.filelock_change.owner = owner;
10279 req->head.args.filelock_change.pid = fl->l_pid;
10280 req->head.args.filelock_change.start = fl->l_start;
10281 req->head.args.filelock_change.length = fl->l_len;
10282 req->head.args.filelock_change.wait = sleep;
10283
10284 int ret;
10285 bufferlist bl;
10286
10287 if (sleep && switch_interrupt_cb) {
10288 // enable interrupt
10289 switch_interrupt_cb(callback_handle, req->get());
10290 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10291 // disable interrupt
10292 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10293 if (ret == 0 && req->aborted()) {
10294 // effect of this lock request has been revoked by the 'lock intr' request
10295 ret = req->get_abort_code();
10296 }
7c673cae
FG
10297 put_request(req);
10298 } else {
10299 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10300 }
10301
10302 if (ret == 0) {
10303 if (op == CEPH_MDS_OP_GETFILELOCK) {
10304 ceph_filelock filelock;
11fdf7f2
TL
10305 auto p = bl.cbegin();
10306 decode(filelock, p);
7c673cae
FG
10307
10308 if (CEPH_LOCK_SHARED == filelock.type)
10309 fl->l_type = F_RDLCK;
10310 else if (CEPH_LOCK_EXCL == filelock.type)
10311 fl->l_type = F_WRLCK;
10312 else
10313 fl->l_type = F_UNLCK;
10314
10315 fl->l_whence = SEEK_SET;
10316 fl->l_start = filelock.start;
10317 fl->l_len = filelock.length;
10318 fl->l_pid = filelock.pid;
10319 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10320 ceph_lock_state_t *lock_state;
10321 if (lock_type == CEPH_LOCK_FCNTL) {
10322 if (!in->fcntl_locks)
11fdf7f2
TL
10323 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10324 lock_state = in->fcntl_locks.get();
7c673cae
FG
10325 } else if (lock_type == CEPH_LOCK_FLOCK) {
10326 if (!in->flock_locks)
11fdf7f2
TL
10327 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10328 lock_state = in->flock_locks.get();
7c673cae
FG
10329 } else {
10330 ceph_abort();
10331 return -EINVAL;
10332 }
10333 _update_lock_state(fl, owner, lock_state);
10334
10335 if (!removing) {
10336 if (lock_type == CEPH_LOCK_FCNTL) {
10337 if (!fh->fcntl_locks)
11fdf7f2
TL
10338 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10339 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10340 } else {
10341 if (!fh->flock_locks)
11fdf7f2
TL
10342 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10343 lock_state = fh->flock_locks.get();
7c673cae
FG
10344 }
10345 _update_lock_state(fl, owner, lock_state);
10346 }
10347 } else
10348 ceph_abort();
10349 }
10350 return ret;
10351}
10352
10353int Client::_interrupt_filelock(MetaRequest *req)
10354{
31f18b77
FG
10355 // Set abort code, but do not kick. The abort code prevents the request
10356 // from being re-sent.
10357 req->abort(-EINTR);
10358 if (req->mds < 0)
10359 return 0; // haven't sent the request
10360
7c673cae
FG
10361 Inode *in = req->inode();
10362
10363 int lock_type;
10364 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10365 lock_type = CEPH_LOCK_FLOCK_INTR;
10366 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10367 lock_type = CEPH_LOCK_FCNTL_INTR;
10368 else {
10369 ceph_abort();
10370 return -EINVAL;
10371 }
10372
10373 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10374 filepath path;
10375 in->make_nosnap_relative_path(path);
10376 intr_req->set_filepath(path);
10377 intr_req->set_inode(in);
10378 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10379 intr_req->head.args.filelock_change.rule = lock_type;
10380 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10381
10382 UserPerm perms(req->get_uid(), req->get_gid());
10383 return make_request(intr_req, perms, NULL, NULL, -1);
10384}
10385
10386void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10387{
10388 if (!in->fcntl_locks && !in->flock_locks)
10389 return;
10390
10391 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10392 encode(nr_fcntl_locks, bl);
7c673cae 10393 if (nr_fcntl_locks) {
11fdf7f2 10394 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10395 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10396 p != lock_state->held_locks.end();
10397 ++p)
11fdf7f2 10398 encode(p->second, bl);
7c673cae
FG
10399 }
10400
10401 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10402 encode(nr_flock_locks, bl);
7c673cae 10403 if (nr_flock_locks) {
11fdf7f2 10404 auto &lock_state = in->flock_locks;
7c673cae
FG
10405 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10406 p != lock_state->held_locks.end();
10407 ++p)
11fdf7f2 10408 encode(p->second, bl);
7c673cae
FG
10409 }
10410
11fdf7f2 10411 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10412 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10413}
10414
10415void Client::_release_filelocks(Fh *fh)
10416{
10417 if (!fh->fcntl_locks && !fh->flock_locks)
10418 return;
10419
10420 Inode *in = fh->inode.get();
11fdf7f2 10421 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae 10422
f6b5b4d7
TL
10423 list<ceph_filelock> activated_locks;
10424
7c673cae
FG
10425 list<pair<int, ceph_filelock> > to_release;
10426
10427 if (fh->fcntl_locks) {
11fdf7f2 10428 auto &lock_state = fh->fcntl_locks;
f6b5b4d7
TL
10429 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10430 auto q = p++;
10431 if (in->flags & I_ERROR_FILELOCK) {
10432 lock_state->remove_lock(q->second, activated_locks);
10433 } else {
10434 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
10435 }
10436 }
11fdf7f2 10437 lock_state.reset();
7c673cae
FG
10438 }
10439 if (fh->flock_locks) {
11fdf7f2 10440 auto &lock_state = fh->flock_locks;
f6b5b4d7
TL
10441 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10442 auto q = p++;
10443 if (in->flags & I_ERROR_FILELOCK) {
10444 lock_state->remove_lock(q->second, activated_locks);
10445 } else {
10446 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
10447 }
10448 }
11fdf7f2 10449 lock_state.reset();
7c673cae
FG
10450 }
10451
f6b5b4d7
TL
10452 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
10453 in->flags &= ~I_ERROR_FILELOCK;
7c673cae 10454
f6b5b4d7 10455 if (to_release.empty())
11fdf7f2
TL
10456 return;
10457
7c673cae
FG
10458 struct flock fl;
10459 memset(&fl, 0, sizeof(fl));
10460 fl.l_whence = SEEK_SET;
10461 fl.l_type = F_UNLCK;
10462
10463 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10464 p != to_release.end();
10465 ++p) {
10466 fl.l_start = p->second.start;
10467 fl.l_len = p->second.length;
10468 fl.l_pid = p->second.pid;
10469 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10470 p->second.owner, true);
10471 }
10472}
10473
10474void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10475 ceph_lock_state_t *lock_state)
10476{
10477 int lock_cmd;
10478 if (F_RDLCK == fl->l_type)
10479 lock_cmd = CEPH_LOCK_SHARED;
10480 else if (F_WRLCK == fl->l_type)
10481 lock_cmd = CEPH_LOCK_EXCL;
10482 else
10483 lock_cmd = CEPH_LOCK_UNLOCK;;
10484
10485 ceph_filelock filelock;
10486 filelock.start = fl->l_start;
10487 filelock.length = fl->l_len;
10488 filelock.client = 0;
10489 // see comment in _do_filelock()
10490 filelock.owner = owner | (1ULL << 63);
10491 filelock.pid = fl->l_pid;
10492 filelock.type = lock_cmd;
10493
10494 if (filelock.type == CEPH_LOCK_UNLOCK) {
10495 list<ceph_filelock> activated_locks;
10496 lock_state->remove_lock(filelock, activated_locks);
10497 } else {
10498 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10499 ceph_assert(r);
7c673cae
FG
10500 }
10501}
10502
10503int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10504{
10505 Inode *in = fh->inode.get();
10506 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10507 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10508 return ret;
10509}
10510
10511int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10512{
10513 Inode *in = fh->inode.get();
10514 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10515 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10516 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10517 return ret;
10518}
10519
10520int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10521{
10522 Inode *in = fh->inode.get();
10523 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10524
10525 int sleep = !(cmd & LOCK_NB);
10526 cmd &= ~LOCK_NB;
10527
10528 int type;
10529 switch (cmd) {
10530 case LOCK_SH:
10531 type = F_RDLCK;
10532 break;
10533 case LOCK_EX:
10534 type = F_WRLCK;
10535 break;
10536 case LOCK_UN:
10537 type = F_UNLCK;
10538 break;
10539 default:
10540 return -EINVAL;
10541 }
10542
10543 struct flock fl;
10544 memset(&fl, 0, sizeof(fl));
10545 fl.l_type = type;
10546 fl.l_whence = SEEK_SET;
10547
10548 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10549 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10550 return ret;
10551}
10552
10553int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10554{
10555 /* Since the only thing this does is wrap a call to statfs, and
10556 statfs takes a lock, it doesn't seem we have a need to split it
10557 out. */
10558 return statfs(0, stbuf, perms);
10559}
10560
e306af50 10561void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
7c673cae
FG
10562{
10563 if (!args)
10564 return;
11fdf7f2
TL
10565 std::lock_guard l(client_lock);
10566 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
10567 << " invalidate_ino_cb " << args->ino_cb
10568 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
10569 << " switch_interrupt_cb " << args->switch_intr_cb
10570 << " remount_cb " << args->remount_cb
10571 << dendl;
10572 callback_handle = args->handle;
10573 if (args->ino_cb) {
10574 ino_invalidate_cb = args->ino_cb;
10575 async_ino_invalidator.start();
10576 }
10577 if (args->dentry_cb) {
10578 dentry_invalidate_cb = args->dentry_cb;
10579 async_dentry_invalidator.start();
10580 }
10581 if (args->switch_intr_cb) {
10582 switch_interrupt_cb = args->switch_intr_cb;
10583 interrupt_finisher.start();
10584 }
10585 if (args->remount_cb) {
10586 remount_cb = args->remount_cb;
10587 remount_finisher.start();
10588 }
e306af50
TL
10589 if (args->ino_release_cb) {
10590 ino_release_cb = args->ino_release_cb;
10591 async_ino_releasor.start();
10592 }
10593 if (args->umask_cb)
10594 umask_cb = args->umask_cb;
7c673cae
FG
10595}
10596
10597int Client::test_dentry_handling(bool can_invalidate)
10598{
10599 int r = 0;
10600
10601 can_invalidate_dentries = can_invalidate;
10602
10603 if (can_invalidate_dentries) {
11fdf7f2 10604 ceph_assert(dentry_invalidate_cb);
7c673cae 10605 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10606 r = 0;
11fdf7f2
TL
10607 } else {
10608 ceph_assert(remount_cb);
7c673cae 10609 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10610 r = _do_remount(false);
b32b8144 10611 }
11fdf7f2 10612
7c673cae
FG
10613 return r;
10614}
10615
10616int Client::_sync_fs()
10617{
11fdf7f2 10618 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10619
10620 // flush file data
11fdf7f2
TL
10621 std::unique_ptr<C_SaferCond> cond = nullptr;
10622 if (cct->_conf->client_oc) {
10623 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10624 objectcacher->flush_all(cond.get());
10625 }
7c673cae
FG
10626
10627 // flush caps
10628 flush_caps_sync();
10629 ceph_tid_t flush_tid = last_flush_tid;
10630
10631 // wait for unsafe mds requests
10632 wait_unsafe_requests();
10633
10634 wait_sync_caps(flush_tid);
10635
11fdf7f2 10636 if (nullptr != cond) {
9f95a23c 10637 client_lock.unlock();
11fdf7f2
TL
10638 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10639 cond->wait();
10640 ldout(cct, 15) << __func__ << " flush finished" << dendl;
9f95a23c 10641 client_lock.lock();
7c673cae
FG
10642 }
10643
10644 return 0;
10645}
10646
10647int Client::sync_fs()
10648{
11fdf7f2 10649 std::lock_guard l(client_lock);
181888fb
FG
10650
10651 if (unmounting)
10652 return -ENOTCONN;
10653
7c673cae
FG
10654 return _sync_fs();
10655}
10656
10657int64_t Client::drop_caches()
10658{
11fdf7f2 10659 std::lock_guard l(client_lock);
7c673cae
FG
10660 return objectcacher->release_all();
10661}
10662
11fdf7f2
TL
10663int Client::_lazyio(Fh *fh, int enable)
10664{
10665 Inode *in = fh->inode.get();
10666 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10667
10668 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10669 return 0;
10670
10671 int orig_mode = fh->mode;
10672 if (enable) {
10673 fh->mode |= CEPH_FILE_MODE_LAZY;
10674 in->get_open_ref(fh->mode);
10675 in->put_open_ref(orig_mode);
10676 check_caps(in, CHECK_CAPS_NODELAY);
10677 } else {
10678 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10679 in->get_open_ref(fh->mode);
10680 in->put_open_ref(orig_mode);
10681 check_caps(in, 0);
10682 }
10683
10684 return 0;
10685}
10686
10687int Client::lazyio(int fd, int enable)
10688{
10689 std::lock_guard l(client_lock);
10690 Fh *f = get_filehandle(fd);
10691 if (!f)
10692 return -EBADF;
10693
10694 return _lazyio(f, enable);
10695}
10696
10697int Client::ll_lazyio(Fh *fh, int enable)
10698{
10699 std::lock_guard lock(client_lock);
10700 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10701 tout(cct) << __func__ << std::endl;
10702
10703 return _lazyio(fh, enable);
10704}
7c673cae 10705
92f5a8d4 10706int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 10707{
11fdf7f2 10708 std::lock_guard l(client_lock);
92f5a8d4 10709 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
10710 << ", " << offset << ", " << count << ")" << dendl;
10711
10712 Fh *f = get_filehandle(fd);
10713 if (!f)
10714 return -EBADF;
10715
10716 // for now
10717 _fsync(f, true);
10718
10719 return 0;
10720}
10721
10722int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10723{
11fdf7f2 10724 std::lock_guard l(client_lock);
7c673cae
FG
10725 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10726 << ", " << offset << ", " << count << ")" << dendl;
10727
10728 Fh *f = get_filehandle(fd);
10729 if (!f)
10730 return -EBADF;
10731 Inode *in = f->inode.get();
10732
10733 _fsync(f, true);
92f5a8d4
TL
10734 if (_release(in)) {
10735 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10736 if (r < 0)
10737 return r;
10738 }
7c673cae
FG
10739 return 0;
10740}
10741
10742
10743// =============================
10744// snaps
10745
10746int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10747{
11fdf7f2 10748 std::lock_guard l(client_lock);
181888fb
FG
10749
10750 if (unmounting)
10751 return -ENOTCONN;
10752
7c673cae
FG
10753 filepath path(relpath);
10754 InodeRef in;
10755 int r = path_walk(path, &in, perm);
10756 if (r < 0)
10757 return r;
10758 if (cct->_conf->client_permissions) {
10759 r = may_create(in.get(), perm);
10760 if (r < 0)
10761 return r;
10762 }
10763 Inode *snapdir = open_snapdir(in.get());
10764 return _mkdir(snapdir, name, 0, perm);
10765}
181888fb 10766
7c673cae
FG
10767int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10768{
11fdf7f2 10769 std::lock_guard l(client_lock);
181888fb
FG
10770
10771 if (unmounting)
10772 return -ENOTCONN;
10773
7c673cae
FG
10774 filepath path(relpath);
10775 InodeRef in;
10776 int r = path_walk(path, &in, perms);
10777 if (r < 0)
10778 return r;
10779 if (cct->_conf->client_permissions) {
10780 r = may_delete(in.get(), NULL, perms);
10781 if (r < 0)
10782 return r;
10783 }
10784 Inode *snapdir = open_snapdir(in.get());
10785 return _rmdir(snapdir, name, perms);
10786}
10787
10788// =============================
10789// expose caps
10790
10791int Client::get_caps_issued(int fd) {
10792
11fdf7f2 10793 std::lock_guard lock(client_lock);
7c673cae 10794
181888fb
FG
10795 if (unmounting)
10796 return -ENOTCONN;
10797
7c673cae
FG
10798 Fh *f = get_filehandle(fd);
10799 if (!f)
10800 return -EBADF;
10801
10802 return f->inode->caps_issued();
10803}
10804
10805int Client::get_caps_issued(const char *path, const UserPerm& perms)
10806{
11fdf7f2 10807 std::lock_guard lock(client_lock);
181888fb
FG
10808
10809 if (unmounting)
10810 return -ENOTCONN;
10811
7c673cae
FG
10812 filepath p(path);
10813 InodeRef in;
10814 int r = path_walk(p, &in, perms, true);
10815 if (r < 0)
10816 return r;
10817 return in->caps_issued();
10818}
10819
10820// =========================================
10821// low level
10822
10823Inode *Client::open_snapdir(Inode *diri)
10824{
10825 Inode *in;
10826 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10827 if (!inode_map.count(vino)) {
10828 in = new Inode(this, vino, &diri->layout);
10829
10830 in->ino = diri->ino;
10831 in->snapid = CEPH_SNAPDIR;
10832 in->mode = diri->mode;
10833 in->uid = diri->uid;
10834 in->gid = diri->gid;
494da23a 10835 in->nlink = 1;
7c673cae
FG
10836 in->mtime = diri->mtime;
10837 in->ctime = diri->ctime;
10838 in->btime = diri->btime;
f6b5b4d7 10839 in->atime = diri->atime;
7c673cae
FG
10840 in->size = diri->size;
10841 in->change_attr = diri->change_attr;
10842
10843 in->dirfragtree.clear();
10844 in->snapdir_parent = diri;
10845 diri->flags |= I_SNAPDIR_OPEN;
10846 inode_map[vino] = in;
10847 if (use_faked_inos())
10848 _assign_faked_ino(in);
10849 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10850 } else {
10851 in = inode_map[vino];
10852 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10853 }
10854 return in;
10855}
10856
10857int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10858 Inode **out, const UserPerm& perms)
10859{
11fdf7f2 10860 std::lock_guard lock(client_lock);
31f18b77 10861 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
10862 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10863 tout(cct) << __func__ << std::endl;
7c673cae
FG
10864 tout(cct) << name << std::endl;
10865
181888fb
FG
10866 if (unmounting)
10867 return -ENOTCONN;
10868
7c673cae 10869 int r = 0;
11fdf7f2
TL
10870 if (!fuse_default_permissions) {
10871 if (strcmp(name, ".") && strcmp(name, "..")) {
10872 r = may_lookup(parent, perms);
10873 if (r < 0)
10874 return r;
10875 }
7c673cae
FG
10876 }
10877
10878 string dname(name);
10879 InodeRef in;
10880
10881 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10882 if (r < 0) {
10883 attr->st_ino = 0;
10884 goto out;
10885 }
10886
11fdf7f2 10887 ceph_assert(in);
7c673cae
FG
10888 fill_stat(in, attr);
10889 _ll_get(in.get());
10890
10891 out:
11fdf7f2 10892 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10893 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10894 tout(cct) << attr->st_ino << std::endl;
10895 *out = in.get();
10896 return r;
10897}
10898
1adf2230
AA
// Resolve an inode by number, taking an ll reference on it and — when
// it is not root and has no cached dentry yet — priming the dentry
// cache with its parent linkage and name so later path-based
// operations can find it.
//
// On success *inode holds an ll-referenced Inode which the caller must
// release (ll_forget/ll_put); the temporary reference taken on the
// parent is always dropped before returning.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  // a dentry already links this inode into the namespace; nothing to do
  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // drop the reference taken in _lookup_ino before failing
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // parent was only needed to establish the dentry; release it
  _ll_forget(parent, 1);
  return 0;
}
10950
7c673cae
FG
10951int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10952 struct ceph_statx *stx, unsigned want, unsigned flags,
10953 const UserPerm& perms)
10954{
11fdf7f2 10955 std::lock_guard lock(client_lock);
31f18b77 10956 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 10957 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
10958 tout(cct) << "ll_lookupx" << std::endl;
10959 tout(cct) << name << std::endl;
10960
181888fb
FG
10961 if (unmounting)
10962 return -ENOTCONN;
10963
7c673cae 10964 int r = 0;
11fdf7f2 10965 if (!fuse_default_permissions) {
7c673cae
FG
10966 r = may_lookup(parent, perms);
10967 if (r < 0)
10968 return r;
10969 }
10970
10971 string dname(name);
10972 InodeRef in;
10973
10974 unsigned mask = statx_to_mask(flags, want);
10975 r = _lookup(parent, dname, mask, &in, perms);
10976 if (r < 0) {
10977 stx->stx_ino = 0;
10978 stx->stx_mask = 0;
10979 } else {
11fdf7f2 10980 ceph_assert(in);
7c673cae
FG
10981 fill_statx(in, mask, stx);
10982 _ll_get(in.get());
10983 }
10984
11fdf7f2 10985 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10986 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10987 tout(cct) << stx->stx_ino << std::endl;
10988 *out = in.get();
10989 return r;
10990}
10991
10992int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10993 unsigned int want, unsigned int flags, const UserPerm& perms)
10994{
11fdf7f2 10995 std::lock_guard lock(client_lock);
181888fb
FG
10996
10997 if (unmounting)
10998 return -ENOTCONN;
10999
7c673cae
FG
11000 filepath fp(name, 0);
11001 InodeRef in;
11002 int rc;
11003 unsigned mask = statx_to_mask(flags, want);
11004
11fdf7f2
TL
11005 ldout(cct, 3) << __func__ << " " << name << dendl;
11006 tout(cct) << __func__ << std::endl;
7c673cae
FG
11007 tout(cct) << name << std::endl;
11008
11009 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11010 if (rc < 0) {
11011 /* zero out mask, just in case... */
11012 stx->stx_mask = 0;
11013 stx->stx_ino = 0;
11014 *out = NULL;
11015 return rc;
11016 } else {
11fdf7f2 11017 ceph_assert(in);
7c673cae
FG
11018 fill_statx(in, mask, stx);
11019 _ll_get(in.get());
11020 *out = in.get();
11021 return 0;
11022 }
11023}
11024
// Take a low-level (FUSE-style) reference on an inode.  The first ll
// reference additionally pins the in-memory Inode (in->get()), pins the
// single parent dentry for directories, and bumps the per-snapshot
// reference count in ll_snap_ref for snap inodes.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    // transition 0 -> nonzero: establish the pins described above
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11039
// Drop 'num' low-level references from 'in'.  When the ll refcount
// reaches zero this undoes everything _ll_get() set up on the first
// reference: unpins the parent dentry (for dirs), releases the
// per-snapshot count in ll_snap_ref, and drops the inode pin via
// put_inode().  Returns 0 when the last ll reference was dropped,
// otherwise the remaining ll refcount.
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11062
// Drop every outstanding low-level reference (used during unmount).
// _ll_put() may erase the inode from inode_map, so the next iterator is
// captured before each call; the to_be_put set keeps each Inode alive
// until the loop finishes, releasing them all on function exit.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // advance before _ll_put() can invalidate 'it'
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11080
// Drop 'count' low-level references on 'in' (the FUSE "forget"
// operation).  Returns true when no ll references remain afterwards,
// or when the forget was deliberately ignored (unmounting, or root).
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // the kernel should never forget more refs than it was handed;
    // warn and drop everything we have rather than underflowing
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11109
494da23a 11110bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 11111{
11fdf7f2 11112 std::lock_guard lock(client_lock);
1adf2230
AA
11113 return _ll_forget(in, count);
11114}
11115
7c673cae
FG
11116bool Client::ll_put(Inode *in)
11117{
11118 /* ll_forget already takes the lock */
11119 return ll_forget(in, 1);
11120}
11121
11fdf7f2
TL
11122int Client::ll_get_snap_ref(snapid_t snap)
11123{
11124 std::lock_guard lock(client_lock);
11125 auto p = ll_snap_ref.find(snap);
11126 if (p != ll_snap_ref.end())
11127 return p->second;
11128 return 0;
11129}
11130
7c673cae
FG
11131snapid_t Client::ll_get_snapid(Inode *in)
11132{
11fdf7f2 11133 std::lock_guard lock(client_lock);
7c673cae
FG
11134 return in->snapid;
11135}
11136
11137Inode *Client::ll_get_inode(ino_t ino)
11138{
11fdf7f2 11139 std::lock_guard lock(client_lock);
181888fb
FG
11140
11141 if (unmounting)
11142 return NULL;
11143
7c673cae
FG
11144 vinodeno_t vino = _map_faked_ino(ino);
11145 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11146 if (p == inode_map.end())
11147 return NULL;
11148 Inode *in = p->second;
11149 _ll_get(in);
11150 return in;
11151}
11152
11153Inode *Client::ll_get_inode(vinodeno_t vino)
11154{
11fdf7f2 11155 std::lock_guard lock(client_lock);
181888fb
FG
11156
11157 if (unmounting)
11158 return NULL;
11159
7c673cae
FG
11160 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11161 if (p == inode_map.end())
11162 return NULL;
11163 Inode *in = p->second;
11164 _ll_get(in);
11165 return in;
11166}
11167
11168int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11169{
11170 vinodeno_t vino = _get_vino(in);
11171
11fdf7f2
TL
11172 ldout(cct, 8) << __func__ << " " << vino << dendl;
11173 tout(cct) << __func__ << std::endl;
7c673cae
FG
11174 tout(cct) << vino.ino.val << std::endl;
11175
11176 if (vino.snapid < CEPH_NOSNAP)
11177 return 0;
11178 else
11179 return _getattr(in, caps, perms);
11180}
11181
11182int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11183{
11fdf7f2 11184 std::lock_guard lock(client_lock);
7c673cae 11185
181888fb
FG
11186 if (unmounting)
11187 return -ENOTCONN;
11188
7c673cae
FG
11189 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11190
11191 if (res == 0)
11192 fill_stat(in, attr);
11fdf7f2 11193 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11194 return res;
11195}
11196
11197int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11198 unsigned int flags, const UserPerm& perms)
11199{
11fdf7f2 11200 std::lock_guard lock(client_lock);
7c673cae 11201
181888fb
FG
11202 if (unmounting)
11203 return -ENOTCONN;
11204
7c673cae
FG
11205 int res = 0;
11206 unsigned mask = statx_to_mask(flags, want);
11207
94b18763 11208 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
11209 res = _ll_getattr(in, mask, perms);
11210
11211 if (res == 0)
11212 fill_statx(in, mask, stx);
11fdf7f2 11213 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11214 return res;
11215}
11216
11217int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11218 const UserPerm& perms, InodeRef *inp)
11219{
11220 vinodeno_t vino = _get_vino(in);
11221
11fdf7f2 11222 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 11223 << dendl;
11fdf7f2 11224 tout(cct) << __func__ << std::endl;
7c673cae
FG
11225 tout(cct) << vino.ino.val << std::endl;
11226 tout(cct) << stx->stx_mode << std::endl;
11227 tout(cct) << stx->stx_uid << std::endl;
11228 tout(cct) << stx->stx_gid << std::endl;
11229 tout(cct) << stx->stx_size << std::endl;
11230 tout(cct) << stx->stx_mtime << std::endl;
11231 tout(cct) << stx->stx_atime << std::endl;
11232 tout(cct) << stx->stx_btime << std::endl;
11233 tout(cct) << mask << std::endl;
11234
11fdf7f2 11235 if (!fuse_default_permissions) {
7c673cae
FG
11236 int res = may_setattr(in, stx, mask, perms);
11237 if (res < 0)
11238 return res;
11239 }
11240
11241 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11242
11243 return __setattrx(in, stx, mask, perms, inp);
11244}
11245
11246int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11247 const UserPerm& perms)
11248{
11fdf7f2 11249 std::lock_guard lock(client_lock);
181888fb
FG
11250
11251 if (unmounting)
11252 return -ENOTCONN;
11253
7c673cae
FG
11254 InodeRef target(in);
11255 int res = _ll_setattrx(in, stx, mask, perms, &target);
11256 if (res == 0) {
11fdf7f2 11257 ceph_assert(in == target.get());
7c673cae
FG
11258 fill_statx(in, in->caps_issued(), stx);
11259 }
11260
11fdf7f2 11261 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11262 return res;
11263}
11264
11265int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11266 const UserPerm& perms)
11267{
11268 struct ceph_statx stx;
11269 stat_to_statx(attr, &stx);
11270
11fdf7f2 11271 std::lock_guard lock(client_lock);
181888fb
FG
11272
11273 if (unmounting)
11274 return -ENOTCONN;
11275
7c673cae
FG
11276 InodeRef target(in);
11277 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11278 if (res == 0) {
11fdf7f2 11279 ceph_assert(in == target.get());
7c673cae
FG
11280 fill_stat(in, attr);
11281 }
11282
11fdf7f2 11283 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11284 return res;
11285}
11286
11287
11288// ----------
11289// xattrs
11290
11291int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11292 const UserPerm& perms)
11293{
11fdf7f2 11294 std::lock_guard lock(client_lock);
181888fb
FG
11295
11296 if (unmounting)
11297 return -ENOTCONN;
11298
7c673cae
FG
11299 InodeRef in;
11300 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11301 if (r < 0)
11302 return r;
11303 return _getxattr(in, name, value, size, perms);
11304}
11305
11306int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11307 const UserPerm& perms)
11308{
11fdf7f2 11309 std::lock_guard lock(client_lock);
181888fb
FG
11310
11311 if (unmounting)
11312 return -ENOTCONN;
11313
7c673cae
FG
11314 InodeRef in;
11315 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11316 if (r < 0)
11317 return r;
11318 return _getxattr(in, name, value, size, perms);
11319}
11320
11321int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11322 const UserPerm& perms)
11323{
11fdf7f2 11324 std::lock_guard lock(client_lock);
181888fb
FG
11325
11326 if (unmounting)
11327 return -ENOTCONN;
11328
7c673cae
FG
11329 Fh *f = get_filehandle(fd);
11330 if (!f)
11331 return -EBADF;
11332 return _getxattr(f->inode, name, value, size, perms);
11333}
11334
11335int Client::listxattr(const char *path, char *list, size_t size,
11336 const UserPerm& perms)
11337{
11fdf7f2 11338 std::lock_guard lock(client_lock);
181888fb
FG
11339
11340 if (unmounting)
11341 return -ENOTCONN;
11342
7c673cae
FG
11343 InodeRef in;
11344 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11345 if (r < 0)
11346 return r;
11347 return Client::_listxattr(in.get(), list, size, perms);
11348}
11349
11350int Client::llistxattr(const char *path, char *list, size_t size,
11351 const UserPerm& perms)
11352{
11fdf7f2 11353 std::lock_guard lock(client_lock);
181888fb
FG
11354
11355 if (unmounting)
11356 return -ENOTCONN;
11357
7c673cae
FG
11358 InodeRef in;
11359 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11360 if (r < 0)
11361 return r;
11362 return Client::_listxattr(in.get(), list, size, perms);
11363}
11364
11365int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11366{
11fdf7f2 11367 std::lock_guard lock(client_lock);
181888fb
FG
11368
11369 if (unmounting)
11370 return -ENOTCONN;
11371
7c673cae
FG
11372 Fh *f = get_filehandle(fd);
11373 if (!f)
11374 return -EBADF;
11375 return Client::_listxattr(f->inode.get(), list, size, perms);
11376}
11377
11378int Client::removexattr(const char *path, const char *name,
11379 const UserPerm& perms)
11380{
11fdf7f2 11381 std::lock_guard lock(client_lock);
181888fb
FG
11382
11383 if (unmounting)
11384 return -ENOTCONN;
11385
7c673cae
FG
11386 InodeRef in;
11387 int r = Client::path_walk(path, &in, perms, true);
11388 if (r < 0)
11389 return r;
11390 return _removexattr(in, name, perms);
11391}
11392
11393int Client::lremovexattr(const char *path, const char *name,
11394 const UserPerm& perms)
11395{
11fdf7f2 11396 std::lock_guard lock(client_lock);
181888fb
FG
11397
11398 if (unmounting)
11399 return -ENOTCONN;
11400
7c673cae
FG
11401 InodeRef in;
11402 int r = Client::path_walk(path, &in, perms, false);
11403 if (r < 0)
11404 return r;
11405 return _removexattr(in, name, perms);
11406}
11407
11408int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11409{
11fdf7f2 11410 std::lock_guard lock(client_lock);
181888fb
FG
11411
11412 if (unmounting)
11413 return -ENOTCONN;
11414
7c673cae
FG
11415 Fh *f = get_filehandle(fd);
11416 if (!f)
11417 return -EBADF;
11418 return _removexattr(f->inode, name, perms);
11419}
11420
11421int Client::setxattr(const char *path, const char *name, const void *value,
11422 size_t size, int flags, const UserPerm& perms)
11423{
11424 _setxattr_maybe_wait_for_osdmap(name, value, size);
11425
11fdf7f2 11426 std::lock_guard lock(client_lock);
181888fb
FG
11427
11428 if (unmounting)
11429 return -ENOTCONN;
11430
7c673cae
FG
11431 InodeRef in;
11432 int r = Client::path_walk(path, &in, perms, true);
11433 if (r < 0)
11434 return r;
11435 return _setxattr(in, name, value, size, flags, perms);
11436}
11437
11438int Client::lsetxattr(const char *path, const char *name, const void *value,
11439 size_t size, int flags, const UserPerm& perms)
11440{
11441 _setxattr_maybe_wait_for_osdmap(name, value, size);
11442
11fdf7f2 11443 std::lock_guard lock(client_lock);
181888fb
FG
11444
11445 if (unmounting)
11446 return -ENOTCONN;
11447
7c673cae
FG
11448 InodeRef in;
11449 int r = Client::path_walk(path, &in, perms, false);
11450 if (r < 0)
11451 return r;
11452 return _setxattr(in, name, value, size, flags, perms);
11453}
11454
11455int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11456 int flags, const UserPerm& perms)
11457{
11458 _setxattr_maybe_wait_for_osdmap(name, value, size);
11459
11fdf7f2 11460 std::lock_guard lock(client_lock);
181888fb
FG
11461
11462 if (unmounting)
11463 return -ENOTCONN;
11464
7c673cae
FG
11465 Fh *f = get_filehandle(fd);
11466 if (!f)
11467 return -EBADF;
11468 return _setxattr(f->inode, name, value, size, flags, perms);
11469}
11470
11471int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11472 const UserPerm& perms)
11473{
11474 int r;
11475
11476 const VXattr *vxattr = _match_vxattr(in, name);
11477 if (vxattr) {
11478 r = -ENODATA;
11479
11480 // Do a force getattr to get the latest quota before returning
11481 // a value to userspace.
28e407b8
AA
11482 int flags = 0;
11483 if (vxattr->flags & VXATTR_RSTAT) {
11484 flags |= CEPH_STAT_RSTAT;
11485 }
adb31ebb
TL
11486 if (vxattr->flags & VXATTR_DIRSTAT) {
11487 flags |= CEPH_CAP_FILE_SHARED;
11488 }
28e407b8 11489 r = _getattr(in, flags, perms, true);
7c673cae
FG
11490 if (r != 0) {
11491 // Error from getattr!
11492 return r;
11493 }
11494
11495 // call pointer-to-member function
11496 char buf[256];
11497 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11498 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11499 } else {
11500 r = -ENODATA;
11501 }
11502
11503 if (size != 0) {
11504 if (r > (int)size) {
11505 r = -ERANGE;
11506 } else if (r > 0) {
11507 memcpy(value, buf, r);
11508 }
11509 }
11510 goto out;
11511 }
11512
11513 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11514 r = -EOPNOTSUPP;
11515 goto out;
11516 }
11517
11518 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11519 if (r == 0) {
11520 string n(name);
11521 r = -ENODATA;
11522 if (in->xattrs.count(n)) {
11523 r = in->xattrs[n].length();
11524 if (r > 0 && size != 0) {
11525 if (size >= (unsigned)r)
11526 memcpy(value, in->xattrs[n].c_str(), r);
11527 else
11528 r = -ERANGE;
11529 }
11530 }
11531 }
11532 out:
1adf2230 11533 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11534 return r;
11535}
11536
11537int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11538 const UserPerm& perms)
11539{
11540 if (cct->_conf->client_permissions) {
11541 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11542 if (r < 0)
11543 return r;
11544 }
11545 return _getxattr(in.get(), name, value, size, perms);
11546}
11547
11548int Client::ll_getxattr(Inode *in, const char *name, void *value,
11549 size_t size, const UserPerm& perms)
11550{
11fdf7f2 11551 std::lock_guard lock(client_lock);
7c673cae 11552
181888fb
FG
11553 if (unmounting)
11554 return -ENOTCONN;
11555
7c673cae
FG
11556 vinodeno_t vino = _get_vino(in);
11557
11fdf7f2
TL
11558 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11559 tout(cct) << __func__ << std::endl;
7c673cae
FG
11560 tout(cct) << vino.ino.val << std::endl;
11561 tout(cct) << name << std::endl;
11562
11fdf7f2 11563 if (!fuse_default_permissions) {
7c673cae
FG
11564 int r = xattr_permission(in, name, MAY_READ, perms);
11565 if (r < 0)
11566 return r;
11567 }
11568
11569 return _getxattr(in, name, value, size, perms);
11570}
11571
11572int Client::_listxattr(Inode *in, char *name, size_t size,
11573 const UserPerm& perms)
11574{
81eedcae 11575 bool len_only = (size == 0);
7c673cae 11576 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
81eedcae
TL
11577 if (r != 0) {
11578 goto out;
11579 }
7c673cae 11580
81eedcae
TL
11581 r = 0;
11582 for (const auto& p : in->xattrs) {
11583 size_t this_len = p.first.length() + 1;
11584 r += this_len;
11585 if (len_only)
11586 continue;
7c673cae 11587
81eedcae
TL
11588 if (this_len > size) {
11589 r = -ERANGE;
11590 goto out;
11591 }
11592
11593 memcpy(name, p.first.c_str(), this_len);
11594 name += this_len;
11595 size -= this_len;
11596 }
81eedcae 11597out:
11fdf7f2 11598 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
11599 return r;
11600}
11601
11602int Client::ll_listxattr(Inode *in, char *names, size_t size,
11603 const UserPerm& perms)
11604{
11fdf7f2 11605 std::lock_guard lock(client_lock);
7c673cae 11606
181888fb
FG
11607 if (unmounting)
11608 return -ENOTCONN;
11609
7c673cae
FG
11610 vinodeno_t vino = _get_vino(in);
11611
11fdf7f2
TL
11612 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11613 tout(cct) << __func__ << std::endl;
7c673cae
FG
11614 tout(cct) << vino.ino.val << std::endl;
11615 tout(cct) << size << std::endl;
11616
11617 return _listxattr(in, names, size, perms);
11618}
11619
11620int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11621 size_t size, int flags, const UserPerm& perms)
11622{
11623
11624 int xattr_flags = 0;
11625 if (!value)
11626 xattr_flags |= CEPH_XATTR_REMOVE;
11627 if (flags & XATTR_CREATE)
11628 xattr_flags |= CEPH_XATTR_CREATE;
11629 if (flags & XATTR_REPLACE)
11630 xattr_flags |= CEPH_XATTR_REPLACE;
11631
11632 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11633 filepath path;
11634 in->make_nosnap_relative_path(path);
11635 req->set_filepath(path);
11636 req->set_string2(name);
11637 req->set_inode(in);
11638 req->head.args.setxattr.flags = xattr_flags;
11639
11640 bufferlist bl;
11fdf7f2 11641 assert (value || size == 0);
7c673cae
FG
11642 bl.append((const char*)value, size);
11643 req->set_data(bl);
11644
11645 int res = make_request(req, perms);
11646
11647 trim_cache();
11fdf7f2 11648 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11649 res << dendl;
11650 return res;
11651}
11652
// Validate and apply a setxattr on 'in'.
//
// Handles POSIX ACL xattrs (an access ACL equivalent to a plain mode
// is converted into a chmod and the xattr dropped), rejects
// unsupported namespaces and read-only vxattrs, and — for
// ceph.quota.* — verifies afterwards that a snaprealm rooted at this
// inode was actually created, since quotas depend on it.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // normalize: zero-length means "set to empty value", not "remove";
  // a NULL value with nonzero size is a caller bug
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // only the namespaces the kernel client also accepts
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// ret == 0: the ACL is fully representable as a mode; drop the
	// xattr payload and rely on the chmod below instead
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// default ACLs only make sense on directories
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // setting a quota requires a snaprealm rooted here; verify after
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11733
11734int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11735 size_t size, int flags, const UserPerm& perms)
11736{
11737 if (cct->_conf->client_permissions) {
11738 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11739 if (r < 0)
11740 return r;
11741 }
11742 return _setxattr(in.get(), name, value, size, flags, perms);
11743}
11744
// Validate the data pool referenced by a "layout"/"layout.pool" xattr
// value against 'osdmap'.  Returns 0 when the value parses and any
// pool it names exists, -EINVAL for a malformed layout string, and
// -ENOENT when the pool is unknown (the caller may then wait for a
// newer osdmap and retry).
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // full layout string: parse "key=value ..." and extract the
    // optional "pool" key
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // the pool may be given numerically (by id) or by name
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11784
// If the xattr being set names a data pool, make sure our osdmap is
// recent enough to know about it before sending the MDS request.
// This closes a race where a freshly created pool is not yet in either
// the client's or the MDS's osdmap.  Blocks waiting for the latest
// osdmap only when the pool is unknown; must be called WITHOUT
// client_lock held.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -ENOENT) {
      // unknown pool: fetch the newest osdmap and let the MDS decide
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11805
11806int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11807 size_t size, int flags, const UserPerm& perms)
11808{
11809 _setxattr_maybe_wait_for_osdmap(name, value, size);
11810
11fdf7f2 11811 std::lock_guard lock(client_lock);
7c673cae 11812
181888fb
FG
11813 if (unmounting)
11814 return -ENOTCONN;
11815
7c673cae
FG
11816 vinodeno_t vino = _get_vino(in);
11817
11fdf7f2
TL
11818 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11819 tout(cct) << __func__ << std::endl;
7c673cae
FG
11820 tout(cct) << vino.ino.val << std::endl;
11821 tout(cct) << name << std::endl;
11822
11fdf7f2 11823 if (!fuse_default_permissions) {
7c673cae
FG
11824 int r = xattr_permission(in, name, MAY_WRITE, perms);
11825 if (r < 0)
11826 return r;
11827 }
11828 return _setxattr(in, name, value, size, flags, perms);
11829}
11830
11831int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11832{
11833 if (in->snapid != CEPH_NOSNAP) {
11834 return -EROFS;
11835 }
11836
11837 // same xattrs supported by kernel client
11838 if (strncmp(name, "user.", 5) &&
11839 strncmp(name, "system.", 7) &&
11840 strncmp(name, "security.", 9) &&
11841 strncmp(name, "trusted.", 8) &&
11842 strncmp(name, "ceph.", 5))
11843 return -EOPNOTSUPP;
11844
11845 const VXattr *vxattr = _match_vxattr(in, name);
11846 if (vxattr && vxattr->readonly)
11847 return -EOPNOTSUPP;
11848
11849 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11850 filepath path;
11851 in->make_nosnap_relative_path(path);
11852 req->set_filepath(path);
11853 req->set_filepath2(name);
11854 req->set_inode(in);
11855
11856 int res = make_request(req, perms);
11857
11858 trim_cache();
1adf2230 11859 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11860 return res;
11861}
11862
11863int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11864{
11865 if (cct->_conf->client_permissions) {
11866 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11867 if (r < 0)
11868 return r;
11869 }
11870 return _removexattr(in.get(), name, perms);
11871}
11872
11873int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11874{
11fdf7f2 11875 std::lock_guard lock(client_lock);
7c673cae 11876
181888fb
FG
11877 if (unmounting)
11878 return -ENOTCONN;
11879
7c673cae
FG
11880 vinodeno_t vino = _get_vino(in);
11881
11882 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11883 tout(cct) << "ll_removexattr" << std::endl;
11884 tout(cct) << vino.ino.val << std::endl;
11885 tout(cct) << name << std::endl;
11886
11fdf7f2 11887 if (!fuse_default_permissions) {
7c673cae
FG
11888 int r = xattr_permission(in, name, MAY_WRITE, perms);
11889 if (r < 0)
11890 return r;
11891 }
11892
11893 return _removexattr(in, name, perms);
11894}
11895
11896bool Client::_vxattrcb_quota_exists(Inode *in)
11897{
11fdf7f2 11898 return in->quota.is_enable() &&
f6b5b4d7
TL
11899 (in->snapid != CEPH_NOSNAP ||
11900 (in->snaprealm && in->snaprealm->ino == in->ino));
7c673cae
FG
11901}
11902size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11903{
11904 return snprintf(val, size,
11905 "max_bytes=%lld max_files=%lld",
11906 (long long int)in->quota.max_bytes,
11907 (long long int)in->quota.max_files);
11908}
11909size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11910{
11911 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11912}
11913size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11914{
11915 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11916}
11917
11918bool Client::_vxattrcb_layout_exists(Inode *in)
11919{
11920 return in->layout != file_layout_t();
11921}
11922size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11923{
11924 int r = snprintf(val, size,
11fdf7f2 11925 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11926 (unsigned long long)in->layout.stripe_unit,
11927 (unsigned long long)in->layout.stripe_count,
11928 (unsigned long long)in->layout.object_size);
11929 objecter->with_osdmap([&](const OSDMap& o) {
11930 if (o.have_pg_pool(in->layout.pool_id))
11931 r += snprintf(val + r, size - r, "%s",
11932 o.get_pool_name(in->layout.pool_id).c_str());
11933 else
11934 r += snprintf(val + r, size - r, "%" PRIu64,
11935 (uint64_t)in->layout.pool_id);
11936 });
11937 if (in->layout.pool_ns.length())
11938 r += snprintf(val + r, size - r, " pool_namespace=%s",
11939 in->layout.pool_ns.c_str());
11940 return r;
11941}
// Individual layout-field printers ("ceph.<type>.layout.<field>").
// Each returns the snprintf()-style required length.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
11954size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11955{
11956 size_t r;
11957 objecter->with_osdmap([&](const OSDMap& o) {
11958 if (o.have_pg_pool(in->layout.pool_id))
11959 r = snprintf(val, size, "%s", o.get_pool_name(
11960 in->layout.pool_id).c_str());
11961 else
11962 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11963 });
11964 return r;
11965}
// "ceph.<type>.layout.pool_namespace"
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics vxattrs.  dirstat counts the immediate
// children; the rstat values are recursive over the subtree.  All
// return the snprintf()-style required length.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// Recursive ctime formatted "sec.nsec" with nanoseconds zero-padded to
// nine digits.
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
      (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
// "ceph.dir.pin": the export pin is reported only when set; -ENODATA
// serves as the "no pin configured" sentinel value.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
7c673cae 12011
81eedcae
TL
// "ceph.snap.btime": snapshot birth time, present only when nonzero.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

// Formatted as "sec.nsec" with nanoseconds zero-padded to nine digits.
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
      (long long unsigned)in->snap_btime.sec(),
      (long unsigned)in->snap_btime.nsec());
}
12023
adb31ebb
TL
// "ceph.cluster_fsid": fsid of the cluster we are connected to, as
// reported by the monitor client.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// "ceph.client_id": this client's entity name (type string followed by
// its numeric id).
// NOTE(review): "%ld" assumes name.num() fits a long — verify on LLP64
// platforms where long is 32 bits.
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%ld", name.type_str(), name.num());
}
12034
7c673cae
FG
// Build vxattr name string literals: "ceph.<type>.<name>[.<name2>]".
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Table-entry helpers for the VXattr arrays below.  These use the
// GNU-extension "field:" designated-initializer syntax already used
// throughout this file.
//
// XATTR_NAME_CEPH: read-only, always-present vxattr whose getter is
// _vxattrcb_<type>_<name>; _flags carries the VXATTR_* flag bits.
#define XATTR_NAME_CEPH(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// XATTR_LAYOUT_FIELD: writable layout sub-field, present only when the
// inode has a non-default layout (see _vxattrcb_layout_exists).
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// XATTR_QUOTA_FIELD: writable quota sub-field, present only when a
// quota is configured (see _vxattrcb_quota_exists).
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
12062
// Virtual xattrs exposed on directory inodes: layout, dirstat/rstat
// counters, quota, export pin and snapshot birth time.  The table is
// scanned linearly by _match_vxattr() and ends with an empty-name
// terminator entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12109
// Virtual xattrs exposed on regular-file inodes: the file layout and
// its sub-fields, plus snapshot birth time.  Terminated by an
// empty-name entry, like _dir_vxattrs.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12132
adb31ebb
TL
// Virtual xattrs available on every inode regardless of type; checked
// by _match_vxattr() after the type-specific table.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12150
7c673cae
FG
12151const Client::VXattr *Client::_get_vxattrs(Inode *in)
12152{
12153 if (in->is_dir())
12154 return _dir_vxattrs;
12155 else if (in->is_file())
12156 return _file_vxattrs;
12157 return NULL;
12158}
12159
12160const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12161{
12162 if (strncmp(name, "ceph.", 5) == 0) {
12163 const VXattr *vxattr = _get_vxattrs(in);
12164 if (vxattr) {
12165 while (!vxattr->name.empty()) {
12166 if (vxattr->name == name)
12167 return vxattr;
12168 vxattr++;
12169 }
12170 }
adb31ebb
TL
12171
12172 // for common vxattrs
12173 vxattr = _common_vxattrs;
12174 while (!vxattr->name.empty()) {
12175 if (vxattr->name == name)
12176 return vxattr;
12177 vxattr++;
12178 }
7c673cae 12179 }
adb31ebb 12180
7c673cae
FG
12181 return NULL;
12182}
12183
7c673cae
FG
/*
 * Low-level readlink(2).  Touches every dentry linking to this inode
 * so they stay warm in the LRU, then copies the link target into
 * 'buf' via _readlink().
 */
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // refresh all dentries referring to this symlink in the lru
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
12205
/*
 * Create a filesystem node (device, fifo, socket, ...) named 'name'
 * under 'dir' by sending a CEPH_MDS_OP_MKNOD request.
 *
 * Returns 0 and fills *inp with the new inode on success;
 * -ENAMETOOLONG / -EROFS (snapshot) / -EDQUOT (quota) or the request
 * error otherwise.  The MetaRequest owns itself on the success path;
 * the fail label drops it when setup fails before dispatch.
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // default ACLs on the parent may modify 'mode' and add xattrs that
  // travel with the request as payload
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12259
/*
 * Low-level mknod(2).  Performs the optional client-side may_create()
 * check, creates the node via _mknod(), and on success fills 'attr'
 * and hands the caller a counted ll reference in *out (*out is NULL on
 * failure).
 */
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // caller owns an ll reference on success
    _ll_get(in.get());
  }
  // NOTE(review): on failure 'attr' is left untouched, so the trace
  // below logs whatever the caller passed in — confirm callers
  // zero-initialize the struct.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

/*
 * statx-flavored variant of ll_mknod(): identical flow, but fills a
 * ceph_statx restricted to the caps implied by (want, flags).
 */
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    // caller owns an ll reference on success
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12335
/*
 * Create and (optionally) open a regular file via CEPH_MDS_OP_CREATE.
 *
 * The layout knobs (stripe_unit/count, object_size, data_pool) are
 * forwarded in the open args; 'created' reports whether the MDS
 * actually created the file (vs. it already existing).  When 'fhp' is
 * non-NULL an open file handle is produced as well.  Note the two
 * distinct error paths: 'fail' drops the request before dispatch,
 * 'reply_error' runs after dispatch and only skips the open.
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve the requested data pool name to an id, if one was given
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // default ACLs on the parent may modify 'mode' and contribute xattrs
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12430
12431
/*
 * Create a directory (or, inside the special snapdir, a snapshot):
 * when 'dir' is the CEPH_SNAPDIR pseudo-directory this issues
 * CEPH_MDS_OP_MKSNAP instead of CEPH_MDS_OP_MKDIR.  On success *inp
 * holds the new inode.
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // only live dirs and the snapdir itself are writable
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // default ACLs on the parent may modify 'mode' and contribute xattrs
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12487
/*
 * Low-level mkdir(2).  Optional may_create() check, then _mkdir(); on
 * success fills 'attr' and hands the caller a counted ll reference in
 * *out (*out is NULL on failure).
 */
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // caller owns an ll reference on success
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

/*
 * statx-flavored variant of ll_mkdir().  Unlike ll_mkdir(), it zeroes
 * stx_ino/stx_mask on failure so the caller never sees stale data.
 */
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    // caller owns an ll reference on success
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12561
/*
 * Create a symbolic link 'name' -> 'target' under 'dir' via
 * CEPH_MDS_OP_SYMLINK.  On success *inp holds the new inode.  The
 * fail label drops the request if setup fails before dispatch.
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // the link target travels in the request's second string
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12607
/*
 * Low-level symlink(2).  Optional may_create() check, then _symlink();
 * on success fills 'attr' and hands the caller a counted ll reference
 * in *out (*out is NULL on failure).
 */
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // caller owns an ll reference on success
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

/*
 * statx-flavored variant of ll_symlink(): identical flow, but fills a
 * ceph_statx restricted to the caps implied by (want, flags).
 */
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    // caller owns an ll reference on success
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12680
/*
 * Unlink 'name' from 'dir' via CEPH_MDS_OP_UNLINK.
 *
 * Looks up the target inode first so its file delegations can be
 * broken and so the MDS can be told which caps to drop on it.  The
 * fail label drops the request if setup fails before dispatch.
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being removed so delegations on it can be recalled
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12730
/*
 * Low-level unlink(2).  Optional client-side may_delete() check, then
 * _unlink() on the parent directory inode.
 */
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
12752
/*
 * Remove a directory — or, when 'dir' is the CEPH_SNAPDIR
 * pseudo-directory, remove a snapshot (CEPH_MDS_OP_RMSNAP instead of
 * CEPH_MDS_OP_RMDIR).
 *
 * For RMSNAP the dentry is unlinked from the cache immediately since
 * the reply carries no trace dentry; for RMDIR the dentry is attached
 * to the request, which then owns its reference.
 */
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // only live dirs and the snapdir itself are writable
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  // RMDIR: the request takes the dentry ref; RMSNAP: hold a temporary
  // ref so we can unlink it from the cache below
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no tracedn; invalidate the dentry ourselves
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12805
/*
 * Low-level rmdir(2).  Optional client-side may_delete() check, then
 * _rmdir() on the parent directory inode.
 */
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
12828
12829int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12830{
1adf2230 12831 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12832 << todir->ino << " " << toname
12833 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12834 << dendl;
12835
12836 if (fromdir->snapid != todir->snapid)
12837 return -EXDEV;
12838
12839 int op = CEPH_MDS_OP_RENAME;
12840 if (fromdir->snapid != CEPH_NOSNAP) {
12841 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12842 op = CEPH_MDS_OP_RENAMESNAP;
12843 else
12844 return -EROFS;
12845 }
7c673cae
FG
12846
12847 InodeRef target;
12848 MetaRequest *req = new MetaRequest(op);
12849
12850 filepath from;
12851 fromdir->make_nosnap_relative_path(from);
12852 from.push_dentry(fromname);
12853 filepath to;
12854 todir->make_nosnap_relative_path(to);
12855 to.push_dentry(toname);
12856 req->set_filepath(to);
12857 req->set_filepath2(from);
12858
12859 Dentry *oldde;
12860 int res = get_or_create(fromdir, fromname, &oldde);
12861 if (res < 0)
12862 goto fail;
12863 Dentry *de;
12864 res = get_or_create(todir, toname, &de);
12865 if (res < 0)
12866 goto fail;
12867
12868 if (op == CEPH_MDS_OP_RENAME) {
12869 req->set_old_dentry(oldde);
12870 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12871 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12872
12873 req->set_dentry(de);
12874 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12875 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12876
12877 InodeRef oldin, otherin;
9f95a23c
TL
12878 Inode *fromdir_root = nullptr;
12879 Inode *todir_root = nullptr;
12880 int mask = 0;
12881 bool quota_check = false;
12882 if (fromdir != todir) {
12883 fromdir_root =
12884 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12885 todir_root =
12886 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12887
12888 if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
12889 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12890 // to auth MDS to get latest rstat for todir_root and source dir
12891 // even if their dentry caches and inode caps are satisfied.
12892 res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
12893 if (res < 0)
12894 goto fail;
12895
12896 quota_check = true;
12897 if (oldde->inode && oldde->inode->is_dir()) {
12898 mask |= CEPH_STAT_RSTAT;
12899 }
12900 }
12901 }
12902
12903 res = _lookup(fromdir, fromname, mask, &oldin, perm);
7c673cae
FG
12904 if (res < 0)
12905 goto fail;
b32b8144
FG
12906
12907 Inode *oldinode = oldin.get();
12908 oldinode->break_all_delegs();
12909 req->set_old_inode(oldinode);
7c673cae
FG
12910 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12911
9f95a23c
TL
12912 if (quota_check) {
12913 int64_t old_bytes, old_files;
12914 if (oldinode->is_dir()) {
12915 old_bytes = oldinode->rstat.rbytes;
12916 old_files = oldinode->rstat.rsize();
12917 } else {
12918 old_bytes = oldinode->size;
12919 old_files = 1;
12920 }
12921
12922 bool quota_exceed = false;
12923 if (todir_root && todir_root->quota.max_bytes &&
12924 (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
12925 ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
12926 << old_bytes << ") to (" << todir->ino
12927 << ") will exceed quota on " << *todir_root << dendl;
12928 quota_exceed = true;
12929 }
12930
12931 if (todir_root && todir_root->quota.max_files &&
12932 (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
12933 ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
12934 << old_files << ") to (" << todir->ino
12935 << ") will exceed quota on " << *todir_root << dendl;
12936 quota_exceed = true;
12937 }
12938
12939 if (quota_exceed) {
12940 res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
12941 goto fail;
12942 }
12943 }
12944
7c673cae 12945 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12946 switch (res) {
12947 case 0:
12948 {
12949 Inode *in = otherin.get();
12950 req->set_other_inode(in);
12951 in->break_all_delegs();
12952 }
7c673cae 12953 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12954 break;
12955 case -ENOENT:
12956 break;
12957 default:
12958 goto fail;
7c673cae
FG
12959 }
12960
12961 req->set_inode(todir);
12962 } else {
12963 // renamesnap reply contains no tracedn, so we need to invalidate
12964 // dentry manually
12965 unlink(oldde, true, true);
12966 unlink(de, true, true);
11fdf7f2
TL
12967
12968 req->set_inode(todir);
7c673cae
FG
12969 }
12970
12971 res = make_request(req, perm, &target);
12972 ldout(cct, 10) << "rename result is " << res << dendl;
12973
12974 // renamed item from our cache
12975
12976 trim_cache();
1adf2230 12977 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12978 return res;
12979
12980 fail:
12981 put_request(req);
12982 return res;
12983}
12984
12985int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12986 const char *newname, const UserPerm& perm)
12987{
11fdf7f2 12988 std::lock_guard lock(client_lock);
7c673cae 12989
181888fb
FG
12990 if (unmounting)
12991 return -ENOTCONN;
12992
7c673cae
FG
12993 vinodeno_t vparent = _get_vino(parent);
12994 vinodeno_t vnewparent = _get_vino(newparent);
12995
12996 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12997 << vnewparent << " " << newname << dendl;
12998 tout(cct) << "ll_rename" << std::endl;
12999 tout(cct) << vparent.ino.val << std::endl;
13000 tout(cct) << name << std::endl;
13001 tout(cct) << vnewparent.ino.val << std::endl;
13002 tout(cct) << newname << std::endl;
13003
11fdf7f2 13004 if (!fuse_default_permissions) {
7c673cae
FG
13005 int r = may_delete(parent, name, perm);
13006 if (r < 0)
13007 return r;
13008 r = may_delete(newparent, newname, perm);
13009 if (r < 0 && r != -ENOENT)
13010 return r;
13011 }
13012
13013 return _rename(parent, name, newparent, newname, perm);
13014}
13015
/**
 * Create a hard link to inode @in named @newname inside directory @dir.
 *
 * Builds a CEPH_MDS_OP_LINK MetaRequest and sends it to the MDS.  On
 * success @inp (if the request machinery fills it) refers to the linked
 * inode.  Returns 0 or a negative errno.
 *
 * Caller must hold client_lock.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only: neither source nor target dir may be a snap.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Recall any read/write delegations on the inode before linking it.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  // Drop shared caps on the dir (unless exclusively held) so other clients
  // revalidate the directory contents.
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release our request reference.
  put_request(req);
  return res;
}
13060
13061int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13062 const UserPerm& perm)
13063{
11fdf7f2 13064 std::lock_guard lock(client_lock);
7c673cae 13065
181888fb
FG
13066 if (unmounting)
13067 return -ENOTCONN;
13068
7c673cae
FG
13069 vinodeno_t vino = _get_vino(in);
13070 vinodeno_t vnewparent = _get_vino(newparent);
13071
31f18b77 13072 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
13073 newname << dendl;
13074 tout(cct) << "ll_link" << std::endl;
13075 tout(cct) << vino.ino.val << std::endl;
13076 tout(cct) << vnewparent << std::endl;
13077 tout(cct) << newname << std::endl;
13078
7c673cae
FG
13079 InodeRef target;
13080
11fdf7f2 13081 if (!fuse_default_permissions) {
7c673cae
FG
13082 if (S_ISDIR(in->mode))
13083 return -EPERM;
13084
11fdf7f2 13085 int r = may_hardlink(in, perm);
7c673cae
FG
13086 if (r < 0)
13087 return r;
13088
13089 r = may_create(newparent, perm);
13090 if (r < 0)
13091 return r;
13092 }
13093
13094 return _link(in, newparent, newname, perm, &target);
13095}
13096
13097int Client::ll_num_osds(void)
13098{
11fdf7f2 13099 std::lock_guard lock(client_lock);
7c673cae
FG
13100 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13101}
13102
13103int Client::ll_osdaddr(int osd, uint32_t *addr)
13104{
11fdf7f2 13105 std::lock_guard lock(client_lock);
181888fb 13106
7c673cae
FG
13107 entity_addr_t g;
13108 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13109 if (!o.exists(osd))
13110 return false;
11fdf7f2 13111 g = o.get_addrs(osd).front();
7c673cae
FG
13112 return true;
13113 });
13114 if (!exists)
13115 return -1;
13116 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13117 *addr = ntohl(nb_addr);
13118 return 0;
13119}
181888fb 13120
7c673cae
FG
13121uint32_t Client::ll_stripe_unit(Inode *in)
13122{
11fdf7f2 13123 std::lock_guard lock(client_lock);
7c673cae
FG
13124 return in->layout.stripe_unit;
13125}
13126
13127uint64_t Client::ll_snap_seq(Inode *in)
13128{
11fdf7f2 13129 std::lock_guard lock(client_lock);
7c673cae
FG
13130 return in->snaprealm->seq;
13131}
13132
13133int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13134{
11fdf7f2 13135 std::lock_guard lock(client_lock);
7c673cae
FG
13136 *layout = in->layout;
13137 return 0;
13138}
13139
13140int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13141{
13142 return ll_file_layout(fh->inode.get(), layout);
13143}
13144
13145/* Currently we cannot take advantage of redundancy in reads, since we
13146 would have to go through all possible placement groups (a
13147 potentially quite large number determined by a hash), and use CRUSH
13148 to calculate the appropriate set of OSDs for each placement group,
13149 then index into that. An array with one entry per OSD is much more
13150 tractable and works for demonstration purposes. */
13151
13152int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13153 file_layout_t* layout)
13154{
11fdf7f2 13155 std::lock_guard lock(client_lock);
181888fb 13156
28e407b8 13157 inodeno_t ino = in->ino;
7c673cae
FG
13158 uint32_t object_size = layout->object_size;
13159 uint32_t su = layout->stripe_unit;
13160 uint32_t stripe_count = layout->stripe_count;
13161 uint64_t stripes_per_object = object_size / su;
11fdf7f2 13162 uint64_t stripeno = 0, stripepos = 0;
7c673cae 13163
11fdf7f2
TL
13164 if(stripe_count) {
13165 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13166 stripepos = blockno % stripe_count; // which object in the object set (X)
13167 }
7c673cae
FG
13168 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13169 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13170
13171 object_t oid = file_object_t(ino, objectno);
13172 return objecter->with_osdmap([&](const OSDMap& o) {
13173 ceph_object_layout olayout =
13174 o.file_to_object_layout(oid, *layout);
13175 pg_t pg = (pg_t)olayout.ol_pgid;
13176 vector<int> osds;
13177 int primary;
13178 o.pg_to_acting_osds(pg, &osds, &primary);
13179 return primary;
13180 });
13181}
13182
13183/* Return the offset of the block, internal to the object */
13184
13185uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13186{
11fdf7f2 13187 std::lock_guard lock(client_lock);
7c673cae
FG
13188 file_layout_t *layout=&(in->layout);
13189 uint32_t object_size = layout->object_size;
13190 uint32_t su = layout->stripe_unit;
13191 uint64_t stripes_per_object = object_size / su;
13192
13193 return (blockno % stripes_per_object) * su;
13194}
13195
13196int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13197 const UserPerm& perms)
13198{
11fdf7f2 13199 std::lock_guard lock(client_lock);
7c673cae 13200
181888fb
FG
13201 if (unmounting)
13202 return -ENOTCONN;
13203
7c673cae
FG
13204 vinodeno_t vino = _get_vino(in);
13205
13206 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13207 tout(cct) << "ll_opendir" << std::endl;
13208 tout(cct) << vino.ino.val << std::endl;
13209
11fdf7f2 13210 if (!fuse_default_permissions) {
7c673cae
FG
13211 int r = may_open(in, flags, perms);
13212 if (r < 0)
13213 return r;
13214 }
13215
13216 int r = _opendir(in, dirpp, perms);
13217 tout(cct) << (unsigned long)*dirpp << std::endl;
13218
13219 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13220 << dendl;
13221 return r;
13222}
13223
13224int Client::ll_releasedir(dir_result_t *dirp)
13225{
11fdf7f2 13226 std::lock_guard lock(client_lock);
7c673cae
FG
13227 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13228 tout(cct) << "ll_releasedir" << std::endl;
13229 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
13230
13231 if (unmounting)
13232 return -ENOTCONN;
13233
7c673cae
FG
13234 _closedir(dirp);
13235 return 0;
13236}
13237
13238int Client::ll_fsyncdir(dir_result_t *dirp)
13239{
11fdf7f2 13240 std::lock_guard lock(client_lock);
7c673cae
FG
13241 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13242 tout(cct) << "ll_fsyncdir" << std::endl;
13243 tout(cct) << (unsigned long)dirp << std::endl;
13244
181888fb
FG
13245 if (unmounting)
13246 return -ENOTCONN;
13247
7c673cae
FG
13248 return _fsync(dirp->inode.get(), false);
13249}
13250
/**
 * Low-level open of an existing inode.  O_CREAT is not supported here
 * (use ll_create/ll_createx); callers must strip it first.
 *
 * On success *fhp (if fhp != NULL) holds the new file handle, which is
 * also tracked in ll_unclosed_fh_set until ll_release().
 */
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Even on a permission failure we fall through here so the trace and
  // log lines are always emitted; fhptr is only set on a successful open.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    // Remember the handle so unmount can detect/close leaked fhs.
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13286
/**
 * Shared implementation behind ll_create/ll_createx.
 *
 * Looks up @name under @parent; honours O_CREAT/O_EXCL semantics:
 *  - exists + O_CREAT|O_EXCL      -> -EEXIST
 *  - missing + O_CREAT            -> create (and usually open) it
 *  - exists (no O_EXCL)           -> open the existing inode
 *
 * On success *in refers to the inode and *fhp to an open handle (tracked
 * in ll_unclosed_fh_set).  Returns 0 or negative errno.
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also open the file, in which case it fills *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Existing file: re-check open permission, then open it ourselves
    // if _create did not already hand us a handle.
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  if (*fhp) {
    // Track the handle so unmount can detect/close leaked fhs.
    ll_unclosed_fh_set.insert(*fhp);
  }

  // Resolve the inode number to report in the trace (faked or real).
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13368
13369int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13370 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13371 const UserPerm& perms)
13372{
11fdf7f2 13373 std::lock_guard lock(client_lock);
7c673cae
FG
13374 InodeRef in;
13375
181888fb
FG
13376 if (unmounting)
13377 return -ENOTCONN;
13378
7c673cae
FG
13379 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13380 fhp, perms);
13381 if (r >= 0) {
11fdf7f2 13382 ceph_assert(in);
7c673cae
FG
13383
13384 // passing an Inode in outp requires an additional ref
13385 if (outp) {
13386 _ll_get(in.get());
13387 *outp = in.get();
13388 }
13389 fill_stat(in, attr);
13390 } else {
13391 attr->st_ino = 0;
13392 }
13393
13394 return r;
13395}
13396
13397int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13398 int oflags, Inode **outp, Fh **fhp,
13399 struct ceph_statx *stx, unsigned want, unsigned lflags,
13400 const UserPerm& perms)
13401{
13402 unsigned caps = statx_to_mask(lflags, want);
11fdf7f2 13403 std::lock_guard lock(client_lock);
7c673cae
FG
13404 InodeRef in;
13405
181888fb
FG
13406 if (unmounting)
13407 return -ENOTCONN;
7c673cae
FG
13408
13409 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13410 if (r >= 0) {
11fdf7f2 13411 ceph_assert(in);
7c673cae
FG
13412
13413 // passing an Inode in outp requires an additional ref
13414 if (outp) {
13415 _ll_get(in.get());
13416 *outp = in.get();
13417 }
13418 fill_statx(in, caps, stx);
13419 } else {
13420 stx->stx_ino = 0;
13421 stx->stx_mask = 0;
13422 }
13423
13424 return r;
13425}
13426
13427loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13428{
11fdf7f2 13429 std::lock_guard lock(client_lock);
7c673cae
FG
13430 tout(cct) << "ll_lseek" << std::endl;
13431 tout(cct) << offset << std::endl;
13432 tout(cct) << whence << std::endl;
13433
181888fb
FG
13434 if (unmounting)
13435 return -ENOTCONN;
13436
7c673cae
FG
13437 return _lseek(fh, offset, whence);
13438}
13439
/**
 * Low-level read of @len bytes at @off from open handle @fh into @bl.
 * Returns the number of bytes read or a negative errno.
 */
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return more bytes than INT_MAX in the int result, so clamp
   * len to that. */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
13459
/**
 * Read a single RADOS object (block @blockid of @in's object space)
 * directly through the objecter, bypassing the page cache and caps.
 *
 * Copies the data into @buf and returns the number of bytes read, or a
 * negative errno.  @buf must be at least @length bytes.
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock while waiting for the OSD reply; the completion
  // does not touch client state.
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13496
13497/* It appears that the OSD doesn't return success unless the entire
13498 buffer was written, return the write length on success. */
13499
/**
 * Write a single RADOS object (block @blockid of @in's object space)
 * directly through the objecter, bypassing the page cache and caps.
 *
 * Returns @length on success (the OSD only reports success for a full
 * write — see the comment above), or a negative errno.
 *
 * NOTE(review): the `true || sync` below means the write is ALWAYS
 * treated as stable/synchronous; the @sync parameter is effectively
 * ignored.  Left as-is to preserve behavior.
 */
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  // (length > 0 always holds here given the early return above.)
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // Write against the caller-provided snapshot sequence, not the inode's
  // actual snap context.
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.lock();
  if (unmounting) {
    client_lock.unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Drop the lock before blocking on the OSD commit.
  client_lock.unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
13557
/**
 * Commit previously written blocks in [offset, offset+length).
 *
 * NOTE(review): the barrier-based implementation is commented out, so
 * this is currently a no-op that always returns 0 — confirm callers do
 * not rely on actual commit semantics.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13583
13584int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13585{
11fdf7f2 13586 std::lock_guard lock(client_lock);
7c673cae
FG
13587 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13588 "~" << len << dendl;
13589 tout(cct) << "ll_write" << std::endl;
13590 tout(cct) << (unsigned long)fh << std::endl;
13591 tout(cct) << off << std::endl;
13592 tout(cct) << len << std::endl;
13593
181888fb
FG
13594 if (unmounting)
13595 return -ENOTCONN;
13596
11fdf7f2
TL
13597 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13598 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13599 int r = _write(fh, off, len, data, NULL, 0);
13600 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13601 << dendl;
13602 return r;
13603}
13604
11fdf7f2
TL
13605int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13606{
13607 std::lock_guard lock(client_lock);
13608 if (unmounting)
13609 return -ENOTCONN;
13610 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13611}
13612
13613int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13614{
13615 std::lock_guard lock(client_lock);
13616 if (unmounting)
13617 return -ENOTCONN;
13618 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13619}
13620
7c673cae
FG
13621int Client::ll_flush(Fh *fh)
13622{
11fdf7f2 13623 std::lock_guard lock(client_lock);
7c673cae
FG
13624 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13625 tout(cct) << "ll_flush" << std::endl;
13626 tout(cct) << (unsigned long)fh << std::endl;
13627
181888fb
FG
13628 if (unmounting)
13629 return -ENOTCONN;
13630
7c673cae
FG
13631 return _flush(fh);
13632}
13633
13634int Client::ll_fsync(Fh *fh, bool syncdataonly)
13635{
11fdf7f2 13636 std::lock_guard lock(client_lock);
7c673cae
FG
13637 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13638 tout(cct) << "ll_fsync" << std::endl;
13639 tout(cct) << (unsigned long)fh << std::endl;
13640
181888fb
FG
13641 if (unmounting)
13642 return -ENOTCONN;
13643
7c673cae
FG
13644 int r = _fsync(fh, syncdataonly);
13645 if (r) {
13646 // If we're returning an error, clear it from the FH
13647 fh->take_async_err();
13648 }
13649 return r;
13650}
13651
28e407b8
AA
13652int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13653{
11fdf7f2 13654 std::lock_guard lock(client_lock);
28e407b8
AA
13655 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13656 tout(cct) << "ll_sync_inode" << std::endl;
13657 tout(cct) << (unsigned long)in << std::endl;
13658
13659 if (unmounting)
13660 return -ENOTCONN;
13661
13662 return _fsync(in, syncdataonly);
13663}
13664
7c673cae
FG
13665#ifdef FALLOC_FL_PUNCH_HOLE
13666
/**
 * fallocate(2) implementation.  Supports only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE (and punching requires KEEP_SIZE, as on Linux).
 *
 * Punching a hole either rewrites the inline data in place (if we hold
 * buffer caps) or un-inlines asynchronously and zeroes the range via the
 * filer.  Plain allocation only ever extends in->size.
 *
 * Caller must hold client_lock; it is dropped/reacquired around blocking
 * OSD waits.  Returns 0 or a negative errno.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching without KEEP_SIZE is not supported (matches Linux semantics).
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating on a full pool would block forever; punching frees space
  // and is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Growing the file must respect quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and we hold buffer caps: splice zeros into the inline
      // blob locally (prefix + zero-filled hole + suffix).
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len) {
          inline_iter += size;
	  inline_iter.copy(len - offset - size, bl);
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // No buffer caps (or data already un-inlined): kick off async
      // un-inline if needed, then zero the object range via the filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop the client lock while the zeroing write commits.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate: just extend the size; space is materialized lazily.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // Reap the async un-inline, if one was started above.
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means someone else un-inlined first; treat as success.
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13787#else
13788
// Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13793
13794#endif
13795
13796
11fdf7f2 13797int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 13798{
11fdf7f2
TL
13799 std::lock_guard lock(client_lock);
13800 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13801 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
7c673cae
FG
13802 tout(cct) << (unsigned long)fh << std::endl;
13803
181888fb
FG
13804 if (unmounting)
13805 return -ENOTCONN;
13806
7c673cae
FG
13807 return _fallocate(fh, mode, offset, length);
13808}
13809
/**
 * fd-based fallocate entry point: resolve the fd to an Fh and delegate
 * to _fallocate().  O_PATH descriptors cannot be written through.
 *
 * NOTE(review): the trace line concatenates fd and mode with no
 * separator ("<< fd << mode") — confirm against trace-replay tooling
 * before changing the format.
 */
int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  return _fallocate(fh, mode, offset, length);
}
13827
13828int Client::ll_release(Fh *fh)
13829{
11fdf7f2 13830 std::lock_guard lock(client_lock);
91327a77
AA
13831
13832 if (unmounting)
13833 return -ENOTCONN;
13834
11fdf7f2 13835 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 13836 dendl;
11fdf7f2 13837 tout(cct) << __func__ << " (fh)" << std::endl;
7c673cae
FG
13838 tout(cct) << (unsigned long)fh << std::endl;
13839
13840 if (ll_unclosed_fh_set.count(fh))
13841 ll_unclosed_fh_set.erase(fh);
13842 return _release_fh(fh);
13843}
13844
13845int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13846{
11fdf7f2 13847 std::lock_guard lock(client_lock);
7c673cae
FG
13848
13849 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13850 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13851
181888fb
FG
13852 if (unmounting)
13853 return -ENOTCONN;
13854
7c673cae
FG
13855 return _getlk(fh, fl, owner);
13856}
13857
13858int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13859{
11fdf7f2 13860 std::lock_guard lock(client_lock);
7c673cae 13861
11fdf7f2
TL
13862 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13863 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13864
181888fb
FG
13865 if (unmounting)
13866 return -ENOTCONN;
13867
7c673cae
FG
13868 return _setlk(fh, fl, owner, sleep);
13869}
13870
13871int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13872{
11fdf7f2 13873 std::lock_guard lock(client_lock);
7c673cae 13874
11fdf7f2
TL
13875 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13876 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13877
181888fb
FG
13878 if (unmounting)
13879 return -ENOTCONN;
13880
7c673cae
FG
13881 return _flock(fh, cmd, owner);
13882}
13883
b32b8144
FG
13884int Client::set_deleg_timeout(uint32_t timeout)
13885{
11fdf7f2 13886 std::lock_guard lock(client_lock);
b32b8144
FG
13887
13888 /*
13889 * The whole point is to prevent blacklisting so we must time out the
13890 * delegation before the session autoclose timeout kicks in.
13891 */
13892 if (timeout >= mdsmap->get_session_autoclose())
13893 return -EINVAL;
13894
13895 deleg_timeout = timeout;
13896 return 0;
13897}
13898
/**
 * Acquire or drop a file delegation on @fh.
 *
 * @cmd CEPH_DELEGATION_NONE drops any delegation held via this fh; any
 *      other value is forwarded to Inode::set_deleg (which validates it).
 * @cb/@priv recall callback and its opaque argument.
 *
 * Returns 0, -ENOTCONN if not mounted, -ENOMEM on allocation failure,
 * or whatever set_deleg returns (default -EINVAL).
 */
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg allocates; map allocation failure to an errno.
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13925
7c673cae
FG
/**
 * Finisher context that interrupts an in-flight SETFILELOCK request.
 *
 * The constructor takes a reference on the request so it stays alive
 * until finish() runs on the finisher thread; finish() drops it after
 * issuing the interrupt under client_lock.
 */
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13941
// Interrupt a blocked file-lock request.  'd' is the opaque MetaRequest
// pointer previously exposed to the caller.  The work is queued on the
// dedicated interrupt finisher rather than done inline — presumably so a
// caller already holding locks cannot deadlock on client_lock (confirm).
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13949
13950// =========================================
13951// layout
13952
13953// expose file layouts
13954
13955int Client::describe_layout(const char *relpath, file_layout_t *lp,
13956 const UserPerm& perms)
13957{
11fdf7f2 13958 std::lock_guard lock(client_lock);
7c673cae 13959
181888fb
FG
13960 if (unmounting)
13961 return -ENOTCONN;
13962
7c673cae
FG
13963 filepath path(relpath);
13964 InodeRef in;
13965 int r = path_walk(path, &in, perms);
13966 if (r < 0)
13967 return r;
13968
13969 *lp = in->layout;
13970
11fdf7f2 13971 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
13972 return 0;
13973}
13974
13975int Client::fdescribe_layout(int fd, file_layout_t *lp)
13976{
11fdf7f2 13977 std::lock_guard lock(client_lock);
7c673cae 13978
181888fb
FG
13979 if (unmounting)
13980 return -ENOTCONN;
13981
7c673cae
FG
13982 Fh *f = get_filehandle(fd);
13983 if (!f)
13984 return -EBADF;
13985 Inode *in = f->inode.get();
13986
13987 *lp = in->layout;
13988
11fdf7f2 13989 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
13990 return 0;
13991}
13992
d2e6a577
FG
13993int64_t Client::get_default_pool_id()
13994{
11fdf7f2 13995 std::lock_guard lock(client_lock);
181888fb
FG
13996
13997 if (unmounting)
13998 return -ENOTCONN;
13999
d2e6a577
FG
14000 /* first data pool is the default */
14001 return mdsmap->get_first_data_pool();
14002}
7c673cae
FG
14003
14004// expose osdmap
14005
14006int64_t Client::get_pool_id(const char *pool_name)
14007{
11fdf7f2 14008 std::lock_guard lock(client_lock);
181888fb
FG
14009
14010 if (unmounting)
14011 return -ENOTCONN;
14012
7c673cae
FG
14013 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14014 pool_name);
14015}
14016
14017string Client::get_pool_name(int64_t pool)
14018{
11fdf7f2 14019 std::lock_guard lock(client_lock);
181888fb
FG
14020
14021 if (unmounting)
14022 return string();
14023
7c673cae
FG
14024 return objecter->with_osdmap([pool](const OSDMap& o) {
14025 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14026 });
14027}
14028
14029int Client::get_pool_replication(int64_t pool)
14030{
11fdf7f2 14031 std::lock_guard lock(client_lock);
181888fb
FG
14032
14033 if (unmounting)
14034 return -ENOTCONN;
14035
7c673cae
FG
14036 return objecter->with_osdmap([pool](const OSDMap& o) {
14037 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
14038 });
14039}
14040
// Find the acting OSD set for the object containing file offset 'off' of
// open file 'fd'.  On success 'osds' holds the acting set and, when 'len'
// is non-null, *len is the byte count from 'off' to the end of the stripe
// unit.  Returns -ENOTCONN / -EBADF / -EINVAL on failure.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte span at 'off' — this yields exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
    pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
    o.pg_to_acting_osds(pg, osds);
  });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14086
14087int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14088{
11fdf7f2 14089 std::lock_guard lock(client_lock);
181888fb
FG
14090
14091 if (unmounting)
14092 return -ENOTCONN;
14093
7c673cae
FG
14094 if (id < 0)
14095 return -EINVAL;
14096 return objecter->with_osdmap([&](const OSDMap& o) {
14097 return o.crush->get_full_location_ordered(id, path);
14098 });
14099}
14100
// Resolve the addresses of the acting OSDs for the object backing file
// offset 'offset' of open file 'fd'.  Appends one address per acting OSD
// to 'address'; returns -EINVAL if the acting set is empty.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  A 1-byte span maps to exactly one extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
    pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
    vector<int> osds;
    o.pg_to_acting_osds(pg, osds);
    if (osds.empty())
      return -EINVAL;
    for (unsigned i = 0; i < osds.size(); i++) {
      entity_addr_t addr = o.get_addrs(osds[i]).front();
      address.push_back(addr);
    }
    return 0;
  });
}
14134
14135int Client::get_osd_addr(int osd, entity_addr_t& addr)
14136{
11fdf7f2 14137 std::lock_guard lock(client_lock);
181888fb
FG
14138
14139 if (unmounting)
14140 return -ENOTCONN;
14141
7c673cae
FG
14142 return objecter->with_osdmap([&](const OSDMap& o) {
14143 if (!o.exists(osd))
14144 return -ENOENT;
14145
11fdf7f2 14146 addr = o.get_addrs(osd).front();
7c673cae
FG
14147 return 0;
14148 });
14149}
14150
14151int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14152 loff_t length, loff_t offset)
14153{
11fdf7f2 14154 std::lock_guard lock(client_lock);
7c673cae 14155
181888fb
FG
14156 if (unmounting)
14157 return -ENOTCONN;
14158
7c673cae
FG
14159 Fh *f = get_filehandle(fd);
14160 if (!f)
14161 return -EBADF;
14162 Inode *in = f->inode.get();
14163
14164 // map to a list of extents
14165 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14166
11fdf7f2 14167 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
14168 return 0;
14169}
14170
14171
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Cache the result per osdmap epoch: only re-scan when the map changed.
  // local_osd / local_osd_epoch are mutable members updated under the lock.
  objecter->with_osdmap([this](const OSDMap& o) {
    if (o.get_epoch() != local_osd_epoch) {
      local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
      local_osd_epoch = o.get_epoch();
    }
  });
  return local_osd;
}
14188
14189
14190
14191
14192
14193
14194// ===============================
14195
// Messenger callback: a connection we initiated is now established.
// Nothing to do beyond logging; session state is driven elsewhere.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14200
// Messenger callback: the remote side reset an established connection.
// Returning false leaves recovery to the messenger/session machinery.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14206
14207void Client::ms_handle_remote_reset(Connection *con)
14208{
11fdf7f2
TL
14209 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14210 std::lock_guard l(client_lock);
7c673cae
FG
14211 switch (con->get_peer_type()) {
14212 case CEPH_ENTITY_TYPE_MDS:
14213 {
14214 // kludge to figure out which mds this is; fixme with a Connection* state
14215 mds_rank_t mds = MDS_RANK_NONE;
14216 MetaSession *s = NULL;
11fdf7f2
TL
14217 for (auto &p : mds_sessions) {
14218 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14219 mds = p.first;
14220 s = &p.second;
7c673cae
FG
14221 }
14222 }
14223 if (mds >= 0) {
d2e6a577 14224 assert (s != NULL);
7c673cae
FG
14225 switch (s->state) {
14226 case MetaSession::STATE_CLOSING:
14227 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14228 _closed_mds_session(s);
14229 break;
14230
14231 case MetaSession::STATE_OPENING:
14232 {
14233 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14234 list<Context*> waiters;
14235 waiters.swap(s->waiting_for_open);
14236 _closed_mds_session(s);
14237 MetaSession *news = _get_or_open_mds_session(mds);
14238 news->waiting_for_open.swap(waiters);
14239 }
14240 break;
14241
14242 case MetaSession::STATE_OPEN:
14243 {
28e407b8 14244 objecter->maybe_request_map(); /* to check if we are blacklisted */
f6b5b4d7 14245 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
7c673cae
FG
14246 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14247 _closed_mds_session(s);
14248 } else {
14249 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14250 s->state = MetaSession::STATE_STALE;
14251 }
14252 }
14253 break;
14254
14255 case MetaSession::STATE_NEW:
14256 case MetaSession::STATE_CLOSED:
14257 default:
14258 break;
14259 }
14260 }
14261 }
14262 break;
14263 }
14264}
14265
// Messenger callback: connection attempt was actively refused by the
// peer.  Returning false means no special recovery beyond normal retry.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14271
7c673cae
FG
// Walk up the snap-realm chain from 'in' and return the nearest ancestor
// inode that has quota enabled; falls back to root_ancestor when no realm
// on the path carries a quota, or when a realm's inode isn't in cache.
// 'perms' is currently unused here but kept for interface symmetry.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;  // realm inode not cached; give up and use root_ancestor

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14293
14294/**
14295 * Traverse quota ancestors of the Inode, return true
14296 * if any of them passes the passed function
14297 */
14298bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14299 std::function<bool (const Inode &in)> test)
14300{
14301 while (true) {
11fdf7f2 14302 ceph_assert(in != NULL);
7c673cae
FG
14303 if (test(*in)) {
14304 return true;
14305 }
14306
14307 if (in == root_ancestor) {
14308 // We're done traversing, drop out
14309 return false;
14310 } else {
14311 // Continue up the tree
14312 in = get_quota_root(in, perms);
14313 }
14314 }
14315
14316 return false;
14317}
14318
14319bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14320{
14321 return check_quota_condition(in, perms,
14322 [](const Inode &in) {
14323 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14324 });
14325}
14326
14327bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14328 const UserPerm& perms)
7c673cae
FG
14329{
14330 return check_quota_condition(in, perms,
11fdf7f2 14331 [&new_bytes](const Inode &in) {
7c673cae
FG
14332 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14333 > in.quota.max_bytes;
14334 });
14335}
14336
11fdf7f2 14337bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 14338{
9f95a23c
TL
14339 ceph_assert(in->size >= in->reported_size);
14340 const uint64_t size = in->size - in->reported_size;
11fdf7f2 14341 return check_quota_condition(in, perms,
9f95a23c 14342 [&size](const Inode &in) {
11fdf7f2
TL
14343 if (in.quota.max_bytes) {
14344 if (in.rstat.rbytes >= in.quota.max_bytes) {
14345 return true;
14346 }
14347
11fdf7f2 14348 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
11fdf7f2
TL
14349 return (space >> 4) < size;
14350 } else {
14351 return false;
14352 }
14353 });
7c673cae
FG
14354}
14355
// Per-(pool id, namespace) permission state bits cached in pool_perms;
// stored OR'd together in an int (see check_pool_perm()).
enum {
  POOL_CHECKED = 1,   // a permission probe has completed
  POOL_CHECKING = 2,  // a probe is in flight; other callers wait
  POOL_READ = 4,      // read access verified
  POOL_WRITE = 8,     // write access verified
};
14362
// Verify (and cache) whether this client may read/write the pool backing
// 'in'.  'need' is a cap mask; CEPH_CAP_FILE_RD/WR are checked against a
// cached probe result keyed by (pool id, pool namespace).  The probe
// issues a stat (read check) and an exclusive create (write check) on the
// file's first object, dropping client_lock while waiting.  Returns 0,
// -EPERM, or -EIO when the probe result is indeterminate.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in flight so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: stat the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create (EEXIST still proves write access).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both probes are in flight.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the verified result and wake any waiters.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14469
14470int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14471{
14472 if (acl_type == POSIX_ACL) {
14473 if (in->xattrs.count(ACL_EA_ACCESS)) {
14474 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14475
14476 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14477 }
14478 }
14479 return -EAGAIN;
14480}
14481
// Rewrite the inode's access ACL to agree with a new file mode (chmod).
// Returns 0 on success (including when ACLs are disabled or absent),
// negative errno on failure.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Fetch xattrs only if we have never seen them for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a private copy; posix_acl_access_chmod mutates the buffer.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;  // no access ACL present; nothing to rewrite
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14507
// Compute the initial ACL xattrs for a new inode created under 'dir',
// inheriting from the directory's default ACL.  May clear bits in *mode
// (via ACL inheritance or the umask callback).  On success returns the
// number of xattrs encoded into xattrs_bl (0 if none); negative errno on
// error.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Fetch the directory's xattrs only if never seen before.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// Inherited ACL is non-trivial; see whether it reduces to mode
	// bits alone, else store it as the access ACL.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the umask callback, if one is registered.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14555
// Enable global objecter op flags.  Only 0 or
// CEPH_OSD_FLAG_LOCALIZE_READS is accepted.
void Client::set_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
14563
// Clear a global objecter op flag previously set by set_filer_flags().
// Only CEPH_OSD_FLAG_LOCALIZE_READS is accepted.
void Client::clear_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
14570
11fdf7f2
TL
14571// called before mount
14572void Client::set_uuid(const std::string& uuid)
14573{
14574 std::lock_guard l(client_lock);
14575 assert(initialized);
14576 assert(!uuid.empty());
14577
14578 metadata["uuid"] = uuid;
14579 _close_sessions();
14580}
14581
14582// called before mount. 0 means infinite
14583void Client::set_session_timeout(unsigned timeout)
14584{
14585 std::lock_guard l(client_lock);
14586 assert(initialized);
14587
14588 metadata["timeout"] = stringify(timeout);
14589}
14590
// called before mount
// Reclaim the MDS sessions of a previous client instance identified by
// 'uuid'.  Iterates over all in-map MDS ranks, opening sessions and
// issuing MClientReclaim requests; then (unless CEPH_RECLAIM_RESET was
// requested) verifies via the OSD map that the target was not
// blacklisted.  finish_reclaim() completes the handover.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // Refuse to reclaim our own uuid.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // 'mds' only advances once that rank has finished reclaiming; all other
  // paths 'continue' to retry the same rank after waiting.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply().
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // Remember which uuid we are reclaiming; finish_reclaim() adopts it.
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14689
// Complete (or abort) a session reclaim started by start_reclaim():
// tell every MDS to finish the reclaim and adopt the reclaimed uuid.
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    // start_reclaim() never got far enough to record the uuid; just clear
    // any per-session reclaim state.
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // The reclaimed uuid becomes this client's uuid from now on.
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14708
// Handle an MDS reply to a reclaim request: record progress or failure on
// the session and wake anyone blocked in start_reclaim().
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // Keep the newest OSD epoch / target addrs seen across all replies.
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14733
7c673cae
FG
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14746
14747const char** Client::get_tracked_conf_keys() const
14748{
14749 static const char* keys[] = {
14750 "client_cache_size",
14751 "client_cache_mid",
14752 "client_acl_type",
b32b8144
FG
14753 "client_deleg_timeout",
14754 "client_deleg_break_on_open",
7c673cae
FG
14755 NULL
14756 };
14757 return keys;
14758}
14759
// React to runtime changes of the options listed in
// get_tracked_conf_keys().  Only cache_mid and acl_type need explicit
// handling here; the other tracked options are presumably read at their
// point of use — confirm before relying on that.
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}
14774
7c673cae
FG
// Intrusive refcount hook: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14779
// Intrusive refcount hook: drop a reference, routed through the owning
// client so the inode can be torn down properly on last put.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14784
14785mds_rank_t Client::_get_random_up_mds() const
14786{
9f95a23c 14787 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
14788
14789 std::set<mds_rank_t> up;
14790 mdsmap->get_up_mds_set(up);
14791
14792 if (up.empty())
14793 return MDS_RANK_NONE;
14794 std::set<mds_rank_t>::const_iterator p = up.begin();
14795 for (int n = rand() % up.size(); n; n--)
14796 ++p;
14797 return *p;
14798}
14799
14800
// A Client that owns its own Objecter, for use when no external one is
// supplied.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, nullptr))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14807
StandaloneClient::~StandaloneClient()
{
  // We created the objecter in our constructor, so we own its deletion.
  delete objecter;
  objecter = nullptr;
}
14813
// Bring up the standalone client: objecter, dispatchers, and monclient.
// On monclient failure we must unwind the partially-initialized state in
// this exact order before returning the error.
int StandaloneClient::init()
{
  _pre_init();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14843
void StandaloneClient::shutdown()
{
  // Shut down the base client first, then the services it was using.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}