]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
7c673cae
FG
73#include "common/perf_counters.h"
74#include "common/admin_socket.h"
75#include "common/errno.h"
76#include "include/str_list.h"
77
78#define dout_subsys ceph_subsys_client
79
80#include "include/lru.h"
81#include "include/compat.h"
82#include "include/stringify.h"
83
84#include "Client.h"
85#include "Inode.h"
86#include "Dentry.h"
b32b8144 87#include "Delegation.h"
7c673cae
FG
88#include "Dir.h"
89#include "ClientSnapRealm.h"
90#include "Fh.h"
91#include "MetaSession.h"
92#include "MetaRequest.h"
93#include "ObjecterWriteback.h"
94#include "posix_acl.h"
95
11fdf7f2 96#include "include/ceph_assert.h"
7c673cae
FG
97#include "include/stat.h"
98
99#include "include/cephfs/ceph_statx.h"
100
101#if HAVE_GETGROUPLIST
102#include <grp.h>
103#include <pwd.h>
104#include <unistd.h>
105#endif
106
107#undef dout_prefix
108#define dout_prefix *_dout << "client." << whoami << " "
109
110#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112// FreeBSD fails to define this
113#ifndef O_DSYNC
114#define O_DSYNC 0x0
115#endif
116// Darwin fails to define this
117#ifndef O_RSYNC
118#define O_RSYNC 0x0
119#endif
120
121#ifndef O_DIRECT
122#define O_DIRECT 0x0
123#endif
124
125#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
128{
129 Client *client = static_cast<Client*>(p);
130 client->flush_set_callback(oset);
131}
132
133
134// -------------
135
136Client::CommandHook::CommandHook(Client *client) :
137 m_client(client)
138{
139}
140
9f95a23c
TL
141int Client::CommandHook::call(
142 std::string_view command,
143 const cmdmap_t& cmdmap,
144 Formatter *f,
145 std::ostream& errss,
146 bufferlist& out)
7c673cae 147{
7c673cae 148 f->open_object_section("result");
9f95a23c
TL
149 {
150 std::lock_guard l{m_client->client_lock};
151 if (command == "mds_requests")
152 m_client->dump_mds_requests(f);
153 else if (command == "mds_sessions")
154 m_client->dump_mds_sessions(f);
155 else if (command == "dump_cache")
156 m_client->dump_cache(f);
157 else if (command == "kick_stale_sessions")
158 m_client->_kick_stale_sessions();
159 else if (command == "status")
160 m_client->dump_status(f);
161 else
162 ceph_abort_msg("bad command registered");
163 }
7c673cae 164 f->close_section();
9f95a23c 165 return 0;
7c673cae
FG
166}
167
168
169// -------------
170
171dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
172 : inode(in), offset(0), next_offset(2),
173 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
174 perms(perms)
175 { }
176
177void Client::_reset_faked_inos()
178{
179 ino_t start = 1024;
180 free_faked_inos.clear();
181 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
182 last_used_faked_ino = 0;
11fdf7f2 183 last_used_faked_root = 0;
7c673cae
FG
184 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
185}
186
187void Client::_assign_faked_ino(Inode *in)
188{
11fdf7f2
TL
189 if (0 == last_used_faked_ino)
190 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
191 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
192 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 193 last_used_faked_ino = 2048;
7c673cae
FG
194 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
195 }
11fdf7f2 196 ceph_assert(it != free_faked_inos.end());
7c673cae 197 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 198 ceph_assert(it.get_len() > 0);
7c673cae
FG
199 last_used_faked_ino = it.get_start();
200 } else {
201 ++last_used_faked_ino;
11fdf7f2 202 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
203 }
204 in->faked_ino = last_used_faked_ino;
205 free_faked_inos.erase(in->faked_ino);
206 faked_ino_map[in->faked_ino] = in->vino();
207}
208
11fdf7f2
TL
209/*
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10bits(0x3ff) of the "root inodes".
215*/
216void Client::_assign_faked_root(Inode *in)
217{
218 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
219 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
220 last_used_faked_root = 0;
221 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
222 }
223 assert(it != free_faked_inos.end());
224 vinodeno_t inode_info = in->vino();
225 uint64_t inode_num = (uint64_t)inode_info.ino;
226 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
227 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it.get_start() + it.get_len() > last_used_faked_root);
229
230 in->faked_ino = last_used_faked_root;
231 free_faked_inos.erase(in->faked_ino);
232 faked_ino_map[in->faked_ino] = in->vino();
233}
234
7c673cae
FG
235void Client::_release_faked_ino(Inode *in)
236{
237 free_faked_inos.insert(in->faked_ino);
238 faked_ino_map.erase(in->faked_ino);
239}
240
241vinodeno_t Client::_map_faked_ino(ino_t ino)
242{
243 vinodeno_t vino;
244 if (ino == 1)
245 vino = root->vino();
246 else if (faked_ino_map.count(ino))
247 vino = faked_ino_map[ino];
248 else
249 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 250 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
251 return vino;
252}
253
254vinodeno_t Client::map_faked_ino(ino_t ino)
255{
11fdf7f2 256 std::lock_guard lock(client_lock);
7c673cae
FG
257 return _map_faked_ino(ino);
258}
259
260// cons/des
261
262Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
263 : Dispatcher(m->cct),
7c673cae 264 timer(m->cct, client_lock),
11fdf7f2
TL
265 messenger(m),
266 monclient(mc),
267 objecter(objecter_),
268 whoami(mc->get_global_id()),
7c673cae
FG
269 async_ino_invalidator(m->cct),
270 async_dentry_invalidator(m->cct),
271 interrupt_finisher(m->cct),
272 remount_finisher(m->cct),
273 objecter_finisher(m->cct),
11fdf7f2
TL
274 m_command_hook(this),
275 fscid(0)
7c673cae
FG
276{
277 _reset_faked_inos();
7c673cae 278
7c673cae
FG
279 user_id = cct->_conf->client_mount_uid;
280 group_id = cct->_conf->client_mount_gid;
92f5a8d4
TL
281 fuse_default_permissions = cct->_conf.get_val<bool>(
282 "fuse_default_permissions");
7c673cae 283
7c673cae
FG
284 if (cct->_conf->client_acl_type == "posix_acl")
285 acl_type = POSIX_ACL;
286
7c673cae
FG
287 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
288
289 // file handles
290 free_fd_set.insert(10, 1<<30);
291
292 mdsmap.reset(new MDSMap);
293
294 // osd interfaces
295 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
296 &client_lock));
297 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
298 client_flush_set_callback, // all commit callback
299 (void*)this,
300 cct->_conf->client_oc_size,
301 cct->_conf->client_oc_max_objects,
302 cct->_conf->client_oc_max_dirty,
303 cct->_conf->client_oc_target_dirty,
304 cct->_conf->client_oc_max_dirty_age,
305 true));
306 objecter_finisher.start();
307 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 308 objecter->enable_blacklist_events();
7c673cae
FG
309}
310
311
312Client::~Client()
313{
9f95a23c 314 ceph_assert(ceph_mutex_is_not_locked(client_lock));
7c673cae 315
31f18b77
FG
316 // It is necessary to hold client_lock, because any inode destruction
317 // may call into ObjectCacher, which asserts that it's lock (which is
318 // client_lock) is held.
9f95a23c 319 std::lock_guard l{client_lock};
7c673cae
FG
320 tear_down_cache();
321}
322
323void Client::tear_down_cache()
324{
325 // fd's
326 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
327 it != fd_map.end();
328 ++it) {
329 Fh *fh = it->second;
11fdf7f2 330 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
331 _release_fh(fh);
332 }
333 fd_map.clear();
334
335 while (!opened_dirs.empty()) {
336 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 337 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
338 _closedir(dirp);
339 }
340
341 // caps!
342 // *** FIXME ***
343
344 // empty lru
7c673cae 345 trim_cache();
11fdf7f2 346 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
347
348 // close root ino
11fdf7f2 349 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
350 if (root && inode_map.size() == 1 + root_parents.size()) {
351 delete root;
352 root = 0;
353 root_ancestor = 0;
354 while (!root_parents.empty())
355 root_parents.erase(root_parents.begin());
356 inode_map.clear();
357 _reset_faked_inos();
358 }
359
11fdf7f2 360 ceph_assert(inode_map.empty());
7c673cae
FG
361}
362
363inodeno_t Client::get_root_ino()
364{
11fdf7f2 365 std::lock_guard l(client_lock);
7c673cae
FG
366 if (use_faked_inos())
367 return root->faked_ino;
368 else
369 return root->ino;
370}
371
372Inode *Client::get_root()
373{
11fdf7f2 374 std::lock_guard l(client_lock);
7c673cae
FG
375 root->ll_get();
376 return root;
377}
378
379
380// debug crapola
381
382void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
383{
384 filepath path;
385 in->make_long_path(path);
386 ldout(cct, 1) << "dump_inode: "
387 << (disconnected ? "DISCONNECTED ":"")
388 << "inode " << in->ino
389 << " " << path
390 << " ref " << in->get_num_ref()
391 << *in << dendl;
392
393 if (f) {
394 f->open_object_section("inode");
395 f->dump_stream("path") << path;
396 if (disconnected)
397 f->dump_int("disconnected", 1);
398 in->dump(f);
399 f->close_section();
400 }
401
402 did.insert(in);
403 if (in->dir) {
404 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
405 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
406 it != in->dir->dentries.end();
407 ++it) {
408 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
409 if (f) {
410 f->open_object_section("dentry");
411 it->second->dump(f);
412 f->close_section();
413 }
414 if (it->second->inode)
415 dump_inode(f, it->second->inode.get(), did, false);
416 }
417 }
418}
419
420void Client::dump_cache(Formatter *f)
421{
422 set<Inode*> did;
423
11fdf7f2 424 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
425
426 if (f)
427 f->open_array_section("cache");
428
429 if (root)
430 dump_inode(f, root, did, true);
431
432 // make a second pass to catch anything disconnected
433 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
434 it != inode_map.end();
435 ++it) {
436 if (did.count(it->second))
437 continue;
438 dump_inode(f, it->second, did, true);
439 }
440
441 if (f)
442 f->close_section();
443}
444
445void Client::dump_status(Formatter *f)
446{
9f95a23c 447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
448
449 ldout(cct, 1) << __func__ << dendl;
450
451 const epoch_t osd_epoch
452 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
453
454 if (f) {
455 f->open_object_section("metadata");
456 for (const auto& kv : metadata)
457 f->dump_string(kv.first.c_str(), kv.second);
458 f->close_section();
459
460 f->dump_int("dentry_count", lru.lru_get_size());
461 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
462 f->dump_int("id", get_nodeid().v);
11fdf7f2 463 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 464 f->dump_object("inst", inst);
11fdf7f2
TL
465 f->dump_object("addr", inst.addr);
466 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
467 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
468 f->dump_int("inode_count", inode_map.size());
469 f->dump_int("mds_epoch", mdsmap->get_epoch());
470 f->dump_int("osd_epoch", osd_epoch);
471 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f64942e4 472 f->dump_bool("blacklisted", blacklisted);
7c673cae
FG
473 }
474}
475
476int Client::init()
477{
478 timer.init();
479 objectcacher->start();
9f95a23c
TL
480 {
481 std::lock_guard l{client_lock};
482 ceph_assert(!initialized);
483 messenger->add_dispatcher_tail(this);
484 }
7c673cae
FG
485 _finish_init();
486 return 0;
487}
488
489void Client::_finish_init()
490{
9f95a23c
TL
491 {
492 std::lock_guard l{client_lock};
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502 }
7c673cae 503
11fdf7f2 504 cct->_conf.add_observer(this);
7c673cae
FG
505
506 AdminSocket* admin_socket = cct->get_admin_socket();
507 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
508 &m_command_hook,
509 "show in-progress mds requests");
510 if (ret < 0) {
511 lderr(cct) << "error registering admin socket command: "
512 << cpp_strerror(-ret) << dendl;
513 }
514 ret = admin_socket->register_command("mds_sessions",
7c673cae
FG
515 &m_command_hook,
516 "show mds session state");
517 if (ret < 0) {
518 lderr(cct) << "error registering admin socket command: "
519 << cpp_strerror(-ret) << dendl;
520 }
521 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
522 &m_command_hook,
523 "show in-memory metadata cache contents");
524 if (ret < 0) {
525 lderr(cct) << "error registering admin socket command: "
526 << cpp_strerror(-ret) << dendl;
527 }
528 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
529 &m_command_hook,
530 "kick sessions that were remote reset");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("status",
7c673cae
FG
536 &m_command_hook,
537 "show overall client status");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542
9f95a23c 543 std::lock_guard l{client_lock};
7c673cae 544 initialized = true;
7c673cae
FG
545}
546
547void Client::shutdown()
548{
11fdf7f2 549 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
550
551 // If we were not mounted, but were being used for sending
552 // MDS commands, we may have sessions that need closing.
9f95a23c
TL
553 {
554 std::lock_guard l{client_lock};
555 _close_sessions();
556 }
11fdf7f2 557 cct->_conf.remove_observer(this);
7c673cae 558
11fdf7f2 559 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
560
561 if (ino_invalidate_cb) {
562 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
563 async_ino_invalidator.wait_for_empty();
564 async_ino_invalidator.stop();
565 }
566
567 if (dentry_invalidate_cb) {
568 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
569 async_dentry_invalidator.wait_for_empty();
570 async_dentry_invalidator.stop();
571 }
572
573 if (switch_interrupt_cb) {
574 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
575 interrupt_finisher.wait_for_empty();
576 interrupt_finisher.stop();
577 }
578
579 if (remount_cb) {
580 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
581 remount_finisher.wait_for_empty();
582 remount_finisher.stop();
583 }
584
585 objectcacher->stop(); // outside of client_lock! this does a join.
9f95a23c
TL
586 {
587 std::lock_guard l{client_lock};
588 ceph_assert(initialized);
589 initialized = false;
590 timer.shutdown();
591 }
7c673cae
FG
592 objecter_finisher.wait_for_empty();
593 objecter_finisher.stop();
594
595 if (logger) {
596 cct->get_perfcounters_collection()->remove(logger.get());
597 logger.reset();
598 }
599}
600
601
602// ===================
603// metadata cache stuff
604
605void Client::trim_cache(bool trim_kernel_dcache)
606{
181888fb
FG
607 uint64_t max = cct->_conf->client_cache_size;
608 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
609 unsigned last = 0;
610 while (lru.lru_get_size() != last) {
611 last = lru.lru_get_size();
612
181888fb 613 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
614
615 // trim!
31f18b77 616 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
617 if (!dn)
618 break; // done
619
620 trim_dentry(dn);
621 }
622
181888fb 623 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
624 _invalidate_kernel_dcache();
625
626 // hose root?
627 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
628 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
629 delete root;
630 root = 0;
631 root_ancestor = 0;
632 while (!root_parents.empty())
633 root_parents.erase(root_parents.begin());
634 inode_map.clear();
635 _reset_faked_inos();
636 }
637}
638
639void Client::trim_cache_for_reconnect(MetaSession *s)
640{
641 mds_rank_t mds = s->mds_num;
11fdf7f2 642 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
643
644 int trimmed = 0;
645 list<Dentry*> skipped;
646 while (lru.lru_get_size() > 0) {
647 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
648 if (!dn)
649 break;
650
651 if ((dn->inode && dn->inode->caps.count(mds)) ||
652 dn->dir->parent_inode->caps.count(mds)) {
653 trim_dentry(dn);
654 trimmed++;
655 } else
656 skipped.push_back(dn);
657 }
658
659 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
660 lru.lru_insert_mid(*p);
661
11fdf7f2 662 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
663 << " trimmed " << trimmed << " dentries" << dendl;
664
665 if (s->caps.size() > 0)
666 _invalidate_kernel_dcache();
667}
668
669void Client::trim_dentry(Dentry *dn)
670{
671 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
672 << " in dir "
673 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
674 << dendl;
675 if (dn->inode) {
676 Inode *diri = dn->dir->parent_inode;
677 diri->dir_release_count++;
678 clear_dir_complete_and_ordered(diri, true);
679 }
680 unlink(dn, false, false); // drop dir, drop dentry
681}
682
683
1adf2230
AA
684void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
685 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 686{
7c673cae
FG
687 uint64_t prior_size = in->size;
688
7c673cae
FG
689 if (truncate_seq > in->truncate_seq ||
690 (truncate_seq == in->truncate_seq && size > in->size)) {
691 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
692 in->size = size;
693 in->reported_size = size;
694 if (truncate_seq != in->truncate_seq) {
695 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
696 << truncate_seq << dendl;
697 in->truncate_seq = truncate_seq;
698 in->oset.truncate_seq = truncate_seq;
699
700 // truncate cached file data
701 if (prior_size > size) {
702 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
703 }
704 }
705
706 // truncate inline data
707 if (in->inline_version < CEPH_INLINE_NONE) {
708 uint32_t len = in->inline_data.length();
709 if (size < len)
710 in->inline_data.splice(size, len - size);
711 }
712 }
713 if (truncate_seq >= in->truncate_seq &&
714 in->truncate_size != truncate_size) {
715 if (in->is_file()) {
716 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
717 << truncate_size << dendl;
718 in->truncate_size = truncate_size;
719 in->oset.truncate_size = truncate_size;
720 } else {
721 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
722 }
723 }
1adf2230
AA
724}
725
726void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
727 utime_t ctime, utime_t mtime, utime_t atime)
728{
729 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
730 << " ctime " << ctime << " mtime " << mtime << dendl;
731
732 if (time_warp_seq > in->time_warp_seq)
733 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
734 << " is higher than local time_warp_seq "
735 << in->time_warp_seq << dendl;
736
737 int warn = false;
7c673cae
FG
738 // be careful with size, mtime, atime
739 if (issued & (CEPH_CAP_FILE_EXCL|
740 CEPH_CAP_FILE_WR|
741 CEPH_CAP_FILE_BUFFER|
742 CEPH_CAP_AUTH_EXCL|
743 CEPH_CAP_XATTR_EXCL)) {
744 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
745 if (ctime > in->ctime)
746 in->ctime = ctime;
747 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
748 //the mds updated times, so take those!
749 in->mtime = mtime;
750 in->atime = atime;
751 in->time_warp_seq = time_warp_seq;
752 } else if (time_warp_seq == in->time_warp_seq) {
753 //take max times
754 if (mtime > in->mtime)
755 in->mtime = mtime;
756 if (atime > in->atime)
757 in->atime = atime;
758 } else if (issued & CEPH_CAP_FILE_EXCL) {
759 //ignore mds values as we have a higher seq
760 } else warn = true;
761 } else {
762 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
763 if (time_warp_seq >= in->time_warp_seq) {
764 in->ctime = ctime;
765 in->mtime = mtime;
766 in->atime = atime;
767 in->time_warp_seq = time_warp_seq;
768 } else warn = true;
769 }
770 if (warn) {
771 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
772 << time_warp_seq << " is lower than local time_warp_seq "
773 << in->time_warp_seq
774 << dendl;
775 }
776}
777
778void Client::_fragmap_remove_non_leaves(Inode *in)
779{
780 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
781 if (!in->dirfragtree.is_leaf(p->first))
782 in->fragmap.erase(p++);
783 else
784 ++p;
785}
786
787void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
788{
789 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
790 if (p->second == mds)
791 in->fragmap.erase(p++);
792 else
793 ++p;
794}
795
796Inode * Client::add_update_inode(InodeStat *st, utime_t from,
797 MetaSession *session,
798 const UserPerm& request_perms)
799{
800 Inode *in;
801 bool was_new = false;
802 if (inode_map.count(st->vino)) {
803 in = inode_map[st->vino];
11fdf7f2 804 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
805 } else {
806 in = new Inode(this, st->vino, &st->layout);
807 inode_map[st->vino] = in;
808
809 if (use_faked_inos())
810 _assign_faked_ino(in);
811
812 if (!root) {
813 root = in;
11fdf7f2
TL
814 if (use_faked_inos())
815 _assign_faked_root(root);
7c673cae
FG
816 root_ancestor = in;
817 cwd = root;
818 } else if (!mounted) {
819 root_parents[root_ancestor] = in;
820 root_ancestor = in;
821 }
822
823 // immutable bits
824 in->ino = st->vino.ino;
825 in->snapid = st->vino.snapid;
826 in->mode = st->mode & S_IFMT;
827 was_new = true;
828 }
829
830 in->rdev = st->rdev;
831 if (in->is_symlink())
832 in->symlink = st->symlink;
833
7c673cae 834 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
835 bool new_version = false;
836 if (in->version == 0 ||
837 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
838 (in->version & ~1) < st->version))
839 new_version = true;
7c673cae 840
1adf2230
AA
841 int issued;
842 in->caps_issued(&issued);
843 issued |= in->caps_dirty();
844 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 845
1adf2230
AA
846 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
847 !(issued & CEPH_CAP_AUTH_EXCL)) {
848 in->mode = st->mode;
849 in->uid = st->uid;
850 in->gid = st->gid;
851 in->btime = st->btime;
81eedcae 852 in->snap_btime = st->snap_btime;
1adf2230 853 }
7c673cae 854
1adf2230
AA
855 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
856 !(issued & CEPH_CAP_LINK_EXCL)) {
857 in->nlink = st->nlink;
858 }
7c673cae 859
1adf2230
AA
860 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
861 update_inode_file_time(in, issued, st->time_warp_seq,
862 st->ctime, st->mtime, st->atime);
863 }
7c673cae 864
1adf2230
AA
865 if (new_version ||
866 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 867 in->layout = st->layout;
1adf2230
AA
868 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
869 }
7c673cae 870
1adf2230
AA
871 if (in->is_dir()) {
872 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
873 in->dirstat = st->dirstat;
874 }
875 // dir_layout/rstat/quota are not tracked by capability, update them only if
876 // the inode stat is from auth mds
877 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
878 in->dir_layout = st->dir_layout;
879 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
880 in->rstat = st->rstat;
881 in->quota = st->quota;
11fdf7f2 882 in->dir_pin = st->dir_pin;
1adf2230
AA
883 }
884 // move me if/when version reflects fragtree changes.
885 if (in->dirfragtree != st->dirfragtree) {
886 in->dirfragtree = st->dirfragtree;
887 _fragmap_remove_non_leaves(in);
7c673cae 888 }
7c673cae
FG
889 }
890
891 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
892 st->xattrbl.length() &&
893 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
894 auto p = st->xattrbl.cbegin();
895 decode(in->xattrs, p);
7c673cae
FG
896 in->xattr_version = st->xattr_version;
897 }
898
1adf2230
AA
899 if (st->inline_version > in->inline_version) {
900 in->inline_data = st->inline_data;
901 in->inline_version = st->inline_version;
7c673cae
FG
902 }
903
1adf2230
AA
904 /* always take a newer change attr */
905 if (st->change_attr > in->change_attr)
906 in->change_attr = st->change_attr;
907
908 if (st->version > in->version)
909 in->version = st->version;
910
911 if (was_new)
912 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
913
914 if (!st->cap.caps)
915 return in; // as with readdir returning indoes in different snaprealms (no caps!)
916
7c673cae 917 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
918 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
919 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
920 st->cap.flags, request_perms);
28e407b8 921 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 922 in->max_size = st->max_size;
28e407b8
AA
923 in->rstat = st->rstat;
924 }
7c673cae 925
1adf2230
AA
926 // setting I_COMPLETE needs to happen after adding the cap
927 if (in->is_dir() &&
928 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
929 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
930 in->dirstat.nfiles == 0 &&
931 in->dirstat.nsubdirs == 0) {
932 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
933 in->flags |= I_COMPLETE | I_DIR_ORDERED;
934 if (in->dir) {
935 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
936 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
937 in->dir->readdir_cache.clear();
938 for (const auto& p : in->dir->dentries) {
939 unlink(p.second, true, true); // keep dir, keep dentry
940 }
941 if (in->dir->dentries.empty())
942 close_dir(in->dir);
7c673cae 943 }
7c673cae 944 }
1adf2230
AA
945 } else {
946 in->snap_caps |= st->cap.caps;
7c673cae
FG
947 }
948
949 return in;
950}
951
952
953/*
954 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
955 */
956Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
957 Inode *in, utime_t from, MetaSession *session,
958 Dentry *old_dentry)
959{
960 Dentry *dn = NULL;
961 if (dir->dentries.count(dname))
962 dn = dir->dentries[dname];
963
11fdf7f2 964 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
965 << " in dir " << dir->parent_inode->vino() << " dn " << dn
966 << dendl;
967
968 if (dn && dn->inode) {
969 if (dn->inode->vino() == in->vino()) {
970 touch_dn(dn);
971 ldout(cct, 12) << " had dentry " << dname
972 << " with correct vino " << dn->inode->vino()
973 << dendl;
974 } else {
975 ldout(cct, 12) << " had dentry " << dname
976 << " with WRONG vino " << dn->inode->vino()
977 << dendl;
978 unlink(dn, true, true); // keep dir, keep dentry
979 }
980 }
981
982 if (!dn || !dn->inode) {
983 InodeRef tmp_ref(in);
984 if (old_dentry) {
985 if (old_dentry->dir != dir) {
986 Inode *old_diri = old_dentry->dir->parent_inode;
987 old_diri->dir_ordered_count++;
988 clear_dir_complete_and_ordered(old_diri, false);
989 }
990 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
991 }
992 Inode *diri = dir->parent_inode;
993 diri->dir_ordered_count++;
994 clear_dir_complete_and_ordered(diri, false);
995 dn = link(dir, dname, in, dn);
996 }
997
998 update_dentry_lease(dn, dlease, from, session);
999 return dn;
1000}
1001
/*
 * update_dentry_lease - apply a dentry lease carried in an MDS reply.
 * The lease TTL is computed from the request send time plus the granted
 * duration; an existing longer lease is never shortened.
 */
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  // expiry = time the request was sent + duration granted by the MDS
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    // only extend, never shorten, the cached lease
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  // remember the dir's shared cap generation so the lease can be
  // invalidated when the directory's caps change
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1021
1022
/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: record (or forget) which mds is authoritative for this dirfrag
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    // the MDS treats this frag as a leaf; force our fragtree to agree and
    // drop fragmap entries that are no longer leaves
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1043
1044void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1045{
1046 if (diri->flags & I_COMPLETE) {
1047 if (complete) {
1048 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1049 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1050 } else {
1051 if (diri->flags & I_DIR_ORDERED) {
1052 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1053 diri->flags &= ~I_DIR_ORDERED;
1054 }
1055 }
1056 if (diri->dir)
1057 diri->dir->readdir_cache.clear();
1058 }
1059}
1060
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // Newer MDSs version-tag the reply encoding; decode with all features on.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    // In hash-order mode, offsets restart within runs of equal name hashes;
    // recover the hash we stopped at last time.
    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      // the MDS resolved our frag to a different one (e.g. after a split)
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // If we're starting a fresh listing from the very beginning, snapshot the
    // dir's generation counters so we can tell later whether the cached
    // result is still usable.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // each entry: name, dentry lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// offsets restart at 2 within each distinct hash value
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only if the dir hasn't changed since we
      // snapshotted its generation counters above
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1222
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // we already processed the unsafe (early) reply; the safe reply carries
    // no trace and there is nothing further to cache
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we can no longer trust the cached view of the
    // affected directory, so invalidate its completeness...
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    // ...and proactively unlink dentries the operation is known to remove
    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // Newer MDSs version-tag the reply encoding; decode with all features on.
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    // dentry trace: parent dir inode stat, dir stat, name, dentry lease
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: the MDS must include xattrs when we asked for them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative dentry (no target inode in the reply)
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  // stale positive dentry: drop it and invalidate dir ordering
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	// cache the negative dentry under its lease
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    // no lease came with the reply; use a zero-duration one
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1398
// -------

/*
 * choose_target_mds - pick the mds rank to send a request to.
 * Priority: explicit resend target > dirfrag-hash routing > caps on the
 * inode > a random up mds.  When the choice came from a dirfrag hash,
 * *phash_diri is set to the hashed directory inode so the caller can
 * invalidate the fragmap if that mds turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // a forward/resend target overrides everything else (used once)
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to route by dirfrag
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: route by the parent dir + hashed name
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes have no caps of their own; walk up to a live parent
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      // route to the mds that owns the dirfrag the name hashes into
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // otherwise prefer the auth cap's mds, then any cap-holding mds
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1503
1504
1505void Client::connect_mds_targets(mds_rank_t mds)
1506{
11fdf7f2
TL
1507 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1508 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1509 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1510 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1511 q != info.export_targets.end();
1512 ++q) {
1513 if (mds_sessions.count(*q) == 0 &&
1514 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1515 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1516 << " export target mds." << *q << dendl;
1517 _open_mds_session(*q);
1518 }
1519 }
1520}
1521
1522void Client::dump_mds_sessions(Formatter *f)
1523{
1524 f->dump_int("id", get_nodeid().v);
11fdf7f2 1525 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1526 f->dump_object("inst", inst);
1527 f->dump_stream("inst_str") << inst;
1528 f->dump_stream("addr_str") << inst.addr;
7c673cae 1529 f->open_array_section("sessions");
11fdf7f2 1530 for (const auto &p : mds_sessions) {
7c673cae 1531 f->open_object_section("session");
11fdf7f2 1532 p.second.dump(f);
7c673cae
FG
1533 f->close_section();
1534 }
1535 f->close_section();
1536 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1537}
1538void Client::dump_mds_requests(Formatter *f)
1539{
1540 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1541 p != mds_requests.end();
1542 ++p) {
1543 f->open_object_section("request");
1544 p->second->dump(f);
1545 f->close_section();
1546 }
1547}
1548
/*
 * verify_reply_trace - after a reply, resolve the target inode for the
 * caller.  Handles the traceless-reply case (e.g. MDS replay) by re-looking
 * up the dentry or re-fetching attributes, and extracts the created-inode
 * number for create operations.  Returns r, possibly demoted to -EINTR if a
 * create's follow-up lookup found a different inode.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry on the request: force a getattr on the request inode
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1633
1634
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests may block indefinitely, so they don't count
  // toward the oldest-tid watermark we advertise to the MDS
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait; loop again
  // on forward/kick until we get a reply or the request is aborted
  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the rank we targeted has been stopped; forget it and retry
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // temporarily adopt client_lock into a unique_lock for the condvar wait,
    // then release ownership back without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted without ever getting a reply
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1801
1802void Client::unregister_request(MetaRequest *req)
1803{
1804 mds_requests.erase(req->tid);
1805 if (req->tid == oldest_tid) {
1806 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1807 while (true) {
1808 if (p == mds_requests.end()) {
1809 oldest_tid = 0;
1810 break;
1811 }
1812 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1813 oldest_tid = p->first;
1814 break;
1815 }
1816 ++p;
1817 }
1818 }
1819 put_request(req);
1820}
1821
/*
 * put_request - drop one reference to a request.  On the last reference the
 * request is destroyed and, for successful unlink-like operations, we try to
 * trim the inode the request had pinned (its caps may now be droppable).
 */
void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    // take the pinned inode out before deleting the request
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
	(op == CEPH_MDS_OP_RMDIR ||
	 op == CEPH_MDS_OP_RENAME ||
	 op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}
1840
/*
 * encode_inode_release - piggyback a cap release for 'in' on an outgoing
 * request.  Caps in 'drop' are released unless any cap in 'unless' is held,
 * and never if they are dirty or in use; 'force' appends a release record
 * even when nothing was actually dropped.  Returns nonzero if a release
 * record was appended to req->cap_releases.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", have:" << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop caps that are dirty or actively in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      // append the release record that rides along with the request
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1881
/*
 * encode_dentry_release - piggyback a dentry (lease) release on an outgoing
 * request.  Releasing the dentry implies releasing caps on its parent dir
 * (force=1); if that produced a release record and we hold a lease from this
 * mds, the dentry name/seq are attached to the same record.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // augment the inode release record just appended with the dentry info
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
1901
1902
1903/*
1904 * This requires the MClientRequest *request member to be set.
1905 * It will error out horribly without one.
1906 * Additionally, if you set any *drop member, you'd better have
1907 * set the corresponding dentry!
1908 */
1909void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1910{
11fdf7f2 1911 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
1912 << req << ", mds: " << mds << ")" << dendl;
1913 if (req->inode_drop && req->inode())
1914 encode_inode_release(req->inode(), req,
1915 mds, req->inode_drop,
1916 req->inode_unless);
1917
1918 if (req->old_inode_drop && req->old_inode())
1919 encode_inode_release(req->old_inode(), req,
1920 mds, req->old_inode_drop,
1921 req->old_inode_unless);
1922 if (req->other_inode_drop && req->other_inode())
1923 encode_inode_release(req->other_inode(), req,
1924 mds, req->other_inode_drop,
1925 req->other_inode_unless);
1926
1927 if (req->dentry_drop && req->dentry())
1928 encode_dentry_release(req->dentry(), req,
1929 mds, req->dentry_drop,
1930 req->dentry_unless);
1931
1932 if (req->old_dentry_drop && req->old_dentry())
1933 encode_dentry_release(req->old_dentry(), req,
1934 mds, req->old_dentry_drop,
1935 req->old_dentry_unless);
11fdf7f2 1936 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
1937 << req << ", mds " << mds <<dendl;
1938}
1939
1940bool Client::have_open_session(mds_rank_t mds)
1941{
11fdf7f2
TL
1942 const auto &it = mds_sessions.find(mds);
1943 return it != mds_sessions.end() &&
1944 (it->second.state == MetaSession::STATE_OPEN ||
1945 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1946}
1947
1948MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1949{
11fdf7f2
TL
1950 const auto &it = mds_sessions.find(mds);
1951 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1952 return NULL;
11fdf7f2
TL
1953 } else {
1954 return &it->second;
1955 }
7c673cae
FG
1956}
1957
1958MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1959{
11fdf7f2
TL
1960 auto it = mds_sessions.find(mds);
1961 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1962}
1963
1964/**
1965 * Populate a map of strings with client-identifying metadata,
1966 * such as the hostname. Call this once at initialization.
1967 */
1968void Client::populate_metadata(const std::string &mount_root)
1969{
1970 // Hostname
1971 struct utsname u;
1972 int r = uname(&u);
1973 if (r >= 0) {
1974 metadata["hostname"] = u.nodename;
1975 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1976 } else {
1977 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1978 }
1979
1980 metadata["pid"] = stringify(getpid());
1981
1982 // Ceph entity id (the '0' in "client.0")
1983 metadata["entity_id"] = cct->_conf->name.get_id();
1984
1985 // Our mount position
1986 if (!mount_root.empty()) {
1987 metadata["root"] = mount_root;
1988 }
1989
1990 // Ceph version
1991 metadata["ceph_version"] = pretty_version_to_str();
1992 metadata["ceph_sha1"] = git_version_to_str();
1993
1994 // Apply any metadata from the user's configured overrides
1995 std::vector<std::string> tokens;
1996 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1997 for (const auto &i : tokens) {
1998 auto eqpos = i.find("=");
1999 // Throw out anything that isn't of the form "<str>=<str>"
2000 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2001 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2002 continue;
2003 }
2004 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2005 }
2006}
2007
2008/**
2009 * Optionally add or override client metadata fields.
2010 */
2011void Client::update_metadata(std::string const &k, std::string const &v)
2012{
11fdf7f2
TL
2013 std::lock_guard l(client_lock);
2014 ceph_assert(initialized);
7c673cae 2015
11fdf7f2
TL
2016 auto it = metadata.find(k);
2017 if (it != metadata.end()) {
7c673cae 2018 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2019 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2020 }
2021
2022 metadata[k] = v;
2023}
2024
// Create a new session record for the given MDS rank and send it a
// session-open request (unless that MDS previously rejected us).
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // Construct the session in place; the caller guarantees no session
  // for this rank exists yet.
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // The rank moved to a new address since the rejection, so the old
      // rejection no longer applies; forget it and try again.
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2055
2056void Client::_close_mds_session(MetaSession *s)
2057{
11fdf7f2 2058 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2059 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2060 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2061}
2062
// Tear down a session that is now closed: drop the connection, wake
// waiters, release caps, fail/kick pending requests, and erase the entry.
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // Wake anything blocked on this session before tearing it down.
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s);
  kick_requests_closed(s);
  // Erasing destroys *s; the caller must not touch the session afterwards.
  mds_sessions.erase(s->mds_num);
}
2074
// Dispatch an incoming MClientSession message from an MDS: session
// open/close acks, cap renewals, staleness, recall, flush, RO and reject.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  // Ignore messages that don't match a session bound to this connection.
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse the session if the MDS lacks features we require, and
      // remember the rejection so we don't immediately retry.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
        lderr(cct) << "mds." << from << " lacks required features '"
                   << missing_features << "', closing session " << dendl;
        rejected_by_mds[session->mds_num] = session->addrs;
        _close_mds_session(session);
        _closed_mds_session(session);
        break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
        mount_cond.notify_all();
      else
        connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only extend the cap TTL if this ack matches our latest renew request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
        wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      // Log the MDS-supplied reason (if any) and remember the rejection.
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2168
2169bool Client::_any_stale_sessions() const
2170{
9f95a23c 2171 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2172
11fdf7f2
TL
2173 for (const auto &p : mds_sessions) {
2174 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2175 return true;
2176 }
2177 }
2178
2179 return false;
2180}
2181
2182void Client::_kick_stale_sessions()
2183{
11fdf7f2 2184 ldout(cct, 1) << __func__ << dendl;
7c673cae 2185
11fdf7f2
TL
2186 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2187 MetaSession &s = it->second;
2188 ++it;
2189 if (s.state == MetaSession::STATE_STALE)
2190 _closed_mds_session(&s);
7c673cae
FG
2191 }
2192}
2193
// (Re)build the wire request for this MetaRequest and send it on the
// given session. drop_cap_releases discards queued cap releases instead
// of attaching them (used before cap reconnect has been sent).
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // We already got an unsafe reply once: mark as a replayed op and
    // point at the inode the earlier reply gave us.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
        r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // First transmission of this request: record when it went out.
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap mseq we sent against, for later ESTALE handling.
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2242
// Translate an internal MetaRequest into the on-wire MClientRequest,
// filling in paths, payload, retry count and caller credentials.
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        // Build the path from the parent directory plus this dentry name.
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                    << " No path, inode, or appropriately-endowed dentry given!"
                    << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                  << " No path, inode, or dentry given!"
                  << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // Note: increments the request's retry counter as a side effect.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2280
2281
2282
// Handle an MDS telling us a request was forwarded to another rank:
// reset the request's routing state and wake the submitter to resend.
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  // The submitting thread is blocked on caller_cond; wake it to resend.
  request->caller_cond->notify_all();
}
2317
2318bool Client::is_dir_operation(MetaRequest *req)
2319{
2320 int op = req->get_op();
2321 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2322 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2323 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2324 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2325 return true;
2326 return false;
2327}
2328
11fdf7f2 2329void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2330{
2331 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2332 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2333 if (!session) {
7c673cae
FG
2334 return;
2335 }
2336
2337 ceph_tid_t tid = reply->get_tid();
2338 bool is_safe = reply->is_safe();
2339
2340 if (mds_requests.count(tid) == 0) {
11fdf7f2 2341 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2342 << " safe is:" << is_safe << dendl;
7c673cae
FG
2343 return;
2344 }
2345 MetaRequest *request = mds_requests.at(tid);
2346
11fdf7f2 2347 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2348 << " tid " << tid << dendl;
2349
2350 if (request->got_unsafe && !is_safe) {
2351 //duplicate response
2352 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2353 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2354 return;
2355 }
2356
2357 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2358 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2359 << " from mds." << request->mds << dendl;
2360 request->send_to_auth = true;
2361 request->resend_mds = choose_target_mds(request);
2362 Inode *in = request->inode();
11fdf7f2 2363 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2364 if (request->resend_mds >= 0 &&
2365 request->resend_mds == request->mds &&
2366 (in == NULL ||
11fdf7f2
TL
2367 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2368 request->sent_on_mseq == it->second.mseq)) {
2369 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae 2370 } else {
9f95a23c 2371 request->caller_cond->notify_all();
7c673cae
FG
2372 return;
2373 }
7c673cae
FG
2374 }
2375
11fdf7f2 2376 ceph_assert(!request->reply);
7c673cae
FG
2377 request->reply = reply;
2378 insert_trace(request, session);
2379
2380 // Handle unsafe reply
2381 if (!is_safe) {
2382 request->got_unsafe = true;
2383 session->unsafe_requests.push_back(&request->unsafe_item);
2384 if (is_dir_operation(request)) {
2385 Inode *dir = request->inode();
11fdf7f2 2386 ceph_assert(dir);
7c673cae
FG
2387 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2388 }
2389 if (request->target) {
2390 InodeRef &in = request->target;
2391 in->unsafe_ops.push_back(&request->unsafe_target_item);
2392 }
2393 }
2394
2395 // Only signal the caller once (on the first reply):
2396 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2397 if (!is_safe || !request->got_unsafe) {
9f95a23c 2398 ceph::condition_variable cond;
7c673cae
FG
2399 request->dispatch_cond = &cond;
2400
2401 // wake up waiter
11fdf7f2 2402 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2403 request->caller_cond->notify_all();
7c673cae
FG
2404
2405 // wake for kick back
9f95a23c
TL
2406 std::unique_lock l{client_lock, std::adopt_lock};
2407 cond.wait(l, [tid, request, &cond, this] {
2408 if (request->dispatch_cond) {
2409 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2410 << tid << " " << &cond << dendl;
2411 }
2412 return !request->dispatch_cond;
2413 });
2414 l.release();
7c673cae
FG
2415 }
2416
2417 if (is_safe) {
2418 // the filesystem change is committed to disk
2419 // we're done, clean up
2420 if (request->got_unsafe) {
2421 request->unsafe_item.remove_myself();
2422 request->unsafe_dir_item.remove_myself();
2423 request->unsafe_target_item.remove_myself();
2424 signal_cond_list(request->waitfor_safe);
2425 }
2426 request->item.remove_myself();
2427 unregister_request(request);
2428 }
2429 if (unmounting)
9f95a23c 2430 mount_cond.notify_all();
7c673cae
FG
2431}
2432
2433void Client::_handle_full_flag(int64_t pool)
2434{
2435 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2436 << "on " << pool << dendl;
2437 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2438 // to do this rather than blocking, because otherwise when we fill up we
2439 // potentially lock caps forever on files with dirty pages, and we need
2440 // to be able to release those caps to the MDS so that it can delete files
2441 // and free up space.
2442 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2443
2444 // For all inodes with layouts in this pool and a pending flush write op
2445 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2446 // from ObjectCacher so that it doesn't re-issue the write in response to
2447 // the ENOSPC error.
2448 // Fortunately since we're cancelling everything in a given pool, we don't
2449 // need to know which ops belong to which ObjectSet, we can just blow all
2450 // the un-flushed cached data away and mark any dirty inodes' async_err
2451 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2452 // affecting this pool, and all the objectsets we're purging were also
2453 // in this pool.
2454 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2455 i != inode_map.end(); ++i)
2456 {
2457 Inode *inode = i->second;
2458 if (inode->oset.dirty_or_tx
2459 && (pool == -1 || inode->layout.pool_id == pool)) {
2460 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2461 << " has dirty objects, purging and setting ENOSPC" << dendl;
2462 objectcacher->purge_set(&inode->oset);
2463 inode->set_async_err(-ENOSPC);
2464 }
2465 }
2466
2467 if (cancelled_epoch != (epoch_t)-1) {
2468 set_cap_epoch_barrier(cancelled_epoch);
2469 }
2470}
2471
// React to a new OSDMap: detect whether this client has been blacklisted
// (or un-blacklisted), and propagate pool/global FULL flags by cancelling
// outstanding writes.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blacklisted) {
    // Check each of our own addresses against the new blacklist entries.
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
        new_blacklist = true;
        break;
      }
      if (prenautilus) {
        // ...except pre-nautilus, they were TYPE_LEGACY
        a.set_type(entity_addr_t::TYPE_LEGACY);
        if (new_blacklists.count(a)) {
          new_blacklist = true;
          break;
        }
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    // All MDS sessions are doomed once we are blacklisted.
    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
        return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool (-1 = all).
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
        for (const auto& kv : o.get_pools()) {
          if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
            full_pools.push_back(kv.first);
          }
        }
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2555
2556
2557// ------------------------
2558// incoming messages
2559
2560
// Messenger entry point: route each incoming message to its handler
// under client_lock. Returns false for message types we do not consume.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
  // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

  // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Only MDS command replies are ours; others belong to another dispatcher.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, every processed message is a chance to trim the
  // cache further; poke unmount() whenever the trim made progress.
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2642
11fdf7f2 2643void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2644{
2645 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2646
2647 signal_cond_list(waiting_for_fsmap);
2648
2649 monclient->sub_got("fsmap", fsmap->get_epoch());
2650}
2651
11fdf7f2 2652void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2653{
2654 fsmap_user.reset(new FSMapUser);
2655 *fsmap_user = m->get_fsmap();
7c673cae
FG
2656
2657 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2658 signal_cond_list(waiting_for_fsmap);
2659}
2660
// Apply a new MDSMap: cancel commands to vanished/laggy MDSs, then walk
// every session and react to each rank's address/state transition
// (reconnect, close, kick requests, etc.).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // Old or duplicate map; nothing to do.
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // Keep the previous map around so we can compare per-rank states.
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // Advance before any branch that may erase the session.
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // The rank moved to a new address (possibly a new daemon instance).
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      // The MDS passed reconnect without us participating: our session
      // is unrecoverable; close it.
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2762
// Rebuild our session state on an MDS entering RECONNECT: resend unsafe
// requests, early-kick flushing caps, and send MClientReconnect message(s)
// describing every cap and snaprealm we hold from that rank.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Any queued cap-release message is now moot.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // Newer MDSs allow the reconnect payload to be split across messages.
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // If the message is getting too large and splitting is allowed,
      // flush what we have and start a continuation message.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2856
2857
2858void Client::kick_requests(MetaSession *session)
2859{
11fdf7f2 2860 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2861 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2862 p != mds_requests.end();
2863 ++p) {
31f18b77
FG
2864 MetaRequest *req = p->second;
2865 if (req->got_unsafe)
2866 continue;
2867 if (req->aborted()) {
2868 if (req->caller_cond) {
2869 req->kick = true;
9f95a23c 2870 req->caller_cond->notify_all();
31f18b77 2871 }
7c673cae 2872 continue;
31f18b77
FG
2873 }
2874 if (req->retry_attempt > 0)
7c673cae 2875 continue; // new requests only
31f18b77 2876 if (req->mds == session->mds_num) {
7c673cae
FG
2877 send_request(p->second, session);
2878 }
2879 }
2880}
2881
2882void Client::resend_unsafe_requests(MetaSession *session)
2883{
2884 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2885 !iter.end();
2886 ++iter)
2887 send_request(*iter, session);
2888
2889 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2890 // process completed requests in clientreplay stage.
2891 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2892 p != mds_requests.end();
2893 ++p) {
2894 MetaRequest *req = p->second;
2895 if (req->got_unsafe)
2896 continue;
31f18b77
FG
2897 if (req->aborted())
2898 continue;
7c673cae
FG
2899 if (req->retry_attempt == 0)
2900 continue; // old requests only
2901 if (req->mds == session->mds_num)
2902 send_request(req, session, true);
2903 }
2904}
2905
2906void Client::wait_unsafe_requests()
2907{
2908 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2
TL
2909 for (const auto &p : mds_sessions) {
2910 const MetaSession &s = p.second;
2911 if (!s.unsafe_requests.empty()) {
2912 MetaRequest *req = s.unsafe_requests.back();
7c673cae
FG
2913 req->get();
2914 last_unsafe_reqs.push_back(req);
2915 }
2916 }
2917
2918 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2919 p != last_unsafe_reqs.end();
2920 ++p) {
2921 MetaRequest *req = *p;
2922 if (req->unsafe_item.is_on_list())
2923 wait_on_list(req->waitfor_safe);
2924 put_request(req);
2925 }
2926}
2927
2928void Client::kick_requests_closed(MetaSession *session)
2929{
11fdf7f2 2930 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2931 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2932 p != mds_requests.end(); ) {
2933 MetaRequest *req = p->second;
2934 ++p;
2935 if (req->mds == session->mds_num) {
2936 if (req->caller_cond) {
2937 req->kick = true;
9f95a23c 2938 req->caller_cond->notify_all();
7c673cae
FG
2939 }
2940 req->item.remove_myself();
2941 if (req->got_unsafe) {
11fdf7f2 2942 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 2943 req->unsafe_item.remove_myself();
eafe8130
TL
2944 if (is_dir_operation(req)) {
2945 Inode *dir = req->inode();
2946 assert(dir);
2947 dir->set_async_err(-EIO);
2948 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2949 << dir->ino << " " << req->get_tid() << dendl;
2950 req->unsafe_dir_item.remove_myself();
2951 }
2952 if (req->target) {
2953 InodeRef &in = req->target;
2954 in->set_async_err(-EIO);
2955 lderr(cct) << "kick_requests_closed drop req of inode : "
2956 << in->ino << " " << req->get_tid() << dendl;
2957 req->unsafe_target_item.remove_myself();
2958 }
7c673cae
FG
2959 signal_cond_list(req->waitfor_safe);
2960 unregister_request(req);
2961 }
2962 }
2963 }
11fdf7f2
TL
2964 ceph_assert(session->requests.empty());
2965 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
2966}
2967
2968
2969
2970
2971/************
2972 * leases
2973 */
2974
2975void Client::got_mds_push(MetaSession *s)
2976{
2977 s->seq++;
2978 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2979 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 2980 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2981 }
2982}
2983
// Handle a dentry-lease revoke pushed by an MDS: invalidate the local
// lease (if we still hold the dentry) and always send a LEASE_RELEASE
// acknowledgement back on the same connection.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // revoke is the only lease action the MDS sends us
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // stale/unknown session: ignore entirely (no ack without a session)
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // we no longer cache the inode; nothing local to invalidate
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // forget which mds issued the lease; it is no longer valid
    dn->lease_mds = -1;
  }

 revoke:
  {
    // always ack the revoke, echoing the original lease parameters
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
					    m->get_mask(), m->get_ino(),
					    m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3026
// Drop n references on an inode; on the last reference, tear the inode
// down: release caps, purge the object cache set, remove it from the
// inode map (and faked-ino map), clear root bookkeeping, and delete it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // no dirty buffers may remain at this point
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // the root inode is going away; reset all root-related state
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3052
3053void Client::close_dir(Dir *dir)
3054{
3055 Inode *in = dir->parent_inode;
11fdf7f2
TL
3056 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3057 ceph_assert(dir->is_empty());
3058 ceph_assert(in->dir == dir);
3059 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3060 if (!in->dentries.empty())
7c673cae
FG
3061 in->get_first_parent()->put(); // unpin dentry
3062
3063 delete in->dir;
3064 in->dir = 0;
3065 put_inode(in); // unpin inode
3066}
3067
 /**
 * Link inode 'in' under dentry 'name' in directory 'dir'.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the (new or reused) Dentry. If 'in' is a directory that is
 * already linked elsewhere, the old link is removed first: directories
 * may have only one parent.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry: it must not already point at an inode
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      // the old link must not be the very dentry we are creating
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent directory's contents changed; invalidate its
      // cached completeness/ordering
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3108
// Detach a dentry from its inode and (unless keepdentry) from its
// directory, destroying the dentry. If the directory becomes empty and
// keepdir is false, the Dir object is closed too.
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a ref so the inode survives while we log/detach below
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the dentry as a negative entry, but drop its lease
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3138
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Completion context queued by Client::_flush(); holds an InodeRef so
 * the inode outlives the in-flight flush.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // must run under the client lock (it mutates inode state)
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      // record the error on the inode so a later fsync/close can report it
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3160
3161
3162/****
3163 * caps
3164 */
3165
3166void Client::get_cap_ref(Inode *in, int cap)
3167{
3168 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3169 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3170 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3171 in->get();
3172 }
3173 if ((cap & CEPH_CAP_FILE_CACHE) &&
3174 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3175 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3176 in->get();
3177 }
3178 in->get_cap_ref(cap);
3179}
3180
// Drop a cap reference; 'last' holds the cap bits whose refcount hit
// zero. Dropping the last WR/BUFFER/CACHE refs triggers cap-snap
// completion, waiter wakeups, a cap check, and inode unpinning.
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we held a ref on but the MDS no longer issues to us
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// last writer is done; the pending cap_snap can be finalized
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// all buffered data has been written out; cap snaps no longer
	// have dirty data to wait for
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      // matches the in->get() calls made in get_cap_ref()
      put_inode(in, put_nref);
  }
}
3214
// Acquire cap references for a file operation: block until 'need' caps
// are issued (plus as much of 'want' as is not being revoked), the
// max_size covers 'endoff', and no conflicting cap_snap is in flight.
// On success *phave is set to the granted caps and refs are taken.
// Returns 0, or -EBADF / -EROFS / an error from cap renewal.
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // if the open-file modes no longer cover 'need', the fd is unusable
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a larger max_size when the write end offset
      // approaches or exceeds current limits
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	// cannot write past max_size until the MDS grants more
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	if (in->cap_snaps.rbegin()->second.writing) {
	  // another writer still holds the pending cap_snap open
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	// any cap_snap with dirty buffered data must be flushed first
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	// grant only if nothing we 'want' is currently being revoked
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    // cannot obtain write caps from a read-only session
    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. by the MDS); re-request them before
      // waiting, otherwise we could block forever
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3307
3308int Client::get_caps_used(Inode *in)
3309{
3310 unsigned used = in->caps_used();
3311 if (!(used & CEPH_CAP_FILE_CACHE) &&
3312 !objectcacher->set_is_empty(&in->oset))
3313 used |= CEPH_CAP_FILE_CACHE;
3314 return used;
3315}
3316
3317void Client::cap_delay_requeue(Inode *in)
3318{
11fdf7f2 3319 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3320 in->hold_caps_until = ceph_clock_now();
3321 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3322 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3323}
3324
// Build and send a CEPH_CAP_OP_UPDATE message for one cap: report used/
// wanted/flushed bits and current inode metadata, and locally shrink
// issued/implemented to what we retain. Mutates cap state as a side
// effect (including the deliberate client_inject_release_failure path).
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      int flags, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  // never retain bits that are in the middle of being revoked
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    // flushed dirty metadata applies after this snap sequence
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
                                   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    // flushing dirty xattrs: include the full xattr map
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // advertise a pending (not yet flushed) cap_snap so the MDS knows a
  // FLUSHSNAP will follow
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS manages max_size
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3437
31f18b77
FG
3438static bool is_max_size_approaching(Inode *in)
3439{
3440 /* mds will adjust max size according to the reported size */
3441 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3442 return false;
3443 if (in->size >= in->max_size)
3444 return true;
3445 /* half of previous max_size increment has been used */
3446 if (in->max_size > in->reported_size &&
3447 (in->size << 1) >= in->max_size + in->reported_size)
3448 return true;
3449 return false;
3450}
7c673cae 3451
11fdf7f2
TL
3452static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3453{
3454 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3455 return used;
3456 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3457 return used;
3458
3459 if (issued & CEPH_CAP_FILE_LAZYIO) {
3460 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3461 used &= ~CEPH_CAP_FILE_CACHE;
3462 used |= CEPH_CAP_FILE_LAZYIO;
3463 }
3464 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3465 used &= ~CEPH_CAP_FILE_BUFFER;
3466 used |= CEPH_CAP_FILE_LAZYIO;
3467 }
3468 } else {
3469 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3470 used &= ~CEPH_CAP_FILE_CACHE;
3471 used |= CEPH_CAP_FILE_LAZYIO;
3472 }
3473 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3474 used &= ~CEPH_CAP_FILE_BUFFER;
3475 used |= CEPH_CAP_FILE_LAZYIO;
3476 }
3477 }
3478 return used;
3479}
3480
7c673cae
FG
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // remember pre-LAZYIO-adjustment usage; the FILE_BUFFER test below
  // must see the raw value
  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to satisfy a CACHE/LAZYIO revoke immediately by releasing clean
  // cached data (only safe when no buffered writes are outstanding)
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  // examine each cap (one per MDS) and decide to ack, flush, delay or
  // leave it alone
  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    // usage counted against a non-auth cap excludes bits the auth MDS
    // has issued
    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need to request a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // on unmount, proactively return idle caps
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    // on the auth cap, also push any stalled flushes and pending snaps
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		       << " to mds." << mds << dendl;
	kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.flush_tid == 0)
	flush_snaps(in);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
	     flushing, flush_tid);
  }
}
3626
3627
// Capture the inode's current (dirty) state as a CapSnap keyed by the
// old snap context's seq, so it can later be flushed to the MDS. A
// no-op if a pending cap_snap already exists or nothing is dirty.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // a writer is still filling the newest cap_snap; don't stack another
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
            (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // construct the CapSnap in place, keyed by the old snap seq
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered data must be flushed before FLUSHSNAP
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata as of this moment
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // writer in progress; finish_cap_snap() runs when it drops WR
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3669
// Finalize a CapSnap once writers are done: record the final file
// metadata, then either flush it now or wait for buffered data
// (FILE_BUFFER) to be written back first.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data outstanding: _flushed_cap_snap()/put_cap_ref() will
    // trigger the flush once writeback completes
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3700
3701void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3702{
11fdf7f2 3703 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
7c673cae
FG
3704 in->cap_snaps.at(seq).dirty_data = 0;
3705 flush_snaps(in);
3706}
3707
eafe8130
TL
3708void Client::send_flush_snap(Inode *in, MetaSession *session,
3709 snapid_t follows, CapSnap& capsnap)
3710{
9f95a23c
TL
3711 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3712 in->ino, in->snaprealm->ino, 0,
3713 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
3714 m->caller_uid = capsnap.cap_dirtier_uid;
3715 m->caller_gid = capsnap.cap_dirtier_gid;
3716
3717 m->set_client_tid(capsnap.flush_tid);
3718 m->head.snap_follows = follows;
3719
3720 m->head.caps = capsnap.issued;
3721 m->head.dirty = capsnap.dirty;
3722
3723 m->head.uid = capsnap.uid;
3724 m->head.gid = capsnap.gid;
3725 m->head.mode = capsnap.mode;
3726 m->btime = capsnap.btime;
3727
3728 m->size = capsnap.size;
3729
3730 m->head.xattr_version = capsnap.xattr_version;
3731 encode(capsnap.xattrs, m->xattrbl);
3732
3733 m->ctime = capsnap.ctime;
3734 m->btime = capsnap.btime;
3735 m->mtime = capsnap.mtime;
3736 m->atime = capsnap.atime;
3737 m->time_warp_seq = capsnap.time_warp_seq;
3738 m->change_attr = capsnap.change_attr;
3739
3740 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3741 m->inline_version = in->inline_version;
3742 m->inline_data = in->inline_data;
3743 }
3744
3745 ceph_assert(!session->flushing_caps_tids.empty());
3746 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3747
3748 session->con->send_message2(std::move(m));
3749}
3750
// Send FLUSHSNAP for every cap_snap that has not been flushed yet, in
// snap-seq order, stopping at the first one that still has dirty data
// or an active writer (later snaps must wait so ordering is preserved).
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // break (not continue): an unflushable snap blocks all later ones
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush tid on both the session and the inode
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3785
// Block the caller (which must hold client_lock) until its condition
// variable on 'ls' is signalled via signal_cond_list(). The adopt_lock/
// release dance lets the already-held client_lock be used by the wait
// without this function taking or dropping ownership of it.
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();  // leave client_lock held for the caller
  ls.remove(&cond);
}
3795
9f95a23c 3796void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 3797{
9f95a23c
TL
3798 for (auto cond : ls) {
3799 cond->notify_all();
3800 }
7c673cae
FG
3801}
3802
// Block the caller (which must hold client_lock) until a C_Cond queued
// on 'ls' is completed via signal_context_list(). Uses the same
// adopt_lock/release pattern as wait_on_list().
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // completion result; not inspected here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();  // leave client_lock held for the caller
}
3813
3814void Client::signal_context_list(list<Context*>& ls)
3815{
3816 while (!ls.empty()) {
3817 ls.front()->complete(0);
3818 ls.pop_front();
3819 }
3820}
3821
// Wake all waiters on every inode with a cap on this session. On
// reconnect, also reset max_size request state; otherwise downgrade any
// cap the MDS failed to re-issue (stale generation) to PIN only.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      // max_size negotiation restarts from scratch after reconnect
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3841
3842
3843// flush dirty data (from objectcache)
3844
// Context queued on async_ino_invalidator to deliver an inode cache
// invalidation upcall (ino_invalidate_cb) outside the client lock.
// Captures the (possibly faked) vinodeno at construction time.
class C_Client_CacheInvalidate : public  Context  {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // record the externally visible ino now; the Inode may be gone later
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
3864
/**
 * Deliver one queued cache-invalidation to the registered callback.
 * Runs on the async_ino_invalidator finisher thread, outside client_lock.
 * Skipped entirely once unmount has begun.
 */
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3872
// Queue an async invalidation of [off, off+len) for @in on the finisher
// thread; no-op if no invalidation callback is registered.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3879
3880void Client::_invalidate_inode_cache(Inode *in)
3881{
11fdf7f2 3882 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
3883
3884 // invalidate our userspace inode cache
94b18763 3885 if (cct->_conf->client_oc) {
7c673cae 3886 objectcacher->release_set(&in->oset);
94b18763
FG
3887 if (!objectcacher->set_is_empty(&in->oset))
3888 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3889 }
7c673cae
FG
3890
3891 _schedule_invalidate_callback(in, 0, 0);
3892}
3893
/**
 * Drop cached data for the byte range [off, off+len) of @in, discarding
 * even dirty/writeback buffers in the object cacher, then schedule the
 * corresponding invalidation callback for the same range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto its striped RADOS object extents
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3907
3908bool Client::_release(Inode *in)
3909{
3910 ldout(cct, 20) << "_release " << *in << dendl;
3911 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3912 _invalidate_inode_cache(in);
3913 return true;
3914 }
3915 return false;
3916}
3917
/**
 * Flush dirty object-cacher data for @in.
 *
 * @param onfinish completed immediately (0) when nothing is dirty, with
 *                 -ENOSPC when the pool is full and the data is purged,
 *                 or by the object cacher when the flush finishes.
 * @return true if the flush already completed (onfinish was consumed),
 *         false if it is in progress and onfinish will fire later.
 */
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    // pool is full: writing back would block forever, so drop the data
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3939
3940void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3941{
9f95a23c 3942 ceph_assert(ceph_mutex_is_locked(client_lock));
7c673cae
FG
3943 if (!in->oset.dirty_or_tx) {
3944 ldout(cct, 10) << " nothing to flush" << dendl;
3945 return;
3946 }
3947
11fdf7f2 3948 C_SaferCond onflush("Client::_flush_range flock");
7c673cae 3949 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
11fdf7f2 3950 offset, size, &onflush);
7c673cae
FG
3951 if (!ret) {
3952 // wait for flush
9f95a23c 3953 client_lock.unlock();
11fdf7f2 3954 onflush.wait();
9f95a23c 3955 client_lock.lock();
7c673cae
FG
3956 }
3957}
3958
/**
 * ObjectCacher flush-completion hook: forwards to _flushed() for the
 * owning inode.  client_lock is already held by the dispatch path.
 */
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(ceph_mutex_is_locked(client_lock));   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3967
// All dirty data for @in has hit the OSDs: drop the cap references that
// were pinning the cache/buffer caps during writeback.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3974
3975
3976
3977// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3978void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3979{
3980 unsigned had = in->caps_issued();
3981
3982 if ((issued & CEPH_CAP_FILE_CACHE) &&
3983 !(had & CEPH_CAP_FILE_CACHE))
3984 in->cache_gen++;
3985
3986 if ((issued & CEPH_CAP_FILE_SHARED) &&
3987 !(had & CEPH_CAP_FILE_SHARED)) {
3988 in->shared_gen++;
3989
3990 if (in->is_dir())
3991 clear_dir_complete_and_ordered(in, true);
3992 }
3993}
3994
/**
 * Add a new cap for @in from @mds_session, or update the existing one.
 *
 * Also maintains snaprealm membership (first cap opens the realm; an auth
 * cap with a different realm moves the inode), merges stale sequence data
 * when an out-of-order message is detected, and may promote this cap to
 * auth_cap.  Waiters on in->waitfor_caps are signalled when new cap bits
 * are granted.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap: join the snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  // emplace: constructs a fresh Cap only if none exists for this mds
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // existing cap: if its gen is stale, it was never re-issued
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
	ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      // keep the newer (already-recorded) sequence state
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as auth_cap if it is newer (by migrate seq)
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // newer migrate seq replaces wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4091
/**
 * Remove @cap from its inode and session.
 *
 * @param queue_release when true, queue a cap-release message so the MDS
 *                      learns the cap is gone.
 * Clears auth_cap (and flushing-list membership) if this was the auth
 * cap; when the last cap goes, the inode leaves its snaprealm.
 * NOTE: @cap is destroyed by the map erase — it must not be used after.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys the Cap object itself
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4127
4128void Client::remove_all_caps(Inode *in)
4129{
4130 while (!in->caps.empty())
11fdf7f2 4131 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4132}
4133
/**
 * Tear down every cap held through session @s (e.g. on session close).
 *
 * Caps are removed without queueing releases; dirty/flushing state is
 * discarded with an error log, cap snaps are dropped, and all waiters
 * (including syncfs waiters on sync_cond) are woken so they can notice.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    // hold a ref so the inode survives remove_cap()
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // bitwise-or of cap masks collapsed into a "has any" flag
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the ref that dirty caps were holding on the inode
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4166
91327a77 4167int Client::_do_remount(bool retry_on_error)
b32b8144 4168{
11fdf7f2 4169 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4170
b32b8144
FG
4171 errno = 0;
4172 int r = remount_cb(callback_handle);
91327a77
AA
4173 if (r == 0) {
4174 retries_on_invalidate = 0;
4175 } else {
b32b8144
FG
4176 int e = errno;
4177 client_t whoami = get_nodeid();
4178 if (r == -1) {
4179 lderr(cct) <<
4180 "failed to remount (to trim kernel dentries): "
4181 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4182 } else {
4183 lderr(cct) <<
4184 "failed to remount (to trim kernel dentries): "
4185 "return code = " << r << dendl;
4186 }
91327a77 4187 bool should_abort =
11fdf7f2
TL
4188 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4189 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4190 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4191 if (should_abort && !unmounting) {
4192 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4193 ceph_abort();
4194 }
4195 }
4196 return r;
4197}
4198
7c673cae
FG
/**
 * Finisher context that triggers a remount (with retries enabled) on the
 * remount_finisher thread; see Client::_invalidate_kernel_dcache().
 */
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};
4209
/**
 * Ask the kernel to drop its cached dentries for this mount.
 *
 * Preferred path: per-dentry invalidation callbacks for children of the
 * root (deeper entries are handled elsewhere).  Fallback: queue a
 * remount, which causes the kernel to trim all unused dentries.
 * No-op once unmounting.
 */
void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
	   p != root->dir->dentries.end();
	   ++p) {
	if (p->second->inode)
	  _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4229
91327a77
AA
/**
 * If every dentry under directory @in is negative (no inode), unlink the
 * expireable ones and close the dir when it empties.  Recurses into the
 * snapdir when one is open so its negative children are trimmed too.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // only act when the directory consists solely of null dentries
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can invalidate the iterator
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4254
/**
 * Try to reduce the number of caps held through session @s down to @max.
 *
 * Disposable non-auth caps are removed outright.  For other inodes, the
 * expireable dentries referencing them are collected and trimmed after
 * the traversal (deferred so trimming one cap cannot invalidate another
 * we are still iterating over).  If still over budget at the end, fall
 * back to invalidating the kernel dcache.
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
	Dentry *dn = *q;
	++q;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // count the cap as trimmed only when every dentry was expireable
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4322
4323void Client::force_session_readonly(MetaSession *s)
4324{
4325 s->readonly = true;
4326 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4327 auto &in = (*p)->inode;
4328 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4329 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4330 }
4331}
4332
7c673cae
FG
/**
 * Move @in's dirty cap bits into the flushing state under a new flush
 * tid.
 *
 * @param ptid out: the flush tid allocated for this batch.
 * @return the cap bits now being flushed.
 * Registers the inode and tid on the auth session's flushing lists and
 * maintains num_flushing_caps.
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();  // dirty bits are now tracked as flushing

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4360
/**
 * Transfer @in's in-flight flush bookkeeping from session @old_s to
 * @new_s (used when the auth MDS for the inode changes): move every
 * pending capsnap flush tid and cap flush tid, then re-home the inode on
 * the new session's flushing list.
 */
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s,  MetaSession *new_s)
{
  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end();
       ++it) {
    old_s->flushing_caps_tids.erase(it->first);
    new_s->flushing_caps_tids.insert(it->first);
  }
  // push_back implicitly removes the item from old_s's list first
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
4378
4379/*
4380 * Flush all caps back to the MDS. Because the callers generally wait on the
4381 * result of this function (syncfs and umount cases), we set
4382 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4383 */
4384void Client::flush_caps_sync()
4385{
4386 ldout(cct, 10) << __func__ << dendl;
28e407b8 4387 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4388 while (!p.end()) {
4389 unsigned flags = CHECK_CAPS_NODELAY;
4390 Inode *in = *p;
4391
4392 ++p;
28e407b8
AA
4393 delayed_list.pop_front();
4394 if (p.end() && dirty_list.empty())
7c673cae
FG
4395 flags |= CHECK_CAPS_SYNCHRONOUS;
4396 check_caps(in, flags);
4397 }
4398
4399 // other caps, too
28e407b8 4400 p = dirty_list.begin();
7c673cae
FG
4401 while (!p.end()) {
4402 unsigned flags = CHECK_CAPS_NODELAY;
4403 Inode *in = *p;
4404
4405 ++p;
4406 if (p.end())
4407 flags |= CHECK_CAPS_SYNCHRONOUS;
4408 check_caps(in, flags);
4409 }
4410}
4411
7c673cae
FG
/**
 * Block until every cap flush of @in with tid <= @want has been acked by
 * the MDS.  wait_on_list() drops and re-takes client_lock while waiting;
 * the oldest outstanding tid is re-examined after each wakeup.
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything up to 'want' has been flushed
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4425
/**
 * Block until no session has an outstanding cap-flush tid <= @want.
 * Sleeps on sync_cond (client_lock is adopted/released around the wait)
 * and rescans all sessions from scratch after every wakeup, since the
 * session set may have changed.
 */
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want  << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();  // caller's client_lock is still held
      goto retry;
    }
  }
}
4446
eafe8130
TL
4447void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4448{
4449 in->flags &= ~I_KICK_FLUSH;
4450
4451 Cap *cap = in->auth_cap;
4452 ceph_assert(cap->session == session);
4453
4454 ceph_tid_t last_snap_flush = 0;
4455 for (auto p = in->flushing_cap_tids.rbegin();
4456 p != in->flushing_cap_tids.rend();
4457 ++p) {
4458 if (!p->second) {
4459 last_snap_flush = p->first;
4460 break;
4461 }
4462 }
4463
4464 int wanted = in->caps_wanted();
4465 int used = get_caps_used(in) | in->caps_dirty();
4466 auto it = in->cap_snaps.begin();
4467 for (auto& p : in->flushing_cap_tids) {
4468 if (p.second) {
4469 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4470 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4471 p.second, p.first);
4472 } else {
4473 ceph_assert(it != in->cap_snaps.end());
4474 ceph_assert(it->second.flush_tid == p.first);
4475 send_flush_snap(in, session, it->first, it->second);
4476 ++it;
4477 }
4478 }
4479}
4480
7c673cae
FG
/**
 * Re-send pending cap flushes for every inode on @session's flushing
 * list that was marked I_KICK_FLUSH (see early_kick_flushing_caps()).
 */
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4494
/**
 * During session reconnect, re-send cap flushes whose caps were revoked
 * before the session dropped; flushes whose caps are still fully issued
 * are only marked I_KICK_FLUSH and re-sent later by
 * kick_flushing_caps(MetaSession*).
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4522
7c673cae
FG
/**
 * Rebuild this realm's cached SnapContext: the union of prior-parent
 * snaps, the current parent's snaps at/after parent_since, and the
 * realm's own snaps, with seq = max over self and parent.  The snap list
 * is stored in descending order (newest first), as RADOS expects.
 */
void SnapRealm::build_snap_context()
{
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // start with prior_parents?
  for (unsigned i=0; i<prior_parent_snaps.size(); i++)
    snaps.insert(prior_parent_snaps[i]);

  // current parent's snaps
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (unsigned i=0; i<psnapc.snaps.size(); i++)
      if (psnapc.snaps[i] >= parent_since)
	snaps.insert(psnapc.snaps[i]);
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (unsigned i=0; i<my_snaps.size(); i++)
    snaps.insert(my_snaps[i]);

  // ok!
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.resize(0);
  cached_snap_context.snaps.reserve(snaps.size());
  // reverse iteration over the set yields descending snapids
  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
    cached_snap_context.snaps.push_back(*p);
}
4553
4554void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4555{
4556 list<SnapRealm*> q;
4557 q.push_back(realm);
4558
4559 while (!q.empty()) {
4560 realm = q.front();
4561 q.pop_front();
4562
11fdf7f2 4563 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4564 realm->invalidate_cache();
4565
4566 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4567 p != realm->pchildren.end();
4568 ++p)
4569 q.push_back(*p);
4570 }
4571}
4572
4573SnapRealm *Client::get_snap_realm(inodeno_t r)
4574{
4575 SnapRealm *realm = snap_realms[r];
4576 if (!realm)
4577 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4578 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4579 realm->nref++;
4580 return realm;
4581}
4582
4583SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4584{
4585 if (snap_realms.count(r) == 0) {
11fdf7f2 4586 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4587 return NULL;
4588 }
4589 SnapRealm *realm = snap_realms[r];
11fdf7f2 4590 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4591 realm->nref++;
4592 return realm;
4593}
4594
/**
 * Drop one reference on @realm; on the last ref, unlink it from the
 * realm map and from its parent (recursively dropping the parent ref)
 * and delete it.
 */
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);  // may cascade up the tree
    }
    delete realm;
  }
}
4608
/**
 * Re-parent @realm under ino @parent if it changed.
 *
 * @return true when the parent actually changed (caller should
 *         invalidate the realm's snap context), false otherwise.
 */
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
		   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      // detach from (and unref) the old parent realm
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);  // takes a ref
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
4625
4626static bool has_new_snaps(const SnapContext& old_snapc,
4627 const SnapContext& new_snapc)
4628{
4629 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4630}
4631
4632
/**
 * Apply a snap trace (a sequence of encoded SnapRealmInfo) from the MDS.
 *
 * @param bl        encoded trace.
 * @param realm_ret if non-null, receives the first realm in the trace
 *                  with a reference held (caller must put_snap_realm);
 *                  otherwise the reference is dropped here.
 * @param flush     when true, inodes in realms whose snap set grew are
 *                  queued for cap-snap writeback using the PRE-update
 *                  snap context.
 */
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  // realms whose snap context changed, keyed with their old context
  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
	       << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;  // keep alive until the flush pass below
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
	       << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;  // keep the ref for realm_ret / final put
    else
      put_snap_realm(realm);
  }

  // queue cap-snap writeback for realms that gained snaps
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4725
/**
 * Handle an MClientSnap message from an MDS.
 *
 * For a SPLIT op, inodes listed in split_inos are detached from their
 * current realm (remembering the old snap context), child realms are
 * re-parented, the snap trace is applied, and the inodes are then moved
 * into the new realm — queueing cap-snap writeback where the move
 * exposed new snaps.  Other ops only apply the snap trace (without the
 * pre-update flush for DESTROY).
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inode -> its pre-move snap context
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // DESTROY skips the pre-update dirty-cap flush
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;  // one ref per member inode
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);  // drop the lookup ref taken above
  }
}
4796
11fdf7f2 4797void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4798{
4799 mds_rank_t mds = mds_rank_t(m->get_source().num());
4800 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4801 if (!session) {
7c673cae
FG
4802 return;
4803 }
4804
4805 got_mds_push(session);
4806
11fdf7f2 4807 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4808
4809 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4810 if (inode_map.count(vino)) {
4811 Inode *in = NULL;
4812 in = inode_map[vino];
4813
4814 if (in) {
4815 in->quota = m->quota;
4816 in->rstat = m->rstat;
4817 }
4818 }
7c673cae
FG
4819}
4820
11fdf7f2 4821void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
4822{
4823 mds_rank_t mds = mds_rank_t(m->get_source().num());
4824 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4825 if (!session) {
7c673cae
FG
4826 return;
4827 }
4828
4829 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4830 // Pause RADOS operations until we see the required epoch
4831 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4832 }
4833
4834 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4835 // Record the barrier so that we will transmit it to MDS when releasing
4836 set_cap_epoch_barrier(m->osd_epoch_barrier);
4837 }
4838
4839 got_mds_push(session);
4840
11fdf7f2 4841 Inode *in;
7c673cae 4842 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
4843 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4844 in = it->second;
4845 } else {
7c673cae 4846 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 4847 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
4848 session->enqueue_cap_release(
4849 m->get_ino(),
4850 m->get_cap_id(),
4851 m->get_seq(),
4852 m->get_mseq(),
4853 cap_epoch_barrier);
4854 } else {
11fdf7f2 4855 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 4856 }
7c673cae
FG
4857
4858 // in case the mds is waiting on e.g. a revocation
4859 flush_cap_releases();
4860 return;
4861 }
4862
4863 switch (m->get_op()) {
11fdf7f2
TL
4864 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4865 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4866 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
7c673cae
FG
4867 }
4868
11fdf7f2
TL
4869 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4870 Cap &cap = in->caps.at(mds);
7c673cae 4871
11fdf7f2
TL
4872 switch (m->get_op()) {
4873 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4874 case CEPH_CAP_OP_IMPORT:
4875 case CEPH_CAP_OP_REVOKE:
4876 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4877 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4878 }
4879 } else {
4880 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4881 return;
7c673cae
FG
4882 }
4883}
4884
/**
 * Handle a cap IMPORT: another MDS has migrated this inode's cap to the
 * sending MDS. Record the new (auth) cap, drop the old peer cap if it
 * matches the migration info, and restart any in-flight cap flushes on the
 * new auth session.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // Remember the cap we hold from the exporting MDS (if any) so we can
  // remove it below and carry its credentials over to the new cap.
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // Drop the exporting MDS's cap only if it is still the one referenced by
  // the migration record (it may have been replaced concurrently).
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
4920
/**
 * Handle a cap EXPORT: the sending MDS is giving up this inode's cap,
 * either migrating it to a peer MDS (m->peer.cap_id != 0) or dropping it
 * outright. In the migration case we pre-create/merge the cap on the peer
 * session so the subsequent IMPORT finds consistent state.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Only act if the export refers to the cap we actually hold.
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// Cap is migrating to peer_mds; make sure we have a session there.
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // Merge into the existing peer cap, but only if the migration
	  // info is newer than what that cap already carries.
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    // auth-ness follows the exported cap
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // No cap on the peer yet: create one carrying our issued bits.
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// Cap dropped without migration; remember that we lost wanted/issued
	// caps so later logic can re-request them.
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4965
/**
 * Handle a cap TRUNC: the MDS informs us of a new file size / truncation.
 * Delegates the actual size/truncate_seq bookkeeping (and any local cache
 * invalidation) to update_inode_file_size().
 */
void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));

  ldout(cct, 10) << __func__ << " on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  // NOTE(review): caps_issued() fills its out-parameter with the
  // *implemented* cap set, so 'issued' here is implemented|dirty — this
  // mirrors handle_cap_grant; confirm against Inode::caps_issued() if in
  // doubt.
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
			 m->get_truncate_seq(), m->get_truncate_size());
}
4981
11fdf7f2 4982void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
4983{
4984 ceph_tid_t flush_ack_tid = m->get_client_tid();
4985 int dirty = m->get_dirty();
4986 int cleaned = 0;
4987 int flushed = 0;
4988
11fdf7f2
TL
4989 auto it = in->flushing_cap_tids.begin();
4990 if (it->first < flush_ack_tid) {
4991 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4992 << " got unexpected flush ack tid " << flush_ack_tid
4993 << " expected is " << it->first << dendl;
4994 }
4995 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
4996 if (!it->second) {
4997 // cap snap
4998 ++it;
4999 continue;
5000 }
7c673cae
FG
5001 if (it->first == flush_ack_tid)
5002 cleaned = it->second;
5003 if (it->first <= flush_ack_tid) {
5004 session->flushing_caps_tids.erase(it->first);
5005 in->flushing_cap_tids.erase(it++);
5006 ++flushed;
5007 continue;
5008 }
5009 cleaned &= ~it->second;
5010 if (!cleaned)
5011 break;
5012 ++it;
5013 }
5014
11fdf7f2 5015 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5016 << " cleaned " << ccap_string(cleaned) << " on " << *in
5017 << " with " << ccap_string(dirty) << dendl;
5018
5019 if (flushed) {
5020 signal_cond_list(in->waitfor_caps);
5021 if (session->flushing_caps_tids.empty() ||
5022 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5023 sync_cond.notify_all();
7c673cae
FG
5024 }
5025
5026 if (!dirty) {
5027 in->cap_dirtier_uid = -1;
5028 in->cap_dirtier_gid = -1;
5029 }
5030
5031 if (!cleaned) {
5032 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5033 } else {
5034 if (in->flushing_caps) {
5035 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5036 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5037 in->flushing_caps &= ~cleaned;
5038 if (in->flushing_caps == 0) {
5039 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5040 num_flushing_caps--;
eafe8130 5041 if (in->flushing_cap_tids.empty())
7c673cae
FG
5042 in->flushing_cap_item.remove_myself();
5043 }
5044 if (!in->caps_dirty())
5045 put_inode(in);
5046 }
5047 }
7c673cae
FG
5048}
5049
5050
/**
 * Handle a FLUSHSNAP_ACK: the MDS has persisted a snapped cap flush.
 * Retires the matching CapSnap (keyed by the 'follows' snapid) and wakes
 * waiters; a missing or tid-mismatched entry is treated as a duplicate ack.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older retransmission of this capsnap; ignore
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we tear down the capsnap state
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake sync waiters once this session has no older pending flushes
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5083
// Completion that invokes the registered dentry-invalidate callback on the
// async invalidator thread. All identifying state (parent dir vino, child
// vino, dentry name) is copied at construction time because the Dentry may
// be freed before finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // dentry's inode; ino.ino left 0 when 'del' is false
  string name;        // dentry name, copied (Dentry may not outlive us)
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      // faked-ino mode: only the .ino field is meaningful; snapid stays at
      // its default-constructed value
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // signal "don't drop the inode" with a null ino
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5111
// Runs on the async invalidator thread (see C_Client_DentryInvalidate):
// forwards the dentry invalidation to the registered callback unless the
// client is already tearing down.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
5120
5121void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5122{
5123 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5124 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5125}
5126
// Try to drop cached references pinning 'in' so it can be trimmed: expire
// child dentries (recursing for snapped subtrees), close an empty Dir,
// trim an open snapdir, and finally unlink the inode's own dentries —
// optionally scheduling kernel dcache invalidations when sched_inval is set.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    // advance the iterator before unlink() may erase the current entry
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // the Dir held one reference
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // an open ".snap" dir pins us too; trim it as well
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // drop the dentries pointing *at* this inode
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5173
/**
 * Handle a cap GRANT/REVOKE (also the tail of an IMPORT): update the cap's
 * issued bits and refresh any inode metadata the newly shared caps cover,
 * then, on revocation, flush/release cached data as needed and ack the MDS
 * via check_caps().
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // cap->gen lagging the session gen means the cap went stale (session was
  // stale/reconnected) and its issued bits can no longer be trusted.
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  // 'issued' collects implemented|dirty caps; fields below are only taken
  // from the message when we don't hold the corresponding EXCL cap (our
  // locally dirty copy would be newer).
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // drop cached data before acking the revocation
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5329
7c673cae
FG
5330int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5331{
5332 if (perms.uid() == 0)
5333 return 0;
5334
5335 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5336 int ret = _posix_acl_permission(in, perms, want);
5337 if (ret != -EAGAIN)
5338 return ret;
5339 }
5340
5341 // check permissions before doing anything else
5342 if (!in->check_mode(perms, want))
5343 return -EACCES;
5344 return 0;
5345}
5346
5347int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5348 const UserPerm& perms)
5349{
5350 int r = _getattr_for_perm(in, perms);
5351 if (r < 0)
5352 goto out;
5353
5354 r = 0;
5355 if (strncmp(name, "system.", 7) == 0) {
5356 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5357 r = -EPERM;
5358 } else {
5359 r = inode_permission(in, perms, want);
5360 }
5361out:
1adf2230 5362 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5363 return r;
5364}
5365
5366ostream& operator<<(ostream &out, const UserPerm& perm) {
5367 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5368 return out;
5369}
5370
// Permission check for setattr: verifies, per requested mask bit, that the
// caller may change size/owner/group/mode/timestamps, following POSIX
// chown/chmod/utimes semantics. May clear S_ISGID in stx->stx_mode as a
// side effect when a non-root caller isn't in the target group.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // every check below fails with EPERM unless explicitly allowed
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner (a no-op "change" to the same uid by
    // the owner is allowed)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may only change the group to one they belong to
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // strip setgid when the caller isn't in the file's (new) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only "touch" to the current time, and then only
      // with write permission; explicit timestamps require ownership
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5427
5428int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5429{
181888fb 5430 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5431 unsigned want = 0;
5432
5433 if ((flags & O_ACCMODE) == O_WRONLY)
5434 want = MAY_WRITE;
5435 else if ((flags & O_ACCMODE) == O_RDWR)
5436 want = MAY_READ | MAY_WRITE;
5437 else if ((flags & O_ACCMODE) == O_RDONLY)
5438 want = MAY_READ;
5439 if (flags & O_TRUNC)
5440 want |= MAY_WRITE;
5441
5442 int r = 0;
5443 switch (in->mode & S_IFMT) {
5444 case S_IFLNK:
5445 r = -ELOOP;
5446 goto out;
5447 case S_IFDIR:
5448 if (want & MAY_WRITE) {
5449 r = -EISDIR;
5450 goto out;
5451 }
5452 break;
5453 }
5454
5455 r = _getattr_for_perm(in, perms);
5456 if (r < 0)
5457 goto out;
5458
5459 r = inode_permission(in, perms, want);
5460out:
5461 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5462 return r;
5463}
5464
5465int Client::may_lookup(Inode *dir, const UserPerm& perms)
5466{
181888fb 5467 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5468 int r = _getattr_for_perm(dir, perms);
5469 if (r < 0)
5470 goto out;
5471
5472 r = inode_permission(dir, perms, MAY_EXEC);
5473out:
5474 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5475 return r;
5476}
5477
5478int Client::may_create(Inode *dir, const UserPerm& perms)
5479{
181888fb 5480 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5481 int r = _getattr_for_perm(dir, perms);
5482 if (r < 0)
5483 goto out;
5484
5485 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5486out:
5487 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5488 return r;
5489}
5490
5491int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5492{
181888fb 5493 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5494 int r = _getattr_for_perm(dir, perms);
5495 if (r < 0)
5496 goto out;
5497
5498 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5499 if (r < 0)
5500 goto out;
5501
5502 /* 'name == NULL' means rmsnap */
5503 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5504 InodeRef otherin;
5505 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5506 if (r < 0)
5507 goto out;
5508 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5509 r = -EPERM;
5510 }
5511out:
5512 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5513 return r;
5514}
5515
5516int Client::may_hardlink(Inode *in, const UserPerm& perms)
5517{
181888fb 5518 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5519 int r = _getattr_for_perm(in, perms);
5520 if (r < 0)
5521 goto out;
5522
5523 if (perms.uid() == 0 || perms.uid() == in->uid) {
5524 r = 0;
5525 goto out;
5526 }
5527
5528 r = -EPERM;
5529 if (!S_ISREG(in->mode))
5530 goto out;
5531
5532 if (in->mode & S_ISUID)
5533 goto out;
5534
5535 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5536 goto out;
5537
5538 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5539out:
5540 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5541 return r;
5542}
5543
5544int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5545{
5546 int mask = CEPH_STAT_CAP_MODE;
5547 bool force = false;
5548 if (acl_type != NO_ACL) {
5549 mask |= CEPH_STAT_CAP_XATTR;
5550 force = in->xattr_version == 0;
5551 }
5552 return _getattr(in, mask, perms, force);
5553}
5554
5555vinodeno_t Client::_get_vino(Inode *in)
5556{
5557 /* The caller must hold the client lock */
5558 return vinodeno_t(in->ino, in->snapid);
5559}
5560
7c673cae
FG
5561/**
5562 * Resolve an MDS spec to a list of MDS daemon GIDs.
5563 *
5564 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5565 * It may be '*' in which case it matches all GIDs.
5566 *
5567 * If no error is returned, the `targets` vector will be populated with at least
5568 * one MDS.
5569 */
5570int Client::resolve_mds(
5571 const std::string &mds_spec,
5572 std::vector<mds_gid_t> *targets)
5573{
11fdf7f2
TL
5574 ceph_assert(fsmap);
5575 ceph_assert(targets != nullptr);
7c673cae
FG
5576
5577 mds_role_t role;
5578 std::stringstream ss;
5579 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5580 if (role_r == 0) {
5581 // We got a role, resolve it to a GID
5582 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5583 << role << "'" << dendl;
5584 targets->push_back(
5585 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5586 return 0;
5587 }
5588
5589 std::string strtol_err;
5590 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5591 if (strtol_err.empty()) {
5592 // It is a possible GID
5593 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5594 if (fsmap->gid_exists(mds_gid)) {
5595 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5596 targets->push_back(mds_gid);
5597 } else {
5598 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5599 << dendl;
5600 return -ENOENT;
5601 }
5602 } else if (mds_spec == "*") {
5603 // It is a wildcard: use all MDSs
5604 const auto mds_info = fsmap->get_mds_info();
5605
5606 if (mds_info.empty()) {
5607 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5608 return -ENOENT;
5609 }
5610
5611 for (const auto i : mds_info) {
5612 targets->push_back(i.first);
5613 }
5614 } else {
5615 // It did not parse as an integer, it is not a wildcard, it must be a name
5616 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5617 if (mds_gid == 0) {
5618 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5619
5620 lderr(cct) << "FSMap: " << *fsmap << dendl;
5621
5622 return -ENOENT;
5623 } else {
5624 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5625 << "' to GID " << mds_gid << dendl;
5626 targets->push_back(mds_gid);
5627 }
5628 }
5629
5630 return 0;
5631}
5632
5633
/**
 * Authenticate with mon and establish global ID
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks; drop the client lock across the call
  // to avoid stalling every other client operation, then reacquire it.
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  // adopt the monitor-assigned global id as our entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5657
// Fetch the latest FSMap (full map, or the trimmed "fsmap.user" variant
// when 'user' is true) from the monitors and wait until our cached copy is
// at least that epoch. The client lock is dropped while blocking on the
// monitor round-trip.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the lock while waiting on the monitor
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until the dispatcher stores a map of at
    // least the epoch we just learned about
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5701
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Send an administrative command to the MDS daemon(s) matching @mds_spec.
 * Laggy daemons are skipped (it is an error only if *all* targets are
 * laggy). 'onfinish' fires once every per-daemon reply has arrived; replies
 * are matched up in handle_command_reply() via the command table.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need an up-to-date FSMap to resolve the spec and find addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5788
11fdf7f2 5789void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5790{
5791 ceph_tid_t const tid = m->get_tid();
5792
5793 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5794
5795 if (!command_table.exists(tid)) {
5796 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5797 return;
5798 }
5799
5800 auto &op = command_table.get_command(tid);
5801 if (op.outbl) {
11fdf7f2 5802 *op.outbl = m->get_data();
7c673cae
FG
5803 }
5804 if (op.outs) {
5805 *op.outs = m->rs;
5806 }
5807
5808 if (op.on_finish) {
5809 op.on_finish->complete(m->r);
5810 }
5811
5812 command_table.erase(tid);
7c673cae
FG
5813}
5814
5815// -------------------
5816// MOUNT
5817
11fdf7f2 5818int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 5819{
7c673cae
FG
5820 int r = authenticate();
5821 if (r < 0) {
5822 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5823 return r;
5824 }
5825
11fdf7f2
TL
5826 std::string resolved_fs_name;
5827 if (fs_name.empty()) {
9f95a23c
TL
5828 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5829 if (resolved_fs_name.empty())
5830 // Try the backwards compatibility fs name option
5831 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
5832 } else {
5833 resolved_fs_name = fs_name;
5834 }
5835
7c673cae 5836 std::string want = "mdsmap";
11fdf7f2 5837 if (!resolved_fs_name.empty()) {
7c673cae
FG
5838 r = fetch_fsmap(true);
5839 if (r < 0)
5840 return r;
11fdf7f2
TL
5841 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5842 if (fscid == FS_CLUSTER_ID_NONE) {
7c673cae 5843 return -ENOENT;
11fdf7f2 5844 }
7c673cae
FG
5845
5846 std::ostringstream oss;
11fdf7f2 5847 oss << want << "." << fscid;
7c673cae
FG
5848 want = oss.str();
5849 }
5850 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5851
5852 monclient->sub_want(want, 0, 0);
5853 monclient->renew_subs();
5854
11fdf7f2
TL
5855 return 0;
5856}
5857
// Mount the filesystem.
//
// mount_root: path within the fs to use as this client's root ("" => "/").
// perms:      credentials used for the initial getattr requests.
// require_mds: if true, block until the MDS cluster is usable (or fail
//              with CEPH_FUSE_NO_MDS_UP if it is stuck unavailable).
// fs_name:    filesystem to mount; "" resolves via config (see
//             subscribe_mdsmap).
//
// Returns 0 on success (idempotent if already mounted), negative errno
// otherwise.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // Wait (releasing/reacquiring client_lock inside wait_on_list) until
    // the MDS cluster is usable, or give up if it is stuck unavailable.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // getattr the mount point and then each of its ancestors up to the
  // real root, so the client learns the whole ancestry (needed e.g. for
  // quota realms).  EACCES on an ancestor is tolerated with a warning.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5951
5952// UNMOUNT
5953
// Close every MDS session, blocking until all are gone.
// Caller must hold client_lock.
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    // Wrap the already-held client_lock in a unique_lock (adopt_lock) so
    // we can wait on the condvar; release() afterwards so the wrapper's
    // destructor does not unlock the mutex we still own.
    std::unique_lock l{client_lock, std::adopt_lock};
    mount_cond.wait(l);
    l.release();
  }
}
5971
31f18b77
FG
5972void Client::flush_mdlog_sync()
5973{
5974 if (mds_requests.empty())
5975 return;
11fdf7f2
TL
5976 for (auto &p : mds_sessions) {
5977 flush_mdlog(&p.second);
31f18b77
FG
5978 }
5979}
5980
5981void Client::flush_mdlog(MetaSession *session)
5982{
5983 // Only send this to Luminous or newer MDS daemons, older daemons
5984 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5985 const uint64_t features = session->con->get_features();
5986 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
9f95a23c 5987 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
11fdf7f2 5988 session->con->send_message2(std::move(m));
31f18b77
FG
5989 }
5990}
5991
5992
11fdf7f2
TL
// Abort all outstanding MDS requests with the given error and force-close
// every session.  Used when tearing down after a fatal condition (e.g.
// blacklisting).  Caller must hold client_lock.
void Client::_abort_mds_sessions(int err)
{
  // Advance the iterator before touching the request: aborting may drop
  // the entry from mds_requests.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // Wake the thread blocked in make_request() so it sees the abort.
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
6020
// Tear down the mount.  With abort=true (or when blacklisted) all pending
// state is discarded instead of flushed: sessions are aborted, in-flight
// writes cancelled, cached data purged and dirty caps dropped.  Otherwise
// everything is flushed cleanly before the sessions are closed.
// Caller must hold client_lock (adopted below so condvar waits work).
void Client::_unmount(bool abort)
{
  // Adopt the caller-held client_lock so mount_cond.wait() can release
  // and reacquire it; lock.release() at the end hands ownership back.
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for every outstanding MDS request to drain.
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and low-level (ll_) handles the application never released.
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and any directories left open.
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // Wait until no unsafe (unacked) sync writes remain in flight.
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
		    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // Dirty caps can never reach the MDS now; drop them on the floor.
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // Wait for the cache to fully drain; dump it for debugging if we stall
  // more than 5 seconds (likely caps that have not been released).
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // Hand the (still locked) mutex back to the caller.
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6158
b32b8144
FG
// Cleanly unmount: flush caps/journal and close sessions (see _unmount).
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}
6164
// Abortive unmount: drop all pending state without flushing (see _unmount).
void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
6170
7c673cae
FG
6171void Client::flush_cap_releases()
6172{
6173 // send any cap releases
11fdf7f2
TL
6174 for (auto &p : mds_sessions) {
6175 auto &session = p.second;
6176 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6177 p.first)) {
7c673cae
FG
6178 if (cct->_conf->client_inject_release_failure) {
6179 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6180 } else {
11fdf7f2 6181 session.con->send_message2(std::move(session.release));
7c673cae 6182 }
11fdf7f2 6183 session.release.reset();
7c673cae
FG
6184 }
6185 }
6186}
6187
// Periodic housekeeping, re-armed every client_tick_interval seconds:
// times out mount-phase requests, renews caps, flushes cap releases,
// services delayed cap checks, and trims the cache.
// Runs under client_lock (the timer takes it before calling back).
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // Testing hook: stall this tick once, then self-clear the option.
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // Re-arm the timer first so a later early-exit can't stop the ticking.
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new LambdaContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // Still mounting: time out the oldest request if it has waited longer
    // than client_mount_timeout, and wake everyone blocked on it.
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: the list is ordered by hold_caps_until, so stop at the
  // first inode whose hold time has not yet expired.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6243
6244void Client::renew_caps()
6245{
6246 ldout(cct, 10) << "renew_caps()" << dendl;
6247 last_cap_renew = ceph_clock_now();
6248
11fdf7f2
TL
6249 for (auto &p : mds_sessions) {
6250 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6251 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6252 renew_caps(&p.second);
7c673cae
FG
6253 }
6254}
6255
6256void Client::renew_caps(MetaSession *session)
6257{
6258 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6259 session->last_cap_renew_request = ceph_clock_now();
6260 uint64_t seq = ++session->cap_renew_seq;
9f95a23c 6261 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6262}
6263
6264
6265// ===============================================================
6266// high level (POSIXy) interface
6267
6268int Client::_do_lookup(Inode *dir, const string& name, int mask,
6269 InodeRef *target, const UserPerm& perms)
6270{
6271 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6272 MetaRequest *req = new MetaRequest(op);
6273 filepath path;
6274 dir->make_nosnap_relative_path(path);
6275 path.push_dentry(name);
6276 req->set_filepath(path);
6277 req->set_inode(dir);
6278 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6279 mask |= DEBUG_GETATTR_CAPS;
6280 req->head.args.getattr.mask = mask;
6281
11fdf7f2 6282 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6283
6284 int r = make_request(req, perms, target);
11fdf7f2 6285 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6286 return r;
6287}
6288
// Look up `dname` in directory `dir`, preferring cached dentries whose
// lease (or the directory's shared cap) is still valid; otherwise fall
// through to an MDS lookup via _do_lookup().  `mask` is the set of caps
// the caller needs on the resulting inode for the cached entry to count.
// On success *target is set; returns 0 or a negative errno.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No linkage cached: ask some MDS (any in-rank one) for the parent.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // The configured snapshot directory name is synthesized locally.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // Only trust the cached dentry if its inode carries the caps the
    // caller asked for (or it is a cached negative entry).
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	// The lease only counts while the issuing session's caps are
	// themselves alive and from the same generation.
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// A complete directory plus a null dentry proves non-existence.
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // Cache miss (or stale): go to the MDS.
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6405
6406int Client::get_or_create(Inode *dir, const char* name,
6407 Dentry **pdn, bool expect_null)
6408{
6409 // lookup
11fdf7f2 6410 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6411 dir->open_dir();
6412 if (dir->dir->dentries.count(name)) {
6413 Dentry *dn = dir->dir->dentries[name];
6414
6415 // is dn lease valid?
6416 utime_t now = ceph_clock_now();
6417 if (dn->inode &&
6418 dn->lease_mds >= 0 &&
6419 dn->lease_ttl > now &&
6420 mds_sessions.count(dn->lease_mds)) {
11fdf7f2
TL
6421 MetaSession &s = mds_sessions.at(dn->lease_mds);
6422 if (s.cap_ttl > now &&
6423 s.cap_gen == dn->lease_gen) {
7c673cae
FG
6424 if (expect_null)
6425 return -EEXIST;
6426 }
6427 }
6428 *pdn = dn;
6429 } else {
6430 // otherwise link up a new one
6431 *pdn = link(dir->dir, name, NULL, NULL);
6432 }
6433
6434 // success
6435 return 0;
6436}
6437
// Walk `origpath` component by component starting from root (absolute) or
// cwd (relative), resolving symlinks along the way.  Trailing symlinks are
// followed only when `followsym` is true; symlinks in the middle of the
// path are always followed.  `mask` is the caps requested on the final
// component.  On success *end holds the resulting inode.
// Returns 0, or a negative errno (-ELOOP after MAXSYMLINKS expansions).
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// An absolute target restarts the walk from the root.
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6519
6520
6521// namespace ops
6522
6523int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6524{
11fdf7f2 6525 std::lock_guard lock(client_lock);
7c673cae
FG
6526 tout(cct) << "link" << std::endl;
6527 tout(cct) << relexisting << std::endl;
6528 tout(cct) << relpath << std::endl;
6529
181888fb
FG
6530 if (unmounting)
6531 return -ENOTCONN;
6532
7c673cae
FG
6533 filepath existing(relexisting);
6534
6535 InodeRef in, dir;
6536 int r = path_walk(existing, &in, perm, true);
6537 if (r < 0)
6538 return r;
6539 if (std::string(relpath) == "/") {
6540 r = -EEXIST;
6541 return r;
6542 }
6543 filepath path(relpath);
6544 string name = path.last_dentry();
6545 path.pop_dentry();
6546
6547 r = path_walk(path, &dir, perm, true);
6548 if (r < 0)
6549 return r;
6550 if (cct->_conf->client_permissions) {
6551 if (S_ISDIR(in->mode)) {
6552 r = -EPERM;
6553 return r;
6554 }
6555 r = may_hardlink(in.get(), perm);
6556 if (r < 0)
6557 return r;
6558 r = may_create(dir.get(), perm);
6559 if (r < 0)
6560 return r;
6561 }
6562 r = _link(in.get(), dir.get(), name.c_str(), perm);
6563 return r;
6564}
6565
6566int Client::unlink(const char *relpath, const UserPerm& perm)
6567{
11fdf7f2
TL
6568 std::lock_guard lock(client_lock);
6569 tout(cct) << __func__ << std::endl;
7c673cae
FG
6570 tout(cct) << relpath << std::endl;
6571
181888fb
FG
6572 if (unmounting)
6573 return -ENOTCONN;
6574
7c673cae
FG
6575 if (std::string(relpath) == "/")
6576 return -EISDIR;
6577
6578 filepath path(relpath);
6579 string name = path.last_dentry();
6580 path.pop_dentry();
6581 InodeRef dir;
6582 int r = path_walk(path, &dir, perm);
6583 if (r < 0)
6584 return r;
6585 if (cct->_conf->client_permissions) {
6586 r = may_delete(dir.get(), name.c_str(), perm);
6587 if (r < 0)
6588 return r;
6589 }
6590 return _unlink(dir.get(), name.c_str(), perm);
6591}
6592
6593int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6594{
11fdf7f2
TL
6595 std::lock_guard lock(client_lock);
6596 tout(cct) << __func__ << std::endl;
7c673cae
FG
6597 tout(cct) << relfrom << std::endl;
6598 tout(cct) << relto << std::endl;
6599
181888fb
FG
6600 if (unmounting)
6601 return -ENOTCONN;
6602
7c673cae
FG
6603 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6604 return -EBUSY;
6605
6606 filepath from(relfrom);
6607 filepath to(relto);
6608 string fromname = from.last_dentry();
6609 from.pop_dentry();
6610 string toname = to.last_dentry();
6611 to.pop_dentry();
6612
6613 InodeRef fromdir, todir;
6614 int r = path_walk(from, &fromdir, perm);
6615 if (r < 0)
6616 goto out;
6617 r = path_walk(to, &todir, perm);
6618 if (r < 0)
6619 goto out;
6620
6621 if (cct->_conf->client_permissions) {
6622 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6623 if (r < 0)
6624 return r;
6625 r = may_delete(todir.get(), toname.c_str(), perm);
6626 if (r < 0 && r != -ENOENT)
6627 return r;
6628 }
6629 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6630out:
6631 return r;
6632}
6633
6634// dirs
6635
6636int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6637{
11fdf7f2
TL
6638 std::lock_guard lock(client_lock);
6639 tout(cct) << __func__ << std::endl;
7c673cae
FG
6640 tout(cct) << relpath << std::endl;
6641 tout(cct) << mode << std::endl;
11fdf7f2 6642 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6643
181888fb
FG
6644 if (unmounting)
6645 return -ENOTCONN;
6646
7c673cae
FG
6647 if (std::string(relpath) == "/")
6648 return -EEXIST;
6649
6650 filepath path(relpath);
6651 string name = path.last_dentry();
6652 path.pop_dentry();
6653 InodeRef dir;
6654 int r = path_walk(path, &dir, perm);
6655 if (r < 0)
6656 return r;
6657 if (cct->_conf->client_permissions) {
6658 r = may_create(dir.get(), perm);
6659 if (r < 0)
6660 return r;
6661 }
6662 return _mkdir(dir.get(), name.c_str(), mode, perm);
6663}
6664
// Create a directory and any missing ancestors (like `mkdir -p`), walking
// component by component from cwd.  Phase 1 descends through the
// components that already exist; phase 2 creates each remaining one.
// Note: the final component must not already exist (-EEXIST propagates),
// matching mkdir(2) semantics for the leaf.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // Only ENOENT means "start creating here"; any other error is fatal
  // (and r == 0 means the whole path already existed).
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // A racing creator is fine for intermediate components: fall back to
    // looking the directory up instead.
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6719
6720int Client::rmdir(const char *relpath, const UserPerm& perms)
6721{
11fdf7f2
TL
6722 std::lock_guard lock(client_lock);
6723 tout(cct) << __func__ << std::endl;
7c673cae
FG
6724 tout(cct) << relpath << std::endl;
6725
181888fb
FG
6726 if (unmounting)
6727 return -ENOTCONN;
6728
7c673cae
FG
6729 if (std::string(relpath) == "/")
6730 return -EBUSY;
6731
6732 filepath path(relpath);
6733 string name = path.last_dentry();
6734 path.pop_dentry();
6735 InodeRef dir;
6736 int r = path_walk(path, &dir, perms);
6737 if (r < 0)
6738 return r;
6739 if (cct->_conf->client_permissions) {
6740 int r = may_delete(dir.get(), name.c_str(), perms);
6741 if (r < 0)
6742 return r;
6743 }
6744 return _rmdir(dir.get(), name.c_str(), perms);
6745}
6746
6747int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6748{
11fdf7f2
TL
6749 std::lock_guard lock(client_lock);
6750 tout(cct) << __func__ << std::endl;
7c673cae
FG
6751 tout(cct) << relpath << std::endl;
6752 tout(cct) << mode << std::endl;
6753 tout(cct) << rdev << std::endl;
6754
181888fb
FG
6755 if (unmounting)
6756 return -ENOTCONN;
6757
7c673cae
FG
6758 if (std::string(relpath) == "/")
6759 return -EEXIST;
6760
6761 filepath path(relpath);
6762 string name = path.last_dentry();
6763 path.pop_dentry();
6764 InodeRef dir;
6765 int r = path_walk(path, &dir, perms);
6766 if (r < 0)
6767 return r;
6768 if (cct->_conf->client_permissions) {
6769 int r = may_create(dir.get(), perms);
6770 if (r < 0)
6771 return r;
6772 }
6773 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6774}
6775
6776// symlinks
6777
6778int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6779{
11fdf7f2
TL
6780 std::lock_guard lock(client_lock);
6781 tout(cct) << __func__ << std::endl;
7c673cae
FG
6782 tout(cct) << target << std::endl;
6783 tout(cct) << relpath << std::endl;
6784
181888fb
FG
6785 if (unmounting)
6786 return -ENOTCONN;
6787
7c673cae
FG
6788 if (std::string(relpath) == "/")
6789 return -EEXIST;
6790
6791 filepath path(relpath);
6792 string name = path.last_dentry();
6793 path.pop_dentry();
6794 InodeRef dir;
6795 int r = path_walk(path, &dir, perms);
6796 if (r < 0)
6797 return r;
6798 if (cct->_conf->client_permissions) {
6799 int r = may_create(dir.get(), perms);
6800 if (r < 0)
6801 return r;
6802 }
6803 return _symlink(dir.get(), name.c_str(), target, perms);
6804}
6805
6806int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6807{
11fdf7f2
TL
6808 std::lock_guard lock(client_lock);
6809 tout(cct) << __func__ << std::endl;
7c673cae
FG
6810 tout(cct) << relpath << std::endl;
6811
181888fb
FG
6812 if (unmounting)
6813 return -ENOTCONN;
6814
7c673cae
FG
6815 filepath path(relpath);
6816 InodeRef in;
6817 int r = path_walk(path, &in, perms, false);
6818 if (r < 0)
6819 return r;
6820
6821 return _readlink(in.get(), buf, size);
6822}
6823
6824int Client::_readlink(Inode *in, char *buf, size_t size)
6825{
6826 if (!in->is_symlink())
6827 return -EINVAL;
6828
6829 // copy into buf (at most size bytes)
6830 int r = in->symlink.length();
6831 if (r > (int)size)
6832 r = size;
6833 memcpy(buf, in->symlink.c_str(), r);
6834 return r;
6835}
6836
6837
6838// inode stuff
6839
6840int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6841{
94b18763 6842 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6843
11fdf7f2 6844 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6845 if (yes && !force)
6846 return 0;
6847
6848 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6849 filepath path;
6850 in->make_nosnap_relative_path(path);
6851 req->set_filepath(path);
6852 req->set_inode(in);
6853 req->head.args.getattr.mask = mask;
6854
6855 int res = make_request(req, perms);
11fdf7f2 6856 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6857 return res;
6858}
6859
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  /*
   * Core setattr implementation. Attributes covered by exclusive caps we
   * hold are applied purely to the local cache (marking the matching caps
   * dirty); whatever remains in 'mask' after that is sent to the MDS as a
   * CEPH_MDS_OP_SETATTR request.
   *
   * Returns 0 on success or a negative errno: -EROFS on snapshot inodes,
   * -EDQUOT when a size increase would exceed quota, -EFBIG for a size
   * beyond the MDS max file size, or the MDS request result.
   */
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // Snapshots are immutable.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Growing the file must respect byte quotas.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // Dirty whichever exclusive cap we hold so the ctime flushes back;
    // with none held, fall through to a synchronous request.
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // With Ax held we can change ownership/mode/btime locally. A truncate
    // or an explicit request also clears setuid/setgid bits here.
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;  // ownership change squashes setuid/setgid
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;  // ownership change squashes setuid/setgid
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // Only the permission bits change; the file type bits are kept.
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // With Fx held, atime/mtime can be changed locally.
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // Everything handled locally: just note the change and return.
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // Something is left over (or a sync setattr was forced): send the
  // remaining mask to the MDS, dropping the caps it will invalidate.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7050
7051/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  // Convert only the fields that _setattr() consumes (size, mode,
  // uid/gid, mtime, atime). All other ceph_statx fields are left
  // untouched, so callers must not rely on them being initialized here.
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS names the timespec members st_*timespec.
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7066
7067int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7068 const UserPerm& perms, InodeRef *inp)
7069{
7070 int ret = _do_setattr(in, stx, mask, perms, inp);
7071 if (ret < 0)
7072 return ret;
7073 if (mask & CEPH_SETATTR_MODE)
7074 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7075 return ret;
7076}
7077
7078int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7079 const UserPerm& perms)
7080{
7081 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7082 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7083 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7084 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7085 if (cct->_conf->client_permissions) {
7086 int r = may_setattr(in.get(), stx, mask, perms);
7087 if (r < 0)
7088 return r;
7089 }
7090 return __setattrx(in.get(), stx, mask, perms);
7091}
7092
7093int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7094 const UserPerm& perms)
7095{
7096 struct ceph_statx stx;
7097
7098 stat_to_statx(attr, &stx);
7099 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7100
7101 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7102 mask &= ~CEPH_SETATTR_UID;
7103 }
7104 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7105 mask &= ~CEPH_SETATTR_GID;
7106 }
7107
7c673cae
FG
7108 return _setattrx(in, &stx, mask, perms);
7109}
7110
7111int Client::setattr(const char *relpath, struct stat *attr, int mask,
7112 const UserPerm& perms)
7113{
11fdf7f2
TL
7114 std::lock_guard lock(client_lock);
7115 tout(cct) << __func__ << std::endl;
7c673cae
FG
7116 tout(cct) << relpath << std::endl;
7117 tout(cct) << mask << std::endl;
7118
181888fb
FG
7119 if (unmounting)
7120 return -ENOTCONN;
7121
7c673cae
FG
7122 filepath path(relpath);
7123 InodeRef in;
7124 int r = path_walk(path, &in, perms);
7125 if (r < 0)
7126 return r;
7127 return _setattr(in, attr, mask, perms);
7128}
7129
7130int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7131 const UserPerm& perms, int flags)
7132{
11fdf7f2
TL
7133 std::lock_guard lock(client_lock);
7134 tout(cct) << __func__ << std::endl;
7c673cae
FG
7135 tout(cct) << relpath << std::endl;
7136 tout(cct) << mask << std::endl;
7137
181888fb
FG
7138 if (unmounting)
7139 return -ENOTCONN;
7140
7c673cae
FG
7141 filepath path(relpath);
7142 InodeRef in;
7143 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7144 if (r < 0)
7145 return r;
7146 return _setattrx(in, stx, mask, perms);
7147}
7148
7149int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7150{
11fdf7f2
TL
7151 std::lock_guard lock(client_lock);
7152 tout(cct) << __func__ << std::endl;
7c673cae
FG
7153 tout(cct) << fd << std::endl;
7154 tout(cct) << mask << std::endl;
7155
181888fb
FG
7156 if (unmounting)
7157 return -ENOTCONN;
7158
7c673cae
FG
7159 Fh *f = get_filehandle(fd);
7160 if (!f)
7161 return -EBADF;
7162#if defined(__linux__) && defined(O_PATH)
7163 if (f->flags & O_PATH)
7164 return -EBADF;
7165#endif
7166 return _setattr(f->inode, attr, mask, perms);
7167}
7168
7169int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7170{
11fdf7f2
TL
7171 std::lock_guard lock(client_lock);
7172 tout(cct) << __func__ << std::endl;
7c673cae
FG
7173 tout(cct) << fd << std::endl;
7174 tout(cct) << mask << std::endl;
7175
181888fb
FG
7176 if (unmounting)
7177 return -ENOTCONN;
7178
7c673cae
FG
7179 Fh *f = get_filehandle(fd);
7180 if (!f)
7181 return -EBADF;
7182#if defined(__linux__) && defined(O_PATH)
7183 if (f->flags & O_PATH)
7184 return -EBADF;
7185#endif
7186 return _setattrx(f->inode, stx, mask, perms);
7187}
7188
7189int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7190 frag_info_t *dirstat, int mask)
7191{
11fdf7f2
TL
7192 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7193 std::lock_guard lock(client_lock);
7c673cae
FG
7194 tout(cct) << "stat" << std::endl;
7195 tout(cct) << relpath << std::endl;
181888fb
FG
7196
7197 if (unmounting)
7198 return -ENOTCONN;
7199
7c673cae
FG
7200 filepath path(relpath);
7201 InodeRef in;
7202 int r = path_walk(path, &in, perms, true, mask);
7203 if (r < 0)
7204 return r;
7205 r = _getattr(in, mask, perms);
7206 if (r < 0) {
11fdf7f2 7207 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7208 return r;
7209 }
7210 fill_stat(in, stbuf, dirstat);
11fdf7f2 7211 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7212 return r;
7213}
7214
7215unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7216{
7217 unsigned mask = 0;
7218
7219 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7220 if (flags & AT_NO_ATTR_SYNC)
7221 goto out;
7222
7223 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7224 mask |= CEPH_CAP_PIN;
7225 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7226 mask |= CEPH_CAP_AUTH_SHARED;
7227 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7228 mask |= CEPH_CAP_LINK_SHARED;
7229 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7230 mask |= CEPH_CAP_FILE_SHARED;
7231 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7232 mask |= CEPH_CAP_XATTR_SHARED;
7233out:
7234 return mask;
7235}
7236
7237int Client::statx(const char *relpath, struct ceph_statx *stx,
7238 const UserPerm& perms,
7239 unsigned int want, unsigned int flags)
7240{
11fdf7f2
TL
7241 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7242 std::lock_guard lock(client_lock);
7c673cae
FG
7243 tout(cct) << "statx" << std::endl;
7244 tout(cct) << relpath << std::endl;
181888fb
FG
7245
7246 if (unmounting)
7247 return -ENOTCONN;
7248
7c673cae
FG
7249 filepath path(relpath);
7250 InodeRef in;
7251
7252 unsigned mask = statx_to_mask(flags, want);
7253
7254 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7255 if (r < 0)
7256 return r;
7257
7258 r = _getattr(in, mask, perms);
7259 if (r < 0) {
11fdf7f2 7260 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7261 return r;
7262 }
7263
7264 fill_statx(in, mask, stx);
11fdf7f2 7265 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7266 return r;
7267}
7268
7269int Client::lstat(const char *relpath, struct stat *stbuf,
7270 const UserPerm& perms, frag_info_t *dirstat, int mask)
7271{
11fdf7f2
TL
7272 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7273 std::lock_guard lock(client_lock);
7274 tout(cct) << __func__ << std::endl;
7c673cae 7275 tout(cct) << relpath << std::endl;
181888fb
FG
7276
7277 if (unmounting)
7278 return -ENOTCONN;
7279
7c673cae
FG
7280 filepath path(relpath);
7281 InodeRef in;
7282 // don't follow symlinks
7283 int r = path_walk(path, &in, perms, false, mask);
7284 if (r < 0)
7285 return r;
7286 r = _getattr(in, mask, perms);
7287 if (r < 0) {
11fdf7f2 7288 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7289 return r;
7290 }
7291 fill_stat(in, stbuf, dirstat);
11fdf7f2 7292 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7293 return r;
7294}
7295
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  // Populate a POSIX struct stat from the cached inode. Optionally copies
  // out the directory frag/nest stats. Returns the caps currently issued
  // on the inode (a cap bitmask, NOT an errno).
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;  // the snapid doubles as the device number
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directory link counts are synthesized; a cached dir inode is
    // expected to have nlink 0 (unlinked) or 1, anything else aborts.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime is reported as max(ctime, mtime) so a local mtime bump is
  // never reported as older than the change time.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size: recursive bytes or entry count, per config.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7357
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  // Populate a ceph_statx from the cached inode. 'mask' is the CEPH_CAP_*
  // shared-cap mask (see statx_to_mask()); only fields covered by it are
  // filled, and stx->stx_mask records which fields are valid.
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;  // full mode now that As covers it
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    // Same synthesized directory nlink logic as fill_stat().
    if (in->is_dir()) {
      switch (in->nlink) {
      case 0:
        stx->stx_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        stx->stx_nlink = 1 /* parent dentry */
                         + 1 /* <dir>/. */
                         + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory size: recursive bytes or entry count, per config.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report ctime as max(ctime, mtime), as in fill_stat().
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7440
void Client::touch_dn(Dentry *dn)
{
  // Mark the dentry as recently used so LRU trimming keeps it longer.
  lru.lru_touch(dn);
}
7445
7446int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7447{
11fdf7f2
TL
7448 std::lock_guard lock(client_lock);
7449 tout(cct) << __func__ << std::endl;
7c673cae
FG
7450 tout(cct) << relpath << std::endl;
7451 tout(cct) << mode << std::endl;
181888fb
FG
7452
7453 if (unmounting)
7454 return -ENOTCONN;
7455
7c673cae
FG
7456 filepath path(relpath);
7457 InodeRef in;
7458 int r = path_walk(path, &in, perms);
7459 if (r < 0)
7460 return r;
7461 struct stat attr;
7462 attr.st_mode = mode;
7463 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7464}
7465
7466int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7467{
11fdf7f2
TL
7468 std::lock_guard lock(client_lock);
7469 tout(cct) << __func__ << std::endl;
7c673cae
FG
7470 tout(cct) << fd << std::endl;
7471 tout(cct) << mode << std::endl;
181888fb
FG
7472
7473 if (unmounting)
7474 return -ENOTCONN;
7475
7c673cae
FG
7476 Fh *f = get_filehandle(fd);
7477 if (!f)
7478 return -EBADF;
7479#if defined(__linux__) && defined(O_PATH)
7480 if (f->flags & O_PATH)
7481 return -EBADF;
7482#endif
7483 struct stat attr;
7484 attr.st_mode = mode;
7485 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7486}
7487
7488int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7489{
11fdf7f2
TL
7490 std::lock_guard lock(client_lock);
7491 tout(cct) << __func__ << std::endl;
7c673cae
FG
7492 tout(cct) << relpath << std::endl;
7493 tout(cct) << mode << std::endl;
181888fb
FG
7494
7495 if (unmounting)
7496 return -ENOTCONN;
7497
7c673cae
FG
7498 filepath path(relpath);
7499 InodeRef in;
7500 // don't follow symlinks
7501 int r = path_walk(path, &in, perms, false);
7502 if (r < 0)
7503 return r;
7504 struct stat attr;
7505 attr.st_mode = mode;
7506 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7507}
7508
7509int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7510 const UserPerm& perms)
7511{
11fdf7f2
TL
7512 std::lock_guard lock(client_lock);
7513 tout(cct) << __func__ << std::endl;
7c673cae
FG
7514 tout(cct) << relpath << std::endl;
7515 tout(cct) << new_uid << std::endl;
7516 tout(cct) << new_gid << std::endl;
181888fb
FG
7517
7518 if (unmounting)
7519 return -ENOTCONN;
7520
7c673cae
FG
7521 filepath path(relpath);
7522 InodeRef in;
7523 int r = path_walk(path, &in, perms);
7524 if (r < 0)
7525 return r;
7526 struct stat attr;
7527 attr.st_uid = new_uid;
7528 attr.st_gid = new_gid;
181888fb 7529 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7530}
7531
7532int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7533{
11fdf7f2
TL
7534 std::lock_guard lock(client_lock);
7535 tout(cct) << __func__ << std::endl;
7c673cae
FG
7536 tout(cct) << fd << std::endl;
7537 tout(cct) << new_uid << std::endl;
7538 tout(cct) << new_gid << std::endl;
181888fb
FG
7539
7540 if (unmounting)
7541 return -ENOTCONN;
7542
7c673cae
FG
7543 Fh *f = get_filehandle(fd);
7544 if (!f)
7545 return -EBADF;
7546#if defined(__linux__) && defined(O_PATH)
7547 if (f->flags & O_PATH)
7548 return -EBADF;
7549#endif
7550 struct stat attr;
7551 attr.st_uid = new_uid;
7552 attr.st_gid = new_gid;
7553 int mask = 0;
7554 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7555 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7556 return _setattr(f->inode, &attr, mask, perms);
7557}
7558
7559int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7560 const UserPerm& perms)
7561{
11fdf7f2
TL
7562 std::lock_guard lock(client_lock);
7563 tout(cct) << __func__ << std::endl;
7c673cae
FG
7564 tout(cct) << relpath << std::endl;
7565 tout(cct) << new_uid << std::endl;
7566 tout(cct) << new_gid << std::endl;
181888fb
FG
7567
7568 if (unmounting)
7569 return -ENOTCONN;
7570
7c673cae
FG
7571 filepath path(relpath);
7572 InodeRef in;
7573 // don't follow symlinks
7574 int r = path_walk(path, &in, perms, false);
7575 if (r < 0)
7576 return r;
7577 struct stat attr;
7578 attr.st_uid = new_uid;
7579 attr.st_gid = new_gid;
7580 int mask = 0;
7581 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7582 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7583 return _setattr(in, &attr, mask, perms);
7584}
7585
11fdf7f2
TL
7586static void attr_set_atime_and_mtime(struct stat *attr,
7587 const utime_t &atime,
7588 const utime_t &mtime)
7589{
7590 stat_set_atime_sec(attr, atime.tv.tv_sec);
7591 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7592 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7593 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7594}
7595
7596// for [l]utime() invoke the timeval variant as the timespec
7597// variant are not yet implemented. for futime[s](), invoke
7598// the timespec variant.
7c673cae
FG
7599int Client::utime(const char *relpath, struct utimbuf *buf,
7600 const UserPerm& perms)
7601{
11fdf7f2
TL
7602 struct timeval tv[2];
7603 tv[0].tv_sec = buf->actime;
7604 tv[0].tv_usec = 0;
7605 tv[1].tv_sec = buf->modtime;
7606 tv[1].tv_usec = 0;
7607
7608 return utimes(relpath, tv, perms);
7609}
7610
7611int Client::lutime(const char *relpath, struct utimbuf *buf,
7612 const UserPerm& perms)
7613{
7614 struct timeval tv[2];
7615 tv[0].tv_sec = buf->actime;
7616 tv[0].tv_usec = 0;
7617 tv[1].tv_sec = buf->modtime;
7618 tv[1].tv_usec = 0;
7619
7620 return lutimes(relpath, tv, perms);
7621}
7622
7623int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7624{
7625 struct timespec ts[2];
7626 ts[0].tv_sec = buf->actime;
7627 ts[0].tv_nsec = 0;
7628 ts[1].tv_sec = buf->modtime;
7629 ts[1].tv_nsec = 0;
7630
7631 return futimens(fd, ts, perms);
7632}
7633
7634int Client::utimes(const char *relpath, struct timeval times[2],
7635 const UserPerm& perms)
7636{
7637 std::lock_guard lock(client_lock);
7638 tout(cct) << __func__ << std::endl;
7c673cae 7639 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7640 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7641 << std::endl;
7642 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7643 << std::endl;
181888fb
FG
7644
7645 if (unmounting)
7646 return -ENOTCONN;
7647
7c673cae
FG
7648 filepath path(relpath);
7649 InodeRef in;
7650 int r = path_walk(path, &in, perms);
7651 if (r < 0)
7652 return r;
7653 struct stat attr;
11fdf7f2
TL
7654 utime_t atime(times[0]);
7655 utime_t mtime(times[1]);
7656
7657 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7658 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7659}
7660
11fdf7f2
TL
7661int Client::lutimes(const char *relpath, struct timeval times[2],
7662 const UserPerm& perms)
7c673cae 7663{
11fdf7f2
TL
7664 std::lock_guard lock(client_lock);
7665 tout(cct) << __func__ << std::endl;
7c673cae 7666 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7667 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7668 << std::endl;
7669 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7670 << std::endl;
181888fb
FG
7671
7672 if (unmounting)
7673 return -ENOTCONN;
7674
7c673cae
FG
7675 filepath path(relpath);
7676 InodeRef in;
7c673cae
FG
7677 int r = path_walk(path, &in, perms, false);
7678 if (r < 0)
7679 return r;
7680 struct stat attr;
11fdf7f2
TL
7681 utime_t atime(times[0]);
7682 utime_t mtime(times[1]);
7683
7684 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7685 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7686}
7687
11fdf7f2
TL
7688int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7689{
7690 struct timespec ts[2];
7691 ts[0].tv_sec = times[0].tv_sec;
7692 ts[0].tv_nsec = times[0].tv_usec * 1000;
7693 ts[1].tv_sec = times[1].tv_sec;
7694 ts[1].tv_nsec = times[1].tv_usec * 1000;
7695
7696 return futimens(fd, ts, perms);
7697}
7698
7699int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7700{
7701 std::lock_guard lock(client_lock);
7702 tout(cct) << __func__ << std::endl;
7703 tout(cct) << fd << std::endl;
7704 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7705 << std::endl;
7706 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7707 << std::endl;
7708
7709 if (unmounting)
7710 return -ENOTCONN;
7711
7712 Fh *f = get_filehandle(fd);
7713 if (!f)
7714 return -EBADF;
7715#if defined(__linux__) && defined(O_PATH)
7716 if (f->flags & O_PATH)
7717 return -EBADF;
7718#endif
7719 struct stat attr;
7720 utime_t atime(times[0]);
7721 utime_t mtime(times[1]);
7722
7723 attr_set_atime_and_mtime(&attr, atime, mtime);
7724 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7725}
7726
7c673cae
FG
7727int Client::flock(int fd, int operation, uint64_t owner)
7728{
11fdf7f2
TL
7729 std::lock_guard lock(client_lock);
7730 tout(cct) << __func__ << std::endl;
7c673cae
FG
7731 tout(cct) << fd << std::endl;
7732 tout(cct) << operation << std::endl;
7733 tout(cct) << owner << std::endl;
181888fb
FG
7734
7735 if (unmounting)
7736 return -ENOTCONN;
7737
7c673cae
FG
7738 Fh *f = get_filehandle(fd);
7739 if (!f)
7740 return -EBADF;
7741
7742 return _flock(f, operation, owner);
7743}
7744
7745int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7746{
11fdf7f2
TL
7747 std::lock_guard lock(client_lock);
7748 tout(cct) << __func__ << std::endl;
7c673cae 7749 tout(cct) << relpath << std::endl;
181888fb
FG
7750
7751 if (unmounting)
7752 return -ENOTCONN;
7753
7c673cae
FG
7754 filepath path(relpath);
7755 InodeRef in;
7756 int r = path_walk(path, &in, perms, true);
7757 if (r < 0)
7758 return r;
7759 if (cct->_conf->client_permissions) {
7760 int r = may_open(in.get(), O_RDONLY, perms);
7761 if (r < 0)
7762 return r;
7763 }
7764 r = _opendir(in.get(), dirpp, perms);
7765 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7766 if (r != -ENOTDIR)
7767 tout(cct) << (unsigned long)*dirpp << std::endl;
7768 return r;
7769}
7770
7771int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7772{
7773 if (!in->is_dir())
7774 return -ENOTDIR;
7775 *dirpp = new dir_result_t(in, perms);
7776 opened_dirs.insert(*dirpp);
11fdf7f2 7777 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7778 return 0;
7779}
7780
7781
int Client::closedir(dir_result_t *dir)
{
  // Public wrapper: tear down a readdir cursor. Always returns 0.
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7792
void Client::_closedir(dir_result_t *dirp)
{
  // Release everything the cursor holds: the inode reference, any
  // buffered dentries, the opened_dirs registration, and the cursor
  // object itself. 'dirp' is invalid after this returns.
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7804
void Client::rewinddir(dir_result_t *dirp)
{
  // rewinddir(3) analogue: drop any buffered dentries and reset the
  // cursor to the beginning. Silently a no-op while unmounting.
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7817
loff_t Client::telldir(dir_result_t *dirp)
{
  // Report the cursor's current readdir offset.
  // NOTE(review): unlike the other dir operations this takes no
  // client_lock and does no unmounting check — it only reads
  // dirp->offset; confirm callers guarantee dirp stays valid.
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7824
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  // seekdir(3) analogue: move the readdir cursor to 'offset', dropping
  // the buffered dentries whenever the buffer cannot serve the new
  // position, and disabling readdir-cache fills where the seek makes the
  // cached ordering unreliable.
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;  // already there

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek invalidates the buffer.
    if (dirp->offset > offset) { // for backward seek
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: restart unless the target is in the buffered
    // fragment at or after the buffered low-water mark.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7858
7859
7860//struct dirent {
7861// ino_t d_ino; /* inode number */
7862// off_t d_off; /* offset to the next dirent */
7863// unsigned short d_reclen; /* length of this record */
7864// unsigned char d_type; /* type of file */
7865// char d_name[256]; /* filename */
7866//};
// Populate a struct dirent from a cached/replied directory entry.
// `type` is an S_IF* mode value (converted to DT_* via IFTODT);
// `next_off` is the offset of the entry *after* this one, stored in d_off
// where the platform has that field.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // d_name is a fixed 256-byte array: copy at most 255 chars, always terminate
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;   // only Linux-style dirent has d_off
#endif
  de->d_reclen = 1;   // NOTE: deliberately not a real record length here
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7882
// Advance the directory stream to the next fragment, or mark the stream
// at end-of-directory if the current fragment is the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; only move the position forward, never backward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // our frag choice may be stale vs. the inode's dirfragtree; fix it up
    _readdir_rechoose_frag(dirp);
  }
}
7908
// Re-resolve the stream's current fragment against the directory inode's
// dirfragtree (which may have changed, e.g. after an MDS refragmentation),
// and restart that fragment from its beginning if the mapping moved.
// No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;   // 0 and 1 are "." and ".."
  }
}
7925
// Discard the buffered dentries fetched for this stream's current fragment;
// the next readdir will have to re-fetch from the MDS (or the cache path).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7931
7932int Client::_readdir_get_frag(dir_result_t *dirp)
7933{
11fdf7f2
TL
7934 ceph_assert(dirp);
7935 ceph_assert(dirp->inode);
7c673cae
FG
7936
7937 // get the current frag.
7938 frag_t fg;
7939 if (dirp->hash_order())
7940 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7941 else
7942 fg = frag_t(dirp->offset_high());
7943
11fdf7f2 7944 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7c673cae
FG
7945 << " offset " << hex << dirp->offset << dec << dendl;
7946
7947 int op = CEPH_MDS_OP_READDIR;
7948 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7949 op = CEPH_MDS_OP_LSSNAP;
7950
7951 InodeRef& diri = dirp->inode;
7952
7953 MetaRequest *req = new MetaRequest(op);
7954 filepath path;
7955 diri->make_nosnap_relative_path(path);
7956 req->set_filepath(path);
7957 req->set_inode(diri.get());
7958 req->head.args.readdir.frag = fg;
7959 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7960 if (dirp->last_name.length()) {
94b18763 7961 req->path2.set_path(dirp->last_name);
7c673cae
FG
7962 } else if (dirp->hash_order()) {
7963 req->head.args.readdir.offset_hash = dirp->offset_high();
7964 }
7965 req->dirp = dirp;
7966
7967 bufferlist dirbl;
7968 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7969
7970 if (res == -EAGAIN) {
11fdf7f2 7971 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7c673cae
FG
7972 _readdir_rechoose_frag(dirp);
7973 return _readdir_get_frag(dirp);
7974 }
7975
7976 if (res == 0) {
11fdf7f2 7977 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7c673cae
FG
7978 << " size " << dirp->buffer.size() << dendl;
7979 } else {
11fdf7f2 7980 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7c673cae
FG
7981 dirp->set_end();
7982 }
7983
7984 return res;
7985}
7986
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// Dentry pointers by their readdir offset (using the stream-aware
// fpos comparison), so we can binary-search for a resume position.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7992
// Serve a readdir from the locally cached, complete-and-ordered dentry set
// (Dir::readdir_cache) instead of round-tripping to the MDS.
//
// Invokes `cb` once per entry with client_lock *dropped* for the duration of
// the callback. Returns 0 at end of directory, >0 if the callback asked to
// stop early, <0 on error; -EAGAIN means the cache became unusable mid-walk
// and the caller must fall back to the MDS path.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first entry at/after our current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // cache validity can be lost at any blocking point; recheck each pass
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry lease predates the current shared cap generation; untrustworthy
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // remember position by index: _getattr() may block and reshuffle the cache
    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);   // caller-visible ref; callback consumer must release
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock around the user callback — it may re-enter the client
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;   // callback asked to stop after this entry
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8082
// Core readdir engine. Walks the directory stream `d`, invoking `cb` for each
// entry (including synthetic "." and ".."), with statx fields selected by
// `want`/`flags`. If `getref`, each callback receives a referenced Inode*.
//
// Entry source, in order of preference:
//   1. the local complete+ordered dentry cache (_readdir_cache_cb),
//   2. per-fragment buffers fetched from the MDS (_readdir_get_frag).
//
// Returns 0 at end of directory, >0 if the callback stopped the walk early,
// <0 on error. client_lock is dropped around each callback invocation.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "." from the directory inode itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." from the parent (or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;   // no parent link (e.g. root): ".." is the dir itself
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable; fall through to the MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;   // attrs in the fresh reply are already current
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // resume inside the buffered frag at our current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // frag was returned in chunks; fetch the next chunk of the same frag
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // we walked the whole directory; if nothing changed underneath us,
    // mark the inode's dentry set complete (and ordered) for future caching
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();   // unreachable: the loop always returns
  return 0;
}
8275
8276
// Public API: POSIX-style readdir_r. Thin wrapper over readdirplus_r with
// no statx output, no wanted-attr mask, and no inode ref returned.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8281
8282/*
8283 * readdirplus_r
8284 *
8285 * returns
8286 * 1 if we got a dirent
8287 * 0 for end of directory
8288 * <0 on error
8289 */
8290
// Callback context for readdir()/readdirplus_r(): captures exactly one
// directory entry per readdir_r_cb() invocation.
struct single_readdir {
  struct dirent *de;       // caller-provided dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // inode handed back by the walk (if getref)
  bool full;               // set once an entry has been captured
};
8297
8298static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8299 struct ceph_statx *stx, off_t off,
8300 Inode *in)
8301{
8302 single_readdir *c = static_cast<single_readdir *>(p);
8303
8304 if (c->full)
8305 return -1; // already filled this dirent
8306
8307 *c->de = *de;
8308 if (c->stx)
8309 *c->stx = *stx;
8310 c->inode = in;
8311 c->full = true;
8312 return 1;
8313}
8314
// Public API: POSIX-style readdir. Returns a pointer to the next entry,
// NULL at end of directory, or NULL with errno set on error.
// NOTE(review): returns a pointer to function-local `static` storage, like
// libc readdir(3) — not thread-safe and overwritten by the next call;
// callers needing reentrancy must use readdir_r()/readdirplus_r().
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;   // shared static result buffer (see note above)
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;   // end of directory
}
8337
8338int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8339 struct ceph_statx *stx, unsigned want,
8340 unsigned flags, Inode **out)
8341{
8342 single_readdir sr;
8343 sr.de = de;
8344 sr.stx = stx;
8345 sr.inode = NULL;
8346 sr.full = false;
8347
8348 // our callback fills the dirent and sets sr.full=true on first
8349 // call, and returns -1 the second time around.
8350 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8351 if (r < -1)
8352 return r;
8353 if (out)
8354 *out = sr.inode;
8355 if (sr.full)
8356 return 1;
8357 return 0;
8358}
8359
8360
8361/* getdents */
// Callback context for _getdents(): packs entries into a caller-supplied
// flat buffer, either as whole struct dirent records or as bare names.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: NUL-terminated names only
};
8368
8369static int _readdir_getdent_cb(void *p, struct dirent *de,
8370 struct ceph_statx *stx, off_t off, Inode *in)
8371{
8372 struct getdents_result *c = static_cast<getdents_result *>(p);
8373
8374 int dlen;
8375 if (c->fullent)
8376 dlen = sizeof(*de);
8377 else
8378 dlen = strlen(de->d_name) + 1;
8379
8380 if (c->pos + dlen > c->buflen)
8381 return -1; // doesn't fit
8382
8383 if (c->fullent) {
8384 memcpy(c->buf + c->pos, de, sizeof(*de));
8385 } else {
8386 memcpy(c->buf + c->pos, de->d_name, dlen);
8387 }
8388 c->pos += dlen;
8389 return 0;
8390}
8391
8392int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8393{
8394 getdents_result gr;
8395 gr.buf = buf;
8396 gr.buflen = buflen;
8397 gr.fullent = fullent;
8398 gr.pos = 0;
8399
8400 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8401
8402 if (r < 0) { // some error
8403 if (r == -1) { // buffer ran out of space
8404 if (gr.pos) { // but we got some entries already!
8405 return gr.pos;
8406 } // or we need a larger buffer
8407 return -ERANGE;
8408 } else { // actual error, return it
8409 return r;
8410 }
8411 }
8412 return gr.pos;
8413}
8414
8415
8416/* getdir */
// Callback context for Client::getdir(): accumulates entry names and counts
// how many were seen.
struct getdir_result {
  list<string> *contents;  // caller's output list of names
  int num;                 // number of entries appended
};
8421
8422static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8423{
8424 getdir_result *r = static_cast<getdir_result *>(p);
8425
8426 r->contents->push_back(de->d_name);
8427 r->num++;
8428 return 0;
8429}
8430
8431int Client::getdir(const char *relpath, list<string>& contents,
8432 const UserPerm& perms)
8433{
8434 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8435 {
11fdf7f2 8436 std::lock_guard lock(client_lock);
7c673cae
FG
8437 tout(cct) << "getdir" << std::endl;
8438 tout(cct) << relpath << std::endl;
8439 }
8440
8441 dir_result_t *d;
8442 int r = opendir(relpath, &d, perms);
8443 if (r < 0)
8444 return r;
8445
8446 getdir_result gr;
8447 gr.contents = &contents;
8448 gr.num = 0;
8449 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8450
8451 closedir(d);
8452
8453 if (r < 0)
8454 return r;
8455 return gr.num;
8456}
8457
8458
8459/****** file i/o **********/
8460int Client::open(const char *relpath, int flags, const UserPerm& perms,
8461 mode_t mode, int stripe_unit, int stripe_count,
8462 int object_size, const char *data_pool)
8463{
8464 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
11fdf7f2 8465 std::lock_guard lock(client_lock);
7c673cae
FG
8466 tout(cct) << "open" << std::endl;
8467 tout(cct) << relpath << std::endl;
8468 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8469
181888fb
FG
8470 if (unmounting)
8471 return -ENOTCONN;
8472
7c673cae
FG
8473 Fh *fh = NULL;
8474
8475#if defined(__linux__) && defined(O_PATH)
8476 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8477 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8478 * in kernel (fs/open.c). */
8479 if (flags & O_PATH)
8480 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8481#endif
8482
8483 filepath path(relpath);
8484 InodeRef in;
8485 bool created = false;
8486 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8487 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8488 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8489
8490 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8491 return -EEXIST;
8492
8493#if defined(__linux__) && defined(O_PATH)
8494 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8495#else
8496 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8497#endif
8498 return -ELOOP;
8499
8500 if (r == -ENOENT && (flags & O_CREAT)) {
8501 filepath dirpath = path;
8502 string dname = dirpath.last_dentry();
8503 dirpath.pop_dentry();
8504 InodeRef dir;
8505 r = path_walk(dirpath, &dir, perms, true,
8506 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8507 if (r < 0)
8508 goto out;
8509 if (cct->_conf->client_permissions) {
8510 r = may_create(dir.get(), perms);
8511 if (r < 0)
8512 goto out;
8513 }
8514 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8515 stripe_count, object_size, data_pool, &created, perms);
8516 }
8517 if (r < 0)
8518 goto out;
8519
8520 if (!created) {
8521 // posix says we can only check permissions of existing files
8522 if (cct->_conf->client_permissions) {
8523 r = may_open(in.get(), flags, perms);
8524 if (r < 0)
8525 goto out;
8526 }
8527 }
8528
8529 if (!fh)
8530 r = _open(in.get(), flags, mode, &fh, perms);
8531 if (r >= 0) {
8532 // allocate a integer file descriptor
11fdf7f2 8533 ceph_assert(fh);
7c673cae 8534 r = get_fd();
11fdf7f2 8535 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
8536 fd_map[r] = fh;
8537 }
8538
8539 out:
8540 tout(cct) << r << std::endl;
8541 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8542 return r;
8543}
8544
// Public API convenience overload: open without explicit striping parameters.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8550
8551int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8552 const UserPerm& perms)
8553{
11fdf7f2
TL
8554 std::lock_guard lock(client_lock);
8555 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8556
181888fb
FG
8557 if (unmounting)
8558 return -ENOTCONN;
8559
7c673cae
FG
8560 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8561 filepath path(ino);
8562 req->set_filepath(path);
8563
8564 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8565 char f[30];
8566 sprintf(f, "%u", h);
8567 filepath path2(dirino);
8568 path2.push_dentry(string(f));
8569 req->set_filepath2(path2);
8570
8571 int r = make_request(req, perms, NULL, NULL,
8572 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8573 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8574 return r;
8575}
8576
8577
/**
 * Load an inode into the local cache by inode number.
 *
 * If the `inode` out-pointer is non-NULL, also take a reference on the
 * resulting Inode object (via _ll_get) in the same operation, so the caller
 * can safely assume the inode is still there after return. The caller is
 * then responsible for releasing that reference.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // no affinity: any in-MDS can service LOOKUPINO
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // a successful reply must have inserted the inode into our cache
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);   // pin for the caller
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8607
1adf2230
AA
// Public, locking wrapper around _lookup_ino(); see that function for the
// reference-taking contract on *inode.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8613
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  // NOTE(review): unlike _lookup_ino()/_lookup_name(), there is no
  // `unmounting` guard here — confirm all callers check before invoking.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);   // pin for the caller; they must release it
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8642
7c673cae
FG
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // path2 = the directory, path = the child; the MDS replies with the
  // dentry linking them, which the reply path inserts into our cache
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8664
1adf2230
AA
// Public, locking wrapper around _lookup_name().
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae 8670
// Build a new open-file handle (Fh) for `in` with the given open flags and
// ceph file mode, and configure its readahead window from client config and
// the file's striping layout. Caller owns the returned Fh.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes are immutable; track opens via snap_cap_refs instead
    // of the normal cap machinery
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	    << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  // cap readahead by the configured byte limit and/or stripe-period count
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the layout's period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8702
// Tear down an open-file handle: drop any delegation, release the open ref
// (flushing dirty data and re-evaluating caps for regular inodes), release
// file locks, and surface any asynchronous write-back error to the caller.
// Returns 0 or the deferred async error; f is freed (via _put_fh).
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush dirty data and let the cap
      // state machine release anything no longer wanted
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: opens are tracked by a simple ref count
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8737
8738void Client::_put_fh(Fh *f)
8739{
8740 int left = f->put();
8741 if (!left) {
8742 delete f;
8743 }
8744}
8745
// Internal open on an already-resolved inode. Registers the open mode,
// satisfies the open from already-held caps when possible, otherwise sends
// CEPH_MDS_OP_OPEN to the MDS. On success (result >= 0) creates an Fh in
// *fhp (if fhp is non-NULL); on failure the open ref is dropped again.
// Snapshot inodes reject any write-flavored open with -EROFS.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;   // snapshots are read-only
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;   // creation handled elsewhere
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);   // undo the pending-open note from above
  }

  trim_cache();

  return result;
}
8826
// Re-acquire file caps for an inode whose caps went stale. If we still hold
// some caps and either want no write caps or still have an auth cap, a local
// cap check suffices; otherwise re-issue an OPEN to the MDS with flags
// derived from the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map wanted caps back onto an open access mode
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8864
// Public API: close a file descriptor. Releases the Fh (flushing and
// surfacing any async write-back error), removes the fd mapping, and
// recycles the fd number. Returns 0 or the deferred error from release.
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);   // may report an earlier async flush error
  fd_map.erase(fd);
  put_fd(fd);                  // make the fd number reusable
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
8884
8885
8886// ------------
8887// read, write
8888
// Public API: reposition the file offset of descriptor `fd` (POSIX lseek).
// Returns the new offset or a negative error. O_PATH descriptors (Linux)
// cannot be seeked.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;   // O_PATH fds support no I/O operations
#endif
  return _lseek(f, offset, whence);
}
8909
// Core seek implementation (caller holds client_lock and has validated f).
// SEEK_END / SEEK_DATA / SEEK_HOLE first refresh the file size from the MDS
// so the computed position reflects the current EOF.  SEEK_DATA/SEEK_HOLE
// are only coarse approximations: the file is treated as one contiguous
// data extent ending at in->size (no real hole map is consulted).
// Returns the new position or a negative errno.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;  // does this whence need a fresh size?
  loff_t pos = -1;

  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // revalidate the size before computing an EOF-relative position
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // offset past EOF has no data -> ENXIO, per lseek(2)
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    // the only "hole" we report is the implicit one at EOF
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  // a computed negative position (e.g. SEEK_CUR with a large negative
  // offset) is invalid; otherwise commit it to the handle
  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
8983
8984
// Acquire exclusive access to f->pos.  If the position is already locked,
// or other waiters are already queued, enqueue our own condition variable
// and block until the lock is free AND we are at the head of the FIFO
// queue -- this keeps pos access fair among concurrent callers.
// Caller must hold client_lock; it is released while waiting (the
// unique_lock adopts it and releases ownership again afterwards).
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // adopt client_lock (already held) so cond.wait() can drop/retake it
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // give client_lock ownership back to the caller without unlocking
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9005
9006void Client::unlock_fh_pos(Fh *f)
9007{
11fdf7f2 9008 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
9009 f->pos_locked = false;
9010}
9011
// Migrate an inode's inline data out to its first RADOS object.
// Two object operations are issued: a non-exclusive create, then a write
// guarded by a cmpxattr on "inline_version" so that of several racing
// clients only one actually performs the uninline.  `onfinish` is
// completed with the result of the guarded write; if the inode has no
// inline data it is completed immediately with 0.  Always returns 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // name of the file's first stripe object: <ino-in-hex>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // ensure the object exists (create(false) == non-exclusive, fire-and-forget)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard: only proceed if our inline_version is newer than the one
  // already recorded on the object
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
9056
9057//
9058
9059// blocking osd interface
9060
9061int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9062{
11fdf7f2 9063 std::lock_guard lock(client_lock);
7c673cae
FG
9064 tout(cct) << "read" << std::endl;
9065 tout(cct) << fd << std::endl;
9066 tout(cct) << size << std::endl;
9067 tout(cct) << offset << std::endl;
9068
181888fb
FG
9069 if (unmounting)
9070 return -ENOTCONN;
9071
7c673cae
FG
9072 Fh *f = get_filehandle(fd);
9073 if (!f)
9074 return -EBADF;
9075#if defined(__linux__) && defined(O_PATH)
9076 if (f->flags & O_PATH)
9077 return -EBADF;
9078#endif
9079 bufferlist bl;
11fdf7f2
TL
9080 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9081 size = std::min(size, (loff_t)INT_MAX);
7c673cae
FG
9082 int r = _read(f, offset, size, &bl);
9083 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9084 if (r >= 0) {
9f95a23c 9085 bl.begin().copy(bl.length(), buf);
7c673cae
FG
9086 r = bl.length();
9087 }
9088 return r;
9089}
9090
9091int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9092{
9093 if (iovcnt < 0)
9094 return -EINVAL;
9095 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9096}
9097
11fdf7f2 9098int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
7c673cae 9099{
11fdf7f2
TL
9100 int want, have = 0;
9101 bool movepos = false;
9102 std::unique_ptr<C_SaferCond> onuninline;
9103 int64_t r = 0;
9104 const auto& conf = cct->_conf;
7c673cae 9105 Inode *in = f->inode.get();
11fdf7f2
TL
9106 utime_t lat;
9107 utime_t start = ceph_clock_now();
7c673cae
FG
9108
9109 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9110 return -EBADF;
9111 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9112
7c673cae
FG
9113 if (offset < 0) {
9114 lock_fh_pos(f);
9115 offset = f->pos;
9116 movepos = true;
9117 }
9118 loff_t start_pos = offset;
9119
9120 if (in->inline_version == 0) {
11fdf7f2 9121 r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
c07f9fc5 9122 if (r < 0) {
11fdf7f2 9123 goto done;
c07f9fc5 9124 }
11fdf7f2 9125 ceph_assert(in->inline_version > 0);
7c673cae
FG
9126 }
9127
9128retry:
11fdf7f2
TL
9129 if (f->mode & CEPH_FILE_MODE_LAZY)
9130 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9131 else
9132 want = CEPH_CAP_FILE_CACHE;
9133 r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
c07f9fc5 9134 if (r < 0) {
11fdf7f2 9135 goto done;
c07f9fc5 9136 }
7c673cae 9137 if (f->flags & O_DIRECT)
11fdf7f2 9138 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
7c673cae
FG
9139
9140 if (in->inline_version < CEPH_INLINE_NONE) {
9141 if (!(have & CEPH_CAP_FILE_CACHE)) {
11fdf7f2
TL
9142 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9143 uninline_data(in, onuninline.get());
7c673cae
FG
9144 } else {
9145 uint32_t len = in->inline_data.length();
7c673cae
FG
9146 uint64_t endoff = offset + size;
9147 if (endoff > in->size)
9148 endoff = in->size;
9149
9150 if (offset < len) {
9151 if (endoff <= len) {
9152 bl->substr_of(in->inline_data, offset, endoff - offset);
9153 } else {
9154 bl->substr_of(in->inline_data, offset, len - offset);
9155 bl->append_zero(endoff - len);
9156 }
11fdf7f2 9157 r = endoff - offset;
7c673cae
FG
9158 } else if ((uint64_t)offset < endoff) {
9159 bl->append_zero(endoff - offset);
11fdf7f2
TL
9160 r = endoff - offset;
9161 } else {
9162 r = 0;
7c673cae 9163 }
7c673cae
FG
9164 goto success;
9165 }
9166 }
9167
9168 if (!conf->client_debug_force_sync_read &&
11fdf7f2
TL
9169 conf->client_oc &&
9170 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
7c673cae
FG
9171
9172 if (f->flags & O_RSYNC) {
9173 _flush_range(in, offset, size);
9174 }
9175 r = _read_async(f, offset, size, bl);
9176 if (r < 0)
9177 goto done;
9178 } else {
9179 if (f->flags & O_DIRECT)
9180 _flush_range(in, offset, size);
9181
9182 bool checkeof = false;
9183 r = _read_sync(f, offset, size, bl, &checkeof);
9184 if (r < 0)
9185 goto done;
9186 if (checkeof) {
9187 offset += r;
9188 size -= r;
9189
9190 put_cap_ref(in, CEPH_CAP_FILE_RD);
9191 have = 0;
9192 // reverify size
9193 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9194 if (r < 0)
9195 goto done;
9196
9197 // eof? short read.
9198 if ((uint64_t)offset < in->size)
9199 goto retry;
9200 }
9201 }
9202
9203success:
11fdf7f2 9204 ceph_assert(r >= 0);
7c673cae
FG
9205 if (movepos) {
9206 // adjust fd pos
11fdf7f2 9207 f->pos = start_pos + r;
7c673cae 9208 }
11fdf7f2
TL
9209
9210 lat = ceph_clock_now();
9211 lat -= start;
9212 logger->tinc(l_c_read, lat);
7c673cae
FG
9213
9214done:
9215 // done!
11fdf7f2 9216
7c673cae 9217 if (onuninline) {
9f95a23c 9218 client_lock.unlock();
11fdf7f2 9219 int ret = onuninline->wait();
9f95a23c 9220 client_lock.lock();
11fdf7f2 9221 if (ret >= 0 || ret == -ECANCELED) {
7c673cae
FG
9222 in->inline_data.clear();
9223 in->inline_version = CEPH_INLINE_NONE;
28e407b8 9224 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
9225 check_caps(in, 0);
9226 } else
11fdf7f2 9227 r = ret;
7c673cae 9228 }
11fdf7f2 9229 if (have) {
7c673cae 9230 put_cap_ref(in, CEPH_CAP_FILE_RD);
11fdf7f2
TL
9231 }
9232 if (movepos) {
9233 unlock_fh_pos(f);
9234 }
9235 return r;
7c673cae
FG
9236}
9237
// Completion for a background readahead request.  The constructor pins the
// Fh and bumps the pending-readahead counter; the destructor undoes both.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

// Drop the cap references taken when the readahead was initiated.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9253
// Read through the object cacher (caller holds client_lock and the RD cap).
// The read is trimmed to the known file size; after it completes, a
// readahead window may be computed and issued as a detached background
// read whose completion (C_Readahead) drops the extra cap refs.
// Returns bytes read or a negative errno.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the OSD read, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: populate the cache only, nobody consumes the data here
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        // r2 != 0 means it was served from cache; completion won't fire
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
9308
// Synchronous read straight from the OSDs, bypassing the object cacher.
// Loops until `len` bytes are gathered or a short read is hit; a short
// read inside the known file size is zero-filled (sparse object), while a
// short read at/after the known size sets *checkeof so the caller can
// revalidate the size and retry.  Returns bytes read or a negative errno.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    // block on the OSD reply without holding client_lock
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
        if (left == 0)
          return read;
      }

      // we hit (apparent) EOF -- let the caller double-check the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9368
9369
9370/*
9371 * we keep count of uncommitted sync writes on the inode, so that
9372 * fsync can DDRT.
9373 */
9374void Client::_sync_write_commit(Inode *in)
9375{
11fdf7f2 9376 ceph_assert(unsafe_sync_write > 0);
7c673cae
FG
9377 unsafe_sync_write--;
9378
9379 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9380
11fdf7f2 9381 ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
7c673cae 9382 if (unsafe_sync_write == 0 && unmounting) {
11fdf7f2 9383 ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
9f95a23c 9384 mount_cond.notify_all();
7c673cae
FG
9385 }
9386}
9387
9388int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9389{
11fdf7f2 9390 std::lock_guard lock(client_lock);
7c673cae
FG
9391 tout(cct) << "write" << std::endl;
9392 tout(cct) << fd << std::endl;
9393 tout(cct) << size << std::endl;
9394 tout(cct) << offset << std::endl;
9395
181888fb
FG
9396 if (unmounting)
9397 return -ENOTCONN;
9398
7c673cae
FG
9399 Fh *fh = get_filehandle(fd);
9400 if (!fh)
9401 return -EBADF;
9402#if defined(__linux__) && defined(O_PATH)
9403 if (fh->flags & O_PATH)
9404 return -EBADF;
9405#endif
11fdf7f2
TL
9406 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9407 size = std::min(size, (loff_t)INT_MAX);
9408 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
9409 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9410 return r;
9411}
9412
9413int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9414{
9415 if (iovcnt < 0)
9416 return -EINVAL;
9417 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9418}
9419
11fdf7f2
TL
// Shared scatter/gather I/O implementation (caller holds client_lock and
// has validated fh).  Sums the iovec lengths, optionally clamps the total
// to INT_MAX (for the int-returning public APIs), then dispatches to
// _write (gather) or _read + manual scatter into the iovecs.
// Returns bytes transferred or a negative errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write gathers directly from the iovec array
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the read data across the iovecs
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}
9466
11fdf7f2
TL
9467int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9468{
9469 std::lock_guard lock(client_lock);
9470 tout(cct) << fd << std::endl;
9471 tout(cct) << offset << std::endl;
9472
9473 if (unmounting)
9474 return -ENOTCONN;
9475
9476 Fh *fh = get_filehandle(fd);
9477 if (!fh)
9478 return -EBADF;
9479 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9480}
9481
// Core write path (caller holds client_lock).  Input comes either from a
// flat buffer (`buf`) or an iovec array (`iov`/`iovcnt`) -- exactly one of
// them is used.  Handles O_APPEND / implicit file-position writes, quota
// and pool-full checks, setuid/setgid stripping, inline data (update in
// place or migrate out via uninline_data), and dispatches to the buffered
// (object cacher) or synchronous OSD write path.  On success the size,
// mtime/ctime and change_attr are updated and dirty caps are marked.
// Returns bytes written or a negative errno.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;  // new file position to commit on success (0 = unused)

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means "unknown" -- fetch it before deciding
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed below to inspect in->mode for setuid/setgid
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the buffer cache even if we hold the caps
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result would no longer fit inline (or we lack the cap to mutate
      // our copy): migrate the inline data out, then write normally
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // update the inline data in place:
      // keep the tail beyond the write, splice/zero-extend, then append
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // block on the OSD commit without holding client_lock
    client_lock.unlock();
    onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // commit the new file position computed for an implicit-pos write
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // wait for the uninline migration to land before returning
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9712
9713int Client::_flush(Fh *f)
9714{
9715 Inode *in = f->inode.get();
9716 int err = f->take_async_err();
9717 if (err != 0) {
9718 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9719 << cpp_strerror(err) << dendl;
9720 } else {
9721 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9722 }
9723
9724 return err;
9725}
9726
// truncate(2) analogue: resize the file at `relpath` to `length` via the
// setattrx path.  Only stx_size is populated; the CEPH_SETATTR_SIZE mask
// ensures the remaining (uninitialized) statx fields are ignored.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9733
9734int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9735{
11fdf7f2
TL
9736 std::lock_guard lock(client_lock);
9737 tout(cct) << __func__ << std::endl;
7c673cae
FG
9738 tout(cct) << fd << std::endl;
9739 tout(cct) << length << std::endl;
9740
181888fb
FG
9741 if (unmounting)
9742 return -ENOTCONN;
9743
7c673cae
FG
9744 Fh *f = get_filehandle(fd);
9745 if (!f)
9746 return -EBADF;
9747#if defined(__linux__) && defined(O_PATH)
9748 if (f->flags & O_PATH)
9749 return -EBADF;
9750#endif
9751 struct stat attr;
9752 attr.st_size = length;
9753 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9754}
9755
9756int Client::fsync(int fd, bool syncdataonly)
9757{
11fdf7f2 9758 std::lock_guard lock(client_lock);
7c673cae
FG
9759 tout(cct) << "fsync" << std::endl;
9760 tout(cct) << fd << std::endl;
9761 tout(cct) << syncdataonly << std::endl;
9762
181888fb
FG
9763 if (unmounting)
9764 return -ENOTCONN;
9765
7c673cae
FG
9766 Fh *f = get_filehandle(fd);
9767 if (!f)
9768 return -EBADF;
9769#if defined(__linux__) && defined(O_PATH)
9770 if (f->flags & O_PATH)
9771 return -EBADF;
9772#endif
9773 int r = _fsync(f, syncdataonly);
9774 if (r == 0) {
9775 // The IOs in this fsync were okay, but maybe something happened
9776 // in the background that we shoudl be reporting?
9777 r = f->take_async_err();
1adf2230 9778 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
7c673cae
FG
9779 << ") = 0, async_err = " << r << dendl;
9780 } else {
9781 // Assume that an error we encountered during fsync, even reported
9782 // synchronously, would also have applied the error to the Fh, and we
9783 // should clear it here to avoid returning the same error again on next
9784 // call.
1adf2230 9785 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
7c673cae
FG
9786 << r << dendl;
9787 f->take_async_err();
9788 }
9789 return r;
9790}
9791
// Flush an inode to stable storage (caller holds client_lock).
//  - data: via the object cacher when enabled (waiting on a real flush
//    completion), otherwise by waiting for outstanding BUFFER cap refs;
//  - metadata (unless syncdataonly): flush dirty caps synchronously and
//    wait for any unsafe MDS requests to become safe.
// Returns 0 or a negative errno from the data flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty caps to the MDS now; remember the flush tid to wait on
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // ask the MDS to flush its log, then wait for our newest unsafe
    // request (which implies all older ones) to become safe
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9858
// Fh-level fsync: simply delegates to the inode-level implementation.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9864
9865int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9866{
11fdf7f2 9867 std::lock_guard lock(client_lock);
7c673cae
FG
9868 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9869 tout(cct) << fd << std::endl;
9870
181888fb
FG
9871 if (unmounting)
9872 return -ENOTCONN;
9873
7c673cae
FG
9874 Fh *f = get_filehandle(fd);
9875 if (!f)
9876 return -EBADF;
9877 int r = _getattr(f->inode, mask, perms);
9878 if (r < 0)
9879 return r;
9880 fill_stat(f->inode, stbuf, NULL);
1adf2230 9881 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
9882 return r;
9883}
9884
9885int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9886 unsigned int want, unsigned int flags)
9887{
11fdf7f2 9888 std::lock_guard lock(client_lock);
7c673cae
FG
9889 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9890 tout(cct) << fd << std::endl;
9891
181888fb
FG
9892 if (unmounting)
9893 return -ENOTCONN;
9894
7c673cae
FG
9895 Fh *f = get_filehandle(fd);
9896 if (!f)
9897 return -EBADF;
9898
9899 unsigned mask = statx_to_mask(flags, want);
9900
9901 int r = 0;
94b18763 9902 if (mask && !f->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
9903 r = _getattr(f->inode, mask, perms);
9904 if (r < 0) {
9905 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9906 return r;
9907 }
9908 }
9909
9910 fill_statx(f->inode, mask, stx);
9911 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9912 return r;
9913}
9914
9915// not written yet, but i want to link!
9916
// chdir(2) analogue: resolve `relpath`, verify it is a directory, make it
// the client's cwd, and return the new absolute cwd path in `new_cwd`.
// Returns 0 or a negative errno (-ENOTDIR, path_walk errors, -ENOTCONN).
int Client::chdir(const char *relpath, std::string &new_cwd,
                  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  if (!(in.get()->is_dir()))
    return -ENOTDIR;

  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9943
b5b8bbf5 9944void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9945{
9946 filepath path;
11fdf7f2 9947 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
7c673cae
FG
9948
9949 Inode *in = cwd.get();
9950 while (in != root) {
11fdf7f2 9951 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
7c673cae
FG
9952
9953 // A cwd or ancester is unlinked
11fdf7f2 9954 if (in->dentries.empty()) {
7c673cae
FG
9955 return;
9956 }
9957
9958 Dentry *dn = in->get_first_parent();
9959
9960
9961 if (!dn) {
9962 // look it up
11fdf7f2 9963 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
7c673cae
FG
9964 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9965 filepath path(in->ino);
9966 req->set_filepath(path);
9967 req->set_inode(in);
9968 int res = make_request(req, perms);
9969 if (res < 0)
9970 break;
9971
9972 // start over
9973 path = filepath();
9974 in = cwd.get();
9975 continue;
9976 }
9977 path.push_front_dentry(dn->name);
9978 in = dn->dir->parent_inode;
9979 }
9980 dir = "/";
9981 dir += path.get_path();
9982}
9983
b5b8bbf5
FG
9984void Client::getcwd(string& dir, const UserPerm& perms)
9985{
11fdf7f2 9986 std::lock_guard l(client_lock);
181888fb
FG
9987 if (!unmounting)
9988 _getcwd(dir, perms);
b5b8bbf5
FG
9989}
9990
7c673cae
FG
9991int Client::statfs(const char *path, struct statvfs *stbuf,
9992 const UserPerm& perms)
9993{
11fdf7f2
TL
9994 std::lock_guard l(client_lock);
9995 tout(cct) << __func__ << std::endl;
91327a77 9996 unsigned long int total_files_on_fs;
7c673cae 9997
181888fb
FG
9998 if (unmounting)
9999 return -ENOTCONN;
10000
7c673cae
FG
10001 ceph_statfs stats;
10002 C_SaferCond cond;
d2e6a577
FG
10003
10004 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10005 if (data_pools.size() == 1) {
10006 objecter->get_fs_stats(stats, data_pools[0], &cond);
10007 } else {
10008 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10009 }
7c673cae 10010
9f95a23c 10011 client_lock.unlock();
7c673cae 10012 int rval = cond.wait();
91327a77
AA
10013 assert(root);
10014 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
9f95a23c 10015 client_lock.lock();
7c673cae
FG
10016
10017 if (rval < 0) {
10018 ldout(cct, 1) << "underlying call to statfs returned error: "
10019 << cpp_strerror(rval)
10020 << dendl;
10021 return rval;
10022 }
10023
10024 memset(stbuf, 0, sizeof(*stbuf));
10025
10026 /*
10027 * we're going to set a block size of 4MB so we can represent larger
10028 * FSes without overflowing. Additionally convert the space
10029 * measurements from KB to bytes while making them in terms of
10030 * blocks. We use 4MB only because it is big enough, and because it
10031 * actually *is* the (ceph) default block size.
10032 */
10033 const int CEPH_BLOCK_SHIFT = 22;
10034 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10035 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
10036 stbuf->f_files = total_files_on_fs;
10037 stbuf->f_ffree = 0;
7c673cae
FG
10038 stbuf->f_favail = -1;
10039 stbuf->f_fsid = -1; // ??
10040 stbuf->f_flag = 0; // ??
10041 stbuf->f_namemax = NAME_MAX;
10042
10043 // Usually quota_root will == root_ancestor, but if the mount root has no
10044 // quota but we can see a parent of it that does have a quota, we'll
10045 // respect that one instead.
11fdf7f2 10046 ceph_assert(root != nullptr);
7c673cae
FG
10047 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10048
10049 // get_quota_root should always give us something
10050 // because client quotas are always enabled
11fdf7f2 10051 ceph_assert(quota_root != nullptr);
7c673cae
FG
10052
10053 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10054
10055 // Skip the getattr if any sessions are stale, as we don't want to
10056 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10057 // is unhealthy.
10058 if (!_any_stale_sessions()) {
10059 int r = _getattr(quota_root, 0, perms, true);
10060 if (r != 0) {
10061 // Ignore return value: error getting latest inode metadata is not a good
10062 // reason to break "df".
10063 lderr(cct) << "Error in getattr on quota root 0x"
10064 << std::hex << quota_root->ino << std::dec
10065 << " statfs result may be outdated" << dendl;
10066 }
10067 }
10068
10069 // Special case: if there is a size quota set on the Inode acting
10070 // as the root for this client mount, then report the quota status
10071 // as the filesystem statistics.
10072 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10073 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10074 // It is possible for a quota to be exceeded: arithmetic here must
10075 // handle case where used > total.
10076 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10077
10078 stbuf->f_blocks = total;
10079 stbuf->f_bfree = free;
10080 stbuf->f_bavail = free;
10081 } else {
d2e6a577 10082 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10083 // multiple pools may be used without one filesystem namespace via
10084 // layouts, this is the most correct thing we can do.
10085 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10086 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10087 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10088 }
10089
10090 return rval;
10091}
10092
10093int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10094 struct flock *fl, uint64_t owner, bool removing)
10095{
11fdf7f2 10096 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10097 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10098 << " type " << fl->l_type << " owner " << owner
10099 << " " << fl->l_start << "~" << fl->l_len << dendl;
10100
10101 int lock_cmd;
10102 if (F_RDLCK == fl->l_type)
10103 lock_cmd = CEPH_LOCK_SHARED;
10104 else if (F_WRLCK == fl->l_type)
10105 lock_cmd = CEPH_LOCK_EXCL;
10106 else if (F_UNLCK == fl->l_type)
10107 lock_cmd = CEPH_LOCK_UNLOCK;
10108 else
10109 return -EIO;
10110
10111 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10112 sleep = 0;
10113
10114 /*
10115 * Set the most significant bit, so that MDS knows the 'owner'
10116 * is sufficient to identify the owner of lock. (old code uses
10117 * both 'owner' and 'pid')
10118 */
10119 owner |= (1ULL << 63);
10120
10121 MetaRequest *req = new MetaRequest(op);
10122 filepath path;
10123 in->make_nosnap_relative_path(path);
10124 req->set_filepath(path);
10125 req->set_inode(in);
10126
10127 req->head.args.filelock_change.rule = lock_type;
10128 req->head.args.filelock_change.type = lock_cmd;
10129 req->head.args.filelock_change.owner = owner;
10130 req->head.args.filelock_change.pid = fl->l_pid;
10131 req->head.args.filelock_change.start = fl->l_start;
10132 req->head.args.filelock_change.length = fl->l_len;
10133 req->head.args.filelock_change.wait = sleep;
10134
10135 int ret;
10136 bufferlist bl;
10137
10138 if (sleep && switch_interrupt_cb) {
10139 // enable interrupt
10140 switch_interrupt_cb(callback_handle, req->get());
10141 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10142 // disable interrupt
10143 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10144 if (ret == 0 && req->aborted()) {
10145 // effect of this lock request has been revoked by the 'lock intr' request
10146 ret = req->get_abort_code();
10147 }
7c673cae
FG
10148 put_request(req);
10149 } else {
10150 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10151 }
10152
10153 if (ret == 0) {
10154 if (op == CEPH_MDS_OP_GETFILELOCK) {
10155 ceph_filelock filelock;
11fdf7f2
TL
10156 auto p = bl.cbegin();
10157 decode(filelock, p);
7c673cae
FG
10158
10159 if (CEPH_LOCK_SHARED == filelock.type)
10160 fl->l_type = F_RDLCK;
10161 else if (CEPH_LOCK_EXCL == filelock.type)
10162 fl->l_type = F_WRLCK;
10163 else
10164 fl->l_type = F_UNLCK;
10165
10166 fl->l_whence = SEEK_SET;
10167 fl->l_start = filelock.start;
10168 fl->l_len = filelock.length;
10169 fl->l_pid = filelock.pid;
10170 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10171 ceph_lock_state_t *lock_state;
10172 if (lock_type == CEPH_LOCK_FCNTL) {
10173 if (!in->fcntl_locks)
11fdf7f2
TL
10174 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10175 lock_state = in->fcntl_locks.get();
7c673cae
FG
10176 } else if (lock_type == CEPH_LOCK_FLOCK) {
10177 if (!in->flock_locks)
11fdf7f2
TL
10178 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10179 lock_state = in->flock_locks.get();
7c673cae
FG
10180 } else {
10181 ceph_abort();
10182 return -EINVAL;
10183 }
10184 _update_lock_state(fl, owner, lock_state);
10185
10186 if (!removing) {
10187 if (lock_type == CEPH_LOCK_FCNTL) {
10188 if (!fh->fcntl_locks)
11fdf7f2
TL
10189 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10190 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10191 } else {
10192 if (!fh->flock_locks)
11fdf7f2
TL
10193 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10194 lock_state = fh->flock_locks.get();
7c673cae
FG
10195 }
10196 _update_lock_state(fl, owner, lock_state);
10197 }
10198 } else
10199 ceph_abort();
10200 }
10201 return ret;
10202}
10203
10204int Client::_interrupt_filelock(MetaRequest *req)
10205{
31f18b77
FG
10206 // Set abort code, but do not kick. The abort code prevents the request
10207 // from being re-sent.
10208 req->abort(-EINTR);
10209 if (req->mds < 0)
10210 return 0; // haven't sent the request
10211
7c673cae
FG
10212 Inode *in = req->inode();
10213
10214 int lock_type;
10215 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10216 lock_type = CEPH_LOCK_FLOCK_INTR;
10217 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10218 lock_type = CEPH_LOCK_FCNTL_INTR;
10219 else {
10220 ceph_abort();
10221 return -EINVAL;
10222 }
10223
10224 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10225 filepath path;
10226 in->make_nosnap_relative_path(path);
10227 intr_req->set_filepath(path);
10228 intr_req->set_inode(in);
10229 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10230 intr_req->head.args.filelock_change.rule = lock_type;
10231 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10232
10233 UserPerm perms(req->get_uid(), req->get_gid());
10234 return make_request(intr_req, perms, NULL, NULL, -1);
10235}
10236
10237void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10238{
10239 if (!in->fcntl_locks && !in->flock_locks)
10240 return;
10241
10242 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10243 encode(nr_fcntl_locks, bl);
7c673cae 10244 if (nr_fcntl_locks) {
11fdf7f2 10245 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10246 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10247 p != lock_state->held_locks.end();
10248 ++p)
11fdf7f2 10249 encode(p->second, bl);
7c673cae
FG
10250 }
10251
10252 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10253 encode(nr_flock_locks, bl);
7c673cae 10254 if (nr_flock_locks) {
11fdf7f2 10255 auto &lock_state = in->flock_locks;
7c673cae
FG
10256 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10257 p != lock_state->held_locks.end();
10258 ++p)
11fdf7f2 10259 encode(p->second, bl);
7c673cae
FG
10260 }
10261
11fdf7f2 10262 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10263 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10264}
10265
10266void Client::_release_filelocks(Fh *fh)
10267{
10268 if (!fh->fcntl_locks && !fh->flock_locks)
10269 return;
10270
10271 Inode *in = fh->inode.get();
11fdf7f2 10272 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10273
10274 list<pair<int, ceph_filelock> > to_release;
10275
10276 if (fh->fcntl_locks) {
11fdf7f2 10277 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10278 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10279 p != lock_state->held_locks.end();
10280 ++p)
10281 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10282 lock_state.reset();
7c673cae
FG
10283 }
10284 if (fh->flock_locks) {
11fdf7f2 10285 auto &lock_state = fh->flock_locks;
7c673cae
FG
10286 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10287 p != lock_state->held_locks.end();
10288 ++p)
10289 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10290 lock_state.reset();
7c673cae
FG
10291 }
10292
10293 if (to_release.empty())
10294 return;
10295
11fdf7f2
TL
10296 // mds has already released filelocks if session was closed.
10297 if (in->caps.empty())
10298 return;
10299
7c673cae
FG
10300 struct flock fl;
10301 memset(&fl, 0, sizeof(fl));
10302 fl.l_whence = SEEK_SET;
10303 fl.l_type = F_UNLCK;
10304
10305 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10306 p != to_release.end();
10307 ++p) {
10308 fl.l_start = p->second.start;
10309 fl.l_len = p->second.length;
10310 fl.l_pid = p->second.pid;
10311 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10312 p->second.owner, true);
10313 }
10314}
10315
10316void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10317 ceph_lock_state_t *lock_state)
10318{
10319 int lock_cmd;
10320 if (F_RDLCK == fl->l_type)
10321 lock_cmd = CEPH_LOCK_SHARED;
10322 else if (F_WRLCK == fl->l_type)
10323 lock_cmd = CEPH_LOCK_EXCL;
10324 else
10325 lock_cmd = CEPH_LOCK_UNLOCK;;
10326
10327 ceph_filelock filelock;
10328 filelock.start = fl->l_start;
10329 filelock.length = fl->l_len;
10330 filelock.client = 0;
10331 // see comment in _do_filelock()
10332 filelock.owner = owner | (1ULL << 63);
10333 filelock.pid = fl->l_pid;
10334 filelock.type = lock_cmd;
10335
10336 if (filelock.type == CEPH_LOCK_UNLOCK) {
10337 list<ceph_filelock> activated_locks;
10338 lock_state->remove_lock(filelock, activated_locks);
10339 } else {
10340 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10341 ceph_assert(r);
7c673cae
FG
10342 }
10343}
10344
10345int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10346{
10347 Inode *in = fh->inode.get();
10348 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10349 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10350 return ret;
10351}
10352
10353int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10354{
10355 Inode *in = fh->inode.get();
10356 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10357 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10358 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10359 return ret;
10360}
10361
10362int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10363{
10364 Inode *in = fh->inode.get();
10365 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10366
10367 int sleep = !(cmd & LOCK_NB);
10368 cmd &= ~LOCK_NB;
10369
10370 int type;
10371 switch (cmd) {
10372 case LOCK_SH:
10373 type = F_RDLCK;
10374 break;
10375 case LOCK_EX:
10376 type = F_WRLCK;
10377 break;
10378 case LOCK_UN:
10379 type = F_UNLCK;
10380 break;
10381 default:
10382 return -EINVAL;
10383 }
10384
10385 struct flock fl;
10386 memset(&fl, 0, sizeof(fl));
10387 fl.l_type = type;
10388 fl.l_whence = SEEK_SET;
10389
10390 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10391 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10392 return ret;
10393}
10394
10395int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10396{
10397 /* Since the only thing this does is wrap a call to statfs, and
10398 statfs takes a lock, it doesn't seem we have a need to split it
10399 out. */
10400 return statfs(0, stbuf, perms);
10401}
10402
10403void Client::ll_register_callbacks(struct client_callback_args *args)
10404{
10405 if (!args)
10406 return;
11fdf7f2
TL
10407 std::lock_guard l(client_lock);
10408 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
10409 << " invalidate_ino_cb " << args->ino_cb
10410 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
10411 << " switch_interrupt_cb " << args->switch_intr_cb
10412 << " remount_cb " << args->remount_cb
10413 << dendl;
10414 callback_handle = args->handle;
10415 if (args->ino_cb) {
10416 ino_invalidate_cb = args->ino_cb;
10417 async_ino_invalidator.start();
10418 }
10419 if (args->dentry_cb) {
10420 dentry_invalidate_cb = args->dentry_cb;
10421 async_dentry_invalidator.start();
10422 }
10423 if (args->switch_intr_cb) {
10424 switch_interrupt_cb = args->switch_intr_cb;
10425 interrupt_finisher.start();
10426 }
10427 if (args->remount_cb) {
10428 remount_cb = args->remount_cb;
10429 remount_finisher.start();
10430 }
7c673cae
FG
10431 umask_cb = args->umask_cb;
10432}
10433
10434int Client::test_dentry_handling(bool can_invalidate)
10435{
10436 int r = 0;
10437
10438 can_invalidate_dentries = can_invalidate;
10439
10440 if (can_invalidate_dentries) {
11fdf7f2 10441 ceph_assert(dentry_invalidate_cb);
7c673cae 10442 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10443 r = 0;
11fdf7f2
TL
10444 } else {
10445 ceph_assert(remount_cb);
7c673cae 10446 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10447 r = _do_remount(false);
b32b8144 10448 }
11fdf7f2 10449
7c673cae
FG
10450 return r;
10451}
10452
10453int Client::_sync_fs()
10454{
11fdf7f2 10455 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10456
10457 // flush file data
11fdf7f2
TL
10458 std::unique_ptr<C_SaferCond> cond = nullptr;
10459 if (cct->_conf->client_oc) {
10460 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10461 objectcacher->flush_all(cond.get());
10462 }
7c673cae
FG
10463
10464 // flush caps
10465 flush_caps_sync();
10466 ceph_tid_t flush_tid = last_flush_tid;
10467
10468 // wait for unsafe mds requests
10469 wait_unsafe_requests();
10470
10471 wait_sync_caps(flush_tid);
10472
11fdf7f2 10473 if (nullptr != cond) {
9f95a23c 10474 client_lock.unlock();
11fdf7f2
TL
10475 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10476 cond->wait();
10477 ldout(cct, 15) << __func__ << " flush finished" << dendl;
9f95a23c 10478 client_lock.lock();
7c673cae
FG
10479 }
10480
10481 return 0;
10482}
10483
10484int Client::sync_fs()
10485{
11fdf7f2 10486 std::lock_guard l(client_lock);
181888fb
FG
10487
10488 if (unmounting)
10489 return -ENOTCONN;
10490
7c673cae
FG
10491 return _sync_fs();
10492}
10493
10494int64_t Client::drop_caches()
10495{
11fdf7f2 10496 std::lock_guard l(client_lock);
7c673cae
FG
10497 return objectcacher->release_all();
10498}
10499
11fdf7f2
TL
10500int Client::_lazyio(Fh *fh, int enable)
10501{
10502 Inode *in = fh->inode.get();
10503 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10504
10505 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10506 return 0;
10507
10508 int orig_mode = fh->mode;
10509 if (enable) {
10510 fh->mode |= CEPH_FILE_MODE_LAZY;
10511 in->get_open_ref(fh->mode);
10512 in->put_open_ref(orig_mode);
10513 check_caps(in, CHECK_CAPS_NODELAY);
10514 } else {
10515 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10516 in->get_open_ref(fh->mode);
10517 in->put_open_ref(orig_mode);
10518 check_caps(in, 0);
10519 }
10520
10521 return 0;
10522}
10523
10524int Client::lazyio(int fd, int enable)
10525{
10526 std::lock_guard l(client_lock);
10527 Fh *f = get_filehandle(fd);
10528 if (!f)
10529 return -EBADF;
10530
10531 return _lazyio(f, enable);
10532}
10533
10534int Client::ll_lazyio(Fh *fh, int enable)
10535{
10536 std::lock_guard lock(client_lock);
10537 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10538 tout(cct) << __func__ << std::endl;
10539
10540 return _lazyio(fh, enable);
10541}
7c673cae 10542
92f5a8d4 10543int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 10544{
11fdf7f2 10545 std::lock_guard l(client_lock);
92f5a8d4 10546 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
10547 << ", " << offset << ", " << count << ")" << dendl;
10548
10549 Fh *f = get_filehandle(fd);
10550 if (!f)
10551 return -EBADF;
10552
10553 // for now
10554 _fsync(f, true);
10555
10556 return 0;
10557}
10558
10559int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10560{
11fdf7f2 10561 std::lock_guard l(client_lock);
7c673cae
FG
10562 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10563 << ", " << offset << ", " << count << ")" << dendl;
10564
10565 Fh *f = get_filehandle(fd);
10566 if (!f)
10567 return -EBADF;
10568 Inode *in = f->inode.get();
10569
10570 _fsync(f, true);
92f5a8d4
TL
10571 if (_release(in)) {
10572 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10573 if (r < 0)
10574 return r;
10575 }
7c673cae
FG
10576 return 0;
10577}
10578
10579
10580// =============================
10581// snaps
10582
10583int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10584{
11fdf7f2 10585 std::lock_guard l(client_lock);
181888fb
FG
10586
10587 if (unmounting)
10588 return -ENOTCONN;
10589
7c673cae
FG
10590 filepath path(relpath);
10591 InodeRef in;
10592 int r = path_walk(path, &in, perm);
10593 if (r < 0)
10594 return r;
10595 if (cct->_conf->client_permissions) {
10596 r = may_create(in.get(), perm);
10597 if (r < 0)
10598 return r;
10599 }
10600 Inode *snapdir = open_snapdir(in.get());
10601 return _mkdir(snapdir, name, 0, perm);
10602}
181888fb 10603
7c673cae
FG
10604int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10605{
11fdf7f2 10606 std::lock_guard l(client_lock);
181888fb
FG
10607
10608 if (unmounting)
10609 return -ENOTCONN;
10610
7c673cae
FG
10611 filepath path(relpath);
10612 InodeRef in;
10613 int r = path_walk(path, &in, perms);
10614 if (r < 0)
10615 return r;
10616 if (cct->_conf->client_permissions) {
10617 r = may_delete(in.get(), NULL, perms);
10618 if (r < 0)
10619 return r;
10620 }
10621 Inode *snapdir = open_snapdir(in.get());
10622 return _rmdir(snapdir, name, perms);
10623}
10624
10625// =============================
10626// expose caps
10627
10628int Client::get_caps_issued(int fd) {
10629
11fdf7f2 10630 std::lock_guard lock(client_lock);
7c673cae 10631
181888fb
FG
10632 if (unmounting)
10633 return -ENOTCONN;
10634
7c673cae
FG
10635 Fh *f = get_filehandle(fd);
10636 if (!f)
10637 return -EBADF;
10638
10639 return f->inode->caps_issued();
10640}
10641
10642int Client::get_caps_issued(const char *path, const UserPerm& perms)
10643{
11fdf7f2 10644 std::lock_guard lock(client_lock);
181888fb
FG
10645
10646 if (unmounting)
10647 return -ENOTCONN;
10648
7c673cae
FG
10649 filepath p(path);
10650 InodeRef in;
10651 int r = path_walk(p, &in, perms, true);
10652 if (r < 0)
10653 return r;
10654 return in->caps_issued();
10655}
10656
10657// =========================================
10658// low level
10659
10660Inode *Client::open_snapdir(Inode *diri)
10661{
10662 Inode *in;
10663 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10664 if (!inode_map.count(vino)) {
10665 in = new Inode(this, vino, &diri->layout);
10666
10667 in->ino = diri->ino;
10668 in->snapid = CEPH_SNAPDIR;
10669 in->mode = diri->mode;
10670 in->uid = diri->uid;
10671 in->gid = diri->gid;
494da23a 10672 in->nlink = 1;
7c673cae
FG
10673 in->mtime = diri->mtime;
10674 in->ctime = diri->ctime;
10675 in->btime = diri->btime;
10676 in->size = diri->size;
10677 in->change_attr = diri->change_attr;
10678
10679 in->dirfragtree.clear();
10680 in->snapdir_parent = diri;
10681 diri->flags |= I_SNAPDIR_OPEN;
10682 inode_map[vino] = in;
10683 if (use_faked_inos())
10684 _assign_faked_ino(in);
10685 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10686 } else {
10687 in = inode_map[vino];
10688 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10689 }
10690 return in;
10691}
10692
10693int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10694 Inode **out, const UserPerm& perms)
10695{
11fdf7f2 10696 std::lock_guard lock(client_lock);
31f18b77 10697 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
10698 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10699 tout(cct) << __func__ << std::endl;
7c673cae
FG
10700 tout(cct) << name << std::endl;
10701
181888fb
FG
10702 if (unmounting)
10703 return -ENOTCONN;
10704
7c673cae 10705 int r = 0;
11fdf7f2
TL
10706 if (!fuse_default_permissions) {
10707 if (strcmp(name, ".") && strcmp(name, "..")) {
10708 r = may_lookup(parent, perms);
10709 if (r < 0)
10710 return r;
10711 }
7c673cae
FG
10712 }
10713
10714 string dname(name);
10715 InodeRef in;
10716
10717 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10718 if (r < 0) {
10719 attr->st_ino = 0;
10720 goto out;
10721 }
10722
11fdf7f2 10723 ceph_assert(in);
7c673cae
FG
10724 fill_stat(in, attr);
10725 _ll_get(in.get());
10726
10727 out:
11fdf7f2 10728 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10729 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10730 tout(cct) << attr->st_ino << std::endl;
10731 *out = in.get();
10732 return r;
10733}
10734
1adf2230
AA
10735int Client::ll_lookup_inode(
10736 struct inodeno_t ino,
10737 const UserPerm& perms,
10738 Inode **inode)
10739{
81eedcae 10740 ceph_assert(inode != NULL);
11fdf7f2 10741 std::lock_guard lock(client_lock);
1adf2230
AA
10742 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10743
81eedcae
TL
10744 if (unmounting)
10745 return -ENOTCONN;
10746
1adf2230
AA
10747 // Num1: get inode and *inode
10748 int r = _lookup_ino(ino, perms, inode);
81eedcae 10749 if (r)
1adf2230 10750 return r;
81eedcae 10751
11fdf7f2 10752 ceph_assert(*inode != NULL);
1adf2230 10753
81eedcae
TL
10754 if (!(*inode)->dentries.empty()) {
10755 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10756 return 0;
10757 }
10758
10759 if ((*inode)->is_root()) {
10760 ldout(cct, 8) << "ino is root, no parent" << dendl;
10761 return 0;
10762 }
10763
1adf2230
AA
10764 // Num2: Request the parent inode, so that we can look up the name
10765 Inode *parent;
10766 r = _lookup_parent(*inode, perms, &parent);
81eedcae 10767 if (r) {
1adf2230
AA
10768 _ll_forget(*inode, 1);
10769 return r;
1adf2230 10770 }
81eedcae 10771
11fdf7f2 10772 ceph_assert(parent != NULL);
1adf2230
AA
10773
10774 // Num3: Finally, get the name (dentry) of the requested inode
10775 r = _lookup_name(*inode, parent, perms);
10776 if (r) {
10777 // Unexpected error
10778 _ll_forget(parent, 1);
10779 _ll_forget(*inode, 1);
10780 return r;
10781 }
10782
10783 _ll_forget(parent, 1);
10784 return 0;
10785}
10786
7c673cae
FG
10787int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10788 struct ceph_statx *stx, unsigned want, unsigned flags,
10789 const UserPerm& perms)
10790{
11fdf7f2 10791 std::lock_guard lock(client_lock);
31f18b77 10792 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 10793 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
10794 tout(cct) << "ll_lookupx" << std::endl;
10795 tout(cct) << name << std::endl;
10796
181888fb
FG
10797 if (unmounting)
10798 return -ENOTCONN;
10799
7c673cae 10800 int r = 0;
11fdf7f2 10801 if (!fuse_default_permissions) {
7c673cae
FG
10802 r = may_lookup(parent, perms);
10803 if (r < 0)
10804 return r;
10805 }
10806
10807 string dname(name);
10808 InodeRef in;
10809
10810 unsigned mask = statx_to_mask(flags, want);
10811 r = _lookup(parent, dname, mask, &in, perms);
10812 if (r < 0) {
10813 stx->stx_ino = 0;
10814 stx->stx_mask = 0;
10815 } else {
11fdf7f2 10816 ceph_assert(in);
7c673cae
FG
10817 fill_statx(in, mask, stx);
10818 _ll_get(in.get());
10819 }
10820
11fdf7f2 10821 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10822 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10823 tout(cct) << stx->stx_ino << std::endl;
10824 *out = in.get();
10825 return r;
10826}
10827
10828int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10829 unsigned int want, unsigned int flags, const UserPerm& perms)
10830{
11fdf7f2 10831 std::lock_guard lock(client_lock);
181888fb
FG
10832
10833 if (unmounting)
10834 return -ENOTCONN;
10835
7c673cae
FG
10836 filepath fp(name, 0);
10837 InodeRef in;
10838 int rc;
10839 unsigned mask = statx_to_mask(flags, want);
10840
11fdf7f2
TL
10841 ldout(cct, 3) << __func__ << " " << name << dendl;
10842 tout(cct) << __func__ << std::endl;
7c673cae
FG
10843 tout(cct) << name << std::endl;
10844
10845 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10846 if (rc < 0) {
10847 /* zero out mask, just in case... */
10848 stx->stx_mask = 0;
10849 stx->stx_ino = 0;
10850 *out = NULL;
10851 return rc;
10852 } else {
11fdf7f2 10853 ceph_assert(in);
7c673cae
FG
10854 fill_statx(in, mask, stx);
10855 _ll_get(in.get());
10856 *out = in.get();
10857 return 0;
10858 }
10859}
10860
10861void Client::_ll_get(Inode *in)
10862{
10863 if (in->ll_ref == 0) {
10864 in->get();
11fdf7f2
TL
10865 if (in->is_dir() && !in->dentries.empty()) {
10866 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10867 in->get_first_parent()->get(); // pin dentry
10868 }
11fdf7f2
TL
10869 if (in->snapid != CEPH_NOSNAP)
10870 ll_snap_ref[in->snapid]++;
7c673cae
FG
10871 }
10872 in->ll_get();
11fdf7f2 10873 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
10874}
10875
494da23a 10876int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
10877{
10878 in->ll_put(num);
11fdf7f2 10879 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 10880 if (in->ll_ref == 0) {
11fdf7f2
TL
10881 if (in->is_dir() && !in->dentries.empty()) {
10882 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10883 in->get_first_parent()->put(); // unpin dentry
10884 }
11fdf7f2
TL
10885 if (in->snapid != CEPH_NOSNAP) {
10886 auto p = ll_snap_ref.find(in->snapid);
10887 ceph_assert(p != ll_snap_ref.end());
10888 ceph_assert(p->second > 0);
10889 if (--p->second == 0)
10890 ll_snap_ref.erase(p);
10891 }
7c673cae
FG
10892 put_inode(in);
10893 return 0;
10894 } else {
10895 return in->ll_ref;
10896 }
10897}
10898
10899void Client::_ll_drop_pins()
10900{
11fdf7f2 10901 ldout(cct, 10) << __func__ << dendl;
1adf2230 10902 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
10903 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10904 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10905 it != inode_map.end();
10906 it = next) {
10907 Inode *in = it->second;
10908 next = it;
10909 ++next;
1adf2230
AA
10910 if (in->ll_ref){
10911 to_be_put.insert(in);
7c673cae 10912 _ll_put(in, in->ll_ref);
1adf2230 10913 }
7c673cae
FG
10914 }
10915}
10916
494da23a 10917bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 10918{
11fdf7f2 10919 inodeno_t ino = in->ino;
7c673cae 10920
11fdf7f2
TL
10921 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10922 tout(cct) << __func__ << std::endl;
7c673cae
FG
10923 tout(cct) << ino.val << std::endl;
10924 tout(cct) << count << std::endl;
10925
181888fb
FG
10926 // Ignore forget if we're no longer mounted
10927 if (unmounting)
10928 return true;
10929
7c673cae
FG
10930 if (ino == 1) return true; // ignore forget on root.
10931
10932 bool last = false;
10933 if (in->ll_ref < count) {
10934 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10935 << ", which only has ll_ref=" << in->ll_ref << dendl;
10936 _ll_put(in, in->ll_ref);
10937 last = true;
10938 } else {
10939 if (_ll_put(in, count) == 0)
10940 last = true;
10941 }
10942
10943 return last;
10944}
10945
494da23a 10946bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 10947{
11fdf7f2 10948 std::lock_guard lock(client_lock);
1adf2230
AA
10949 return _ll_forget(in, count);
10950}
10951
7c673cae
FG
10952bool Client::ll_put(Inode *in)
10953{
10954 /* ll_forget already takes the lock */
10955 return ll_forget(in, 1);
10956}
10957
11fdf7f2
TL
10958int Client::ll_get_snap_ref(snapid_t snap)
10959{
10960 std::lock_guard lock(client_lock);
10961 auto p = ll_snap_ref.find(snap);
10962 if (p != ll_snap_ref.end())
10963 return p->second;
10964 return 0;
10965}
10966
7c673cae
FG
10967snapid_t Client::ll_get_snapid(Inode *in)
10968{
11fdf7f2 10969 std::lock_guard lock(client_lock);
7c673cae
FG
10970 return in->snapid;
10971}
10972
10973Inode *Client::ll_get_inode(ino_t ino)
10974{
11fdf7f2 10975 std::lock_guard lock(client_lock);
181888fb
FG
10976
10977 if (unmounting)
10978 return NULL;
10979
7c673cae
FG
10980 vinodeno_t vino = _map_faked_ino(ino);
10981 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10982 if (p == inode_map.end())
10983 return NULL;
10984 Inode *in = p->second;
10985 _ll_get(in);
10986 return in;
10987}
10988
10989Inode *Client::ll_get_inode(vinodeno_t vino)
10990{
11fdf7f2 10991 std::lock_guard lock(client_lock);
181888fb
FG
10992
10993 if (unmounting)
10994 return NULL;
10995
7c673cae
FG
10996 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10997 if (p == inode_map.end())
10998 return NULL;
10999 Inode *in = p->second;
11000 _ll_get(in);
11001 return in;
11002}
11003
11004int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11005{
11006 vinodeno_t vino = _get_vino(in);
11007
11fdf7f2
TL
11008 ldout(cct, 8) << __func__ << " " << vino << dendl;
11009 tout(cct) << __func__ << std::endl;
7c673cae
FG
11010 tout(cct) << vino.ino.val << std::endl;
11011
11012 if (vino.snapid < CEPH_NOSNAP)
11013 return 0;
11014 else
11015 return _getattr(in, caps, perms);
11016}
11017
11018int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11019{
11fdf7f2 11020 std::lock_guard lock(client_lock);
7c673cae 11021
181888fb
FG
11022 if (unmounting)
11023 return -ENOTCONN;
11024
7c673cae
FG
11025 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11026
11027 if (res == 0)
11028 fill_stat(in, attr);
11fdf7f2 11029 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11030 return res;
11031}
11032
11033int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11034 unsigned int flags, const UserPerm& perms)
11035{
11fdf7f2 11036 std::lock_guard lock(client_lock);
7c673cae 11037
181888fb
FG
11038 if (unmounting)
11039 return -ENOTCONN;
11040
7c673cae
FG
11041 int res = 0;
11042 unsigned mask = statx_to_mask(flags, want);
11043
94b18763 11044 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
11045 res = _ll_getattr(in, mask, perms);
11046
11047 if (res == 0)
11048 fill_statx(in, mask, stx);
11fdf7f2 11049 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11050 return res;
11051}
11052
11053int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11054 const UserPerm& perms, InodeRef *inp)
11055{
11056 vinodeno_t vino = _get_vino(in);
11057
11fdf7f2 11058 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 11059 << dendl;
11fdf7f2 11060 tout(cct) << __func__ << std::endl;
7c673cae
FG
11061 tout(cct) << vino.ino.val << std::endl;
11062 tout(cct) << stx->stx_mode << std::endl;
11063 tout(cct) << stx->stx_uid << std::endl;
11064 tout(cct) << stx->stx_gid << std::endl;
11065 tout(cct) << stx->stx_size << std::endl;
11066 tout(cct) << stx->stx_mtime << std::endl;
11067 tout(cct) << stx->stx_atime << std::endl;
11068 tout(cct) << stx->stx_btime << std::endl;
11069 tout(cct) << mask << std::endl;
11070
11fdf7f2 11071 if (!fuse_default_permissions) {
7c673cae
FG
11072 int res = may_setattr(in, stx, mask, perms);
11073 if (res < 0)
11074 return res;
11075 }
11076
11077 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11078
11079 return __setattrx(in, stx, mask, perms, inp);
11080}
11081
11082int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11083 const UserPerm& perms)
11084{
11fdf7f2 11085 std::lock_guard lock(client_lock);
181888fb
FG
11086
11087 if (unmounting)
11088 return -ENOTCONN;
11089
7c673cae
FG
11090 InodeRef target(in);
11091 int res = _ll_setattrx(in, stx, mask, perms, &target);
11092 if (res == 0) {
11fdf7f2 11093 ceph_assert(in == target.get());
7c673cae
FG
11094 fill_statx(in, in->caps_issued(), stx);
11095 }
11096
11fdf7f2 11097 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11098 return res;
11099}
11100
11101int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11102 const UserPerm& perms)
11103{
11104 struct ceph_statx stx;
11105 stat_to_statx(attr, &stx);
11106
11fdf7f2 11107 std::lock_guard lock(client_lock);
181888fb
FG
11108
11109 if (unmounting)
11110 return -ENOTCONN;
11111
7c673cae
FG
11112 InodeRef target(in);
11113 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11114 if (res == 0) {
11fdf7f2 11115 ceph_assert(in == target.get());
7c673cae
FG
11116 fill_stat(in, attr);
11117 }
11118
11fdf7f2 11119 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11120 return res;
11121}
11122
11123
11124// ----------
11125// xattrs
11126
11127int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11128 const UserPerm& perms)
11129{
11fdf7f2 11130 std::lock_guard lock(client_lock);
181888fb
FG
11131
11132 if (unmounting)
11133 return -ENOTCONN;
11134
7c673cae
FG
11135 InodeRef in;
11136 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11137 if (r < 0)
11138 return r;
11139 return _getxattr(in, name, value, size, perms);
11140}
11141
11142int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11143 const UserPerm& perms)
11144{
11fdf7f2 11145 std::lock_guard lock(client_lock);
181888fb
FG
11146
11147 if (unmounting)
11148 return -ENOTCONN;
11149
7c673cae
FG
11150 InodeRef in;
11151 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11152 if (r < 0)
11153 return r;
11154 return _getxattr(in, name, value, size, perms);
11155}
11156
11157int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11158 const UserPerm& perms)
11159{
11fdf7f2 11160 std::lock_guard lock(client_lock);
181888fb
FG
11161
11162 if (unmounting)
11163 return -ENOTCONN;
11164
7c673cae
FG
11165 Fh *f = get_filehandle(fd);
11166 if (!f)
11167 return -EBADF;
11168 return _getxattr(f->inode, name, value, size, perms);
11169}
11170
11171int Client::listxattr(const char *path, char *list, size_t size,
11172 const UserPerm& perms)
11173{
11fdf7f2 11174 std::lock_guard lock(client_lock);
181888fb
FG
11175
11176 if (unmounting)
11177 return -ENOTCONN;
11178
7c673cae
FG
11179 InodeRef in;
11180 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11181 if (r < 0)
11182 return r;
11183 return Client::_listxattr(in.get(), list, size, perms);
11184}
11185
11186int Client::llistxattr(const char *path, char *list, size_t size,
11187 const UserPerm& perms)
11188{
11fdf7f2 11189 std::lock_guard lock(client_lock);
181888fb
FG
11190
11191 if (unmounting)
11192 return -ENOTCONN;
11193
7c673cae
FG
11194 InodeRef in;
11195 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11196 if (r < 0)
11197 return r;
11198 return Client::_listxattr(in.get(), list, size, perms);
11199}
11200
11201int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11202{
11fdf7f2 11203 std::lock_guard lock(client_lock);
181888fb
FG
11204
11205 if (unmounting)
11206 return -ENOTCONN;
11207
7c673cae
FG
11208 Fh *f = get_filehandle(fd);
11209 if (!f)
11210 return -EBADF;
11211 return Client::_listxattr(f->inode.get(), list, size, perms);
11212}
11213
11214int Client::removexattr(const char *path, const char *name,
11215 const UserPerm& perms)
11216{
11fdf7f2 11217 std::lock_guard lock(client_lock);
181888fb
FG
11218
11219 if (unmounting)
11220 return -ENOTCONN;
11221
7c673cae
FG
11222 InodeRef in;
11223 int r = Client::path_walk(path, &in, perms, true);
11224 if (r < 0)
11225 return r;
11226 return _removexattr(in, name, perms);
11227}
11228
11229int Client::lremovexattr(const char *path, const char *name,
11230 const UserPerm& perms)
11231{
11fdf7f2 11232 std::lock_guard lock(client_lock);
181888fb
FG
11233
11234 if (unmounting)
11235 return -ENOTCONN;
11236
7c673cae
FG
11237 InodeRef in;
11238 int r = Client::path_walk(path, &in, perms, false);
11239 if (r < 0)
11240 return r;
11241 return _removexattr(in, name, perms);
11242}
11243
11244int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11245{
11fdf7f2 11246 std::lock_guard lock(client_lock);
181888fb
FG
11247
11248 if (unmounting)
11249 return -ENOTCONN;
11250
7c673cae
FG
11251 Fh *f = get_filehandle(fd);
11252 if (!f)
11253 return -EBADF;
11254 return _removexattr(f->inode, name, perms);
11255}
11256
11257int Client::setxattr(const char *path, const char *name, const void *value,
11258 size_t size, int flags, const UserPerm& perms)
11259{
11260 _setxattr_maybe_wait_for_osdmap(name, value, size);
11261
11fdf7f2 11262 std::lock_guard lock(client_lock);
181888fb
FG
11263
11264 if (unmounting)
11265 return -ENOTCONN;
11266
7c673cae
FG
11267 InodeRef in;
11268 int r = Client::path_walk(path, &in, perms, true);
11269 if (r < 0)
11270 return r;
11271 return _setxattr(in, name, value, size, flags, perms);
11272}
11273
11274int Client::lsetxattr(const char *path, const char *name, const void *value,
11275 size_t size, int flags, const UserPerm& perms)
11276{
11277 _setxattr_maybe_wait_for_osdmap(name, value, size);
11278
11fdf7f2 11279 std::lock_guard lock(client_lock);
181888fb
FG
11280
11281 if (unmounting)
11282 return -ENOTCONN;
11283
7c673cae
FG
11284 InodeRef in;
11285 int r = Client::path_walk(path, &in, perms, false);
11286 if (r < 0)
11287 return r;
11288 return _setxattr(in, name, value, size, flags, perms);
11289}
11290
11291int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11292 int flags, const UserPerm& perms)
11293{
11294 _setxattr_maybe_wait_for_osdmap(name, value, size);
11295
11fdf7f2 11296 std::lock_guard lock(client_lock);
181888fb
FG
11297
11298 if (unmounting)
11299 return -ENOTCONN;
11300
7c673cae
FG
11301 Fh *f = get_filehandle(fd);
11302 if (!f)
11303 return -EBADF;
11304 return _setxattr(f->inode, name, value, size, flags, perms);
11305}
11306
11307int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11308 const UserPerm& perms)
11309{
11310 int r;
11311
11312 const VXattr *vxattr = _match_vxattr(in, name);
11313 if (vxattr) {
11314 r = -ENODATA;
11315
11316 // Do a force getattr to get the latest quota before returning
11317 // a value to userspace.
28e407b8
AA
11318 int flags = 0;
11319 if (vxattr->flags & VXATTR_RSTAT) {
11320 flags |= CEPH_STAT_RSTAT;
11321 }
11322 r = _getattr(in, flags, perms, true);
7c673cae
FG
11323 if (r != 0) {
11324 // Error from getattr!
11325 return r;
11326 }
11327
11328 // call pointer-to-member function
11329 char buf[256];
11330 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11331 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11332 } else {
11333 r = -ENODATA;
11334 }
11335
11336 if (size != 0) {
11337 if (r > (int)size) {
11338 r = -ERANGE;
11339 } else if (r > 0) {
11340 memcpy(value, buf, r);
11341 }
11342 }
11343 goto out;
11344 }
11345
11346 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11347 r = -EOPNOTSUPP;
11348 goto out;
11349 }
11350
11351 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11352 if (r == 0) {
11353 string n(name);
11354 r = -ENODATA;
11355 if (in->xattrs.count(n)) {
11356 r = in->xattrs[n].length();
11357 if (r > 0 && size != 0) {
11358 if (size >= (unsigned)r)
11359 memcpy(value, in->xattrs[n].c_str(), r);
11360 else
11361 r = -ERANGE;
11362 }
11363 }
11364 }
11365 out:
1adf2230 11366 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11367 return r;
11368}
11369
11370int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11371 const UserPerm& perms)
11372{
11373 if (cct->_conf->client_permissions) {
11374 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11375 if (r < 0)
11376 return r;
11377 }
11378 return _getxattr(in.get(), name, value, size, perms);
11379}
11380
11381int Client::ll_getxattr(Inode *in, const char *name, void *value,
11382 size_t size, const UserPerm& perms)
11383{
11fdf7f2 11384 std::lock_guard lock(client_lock);
7c673cae 11385
181888fb
FG
11386 if (unmounting)
11387 return -ENOTCONN;
11388
7c673cae
FG
11389 vinodeno_t vino = _get_vino(in);
11390
11fdf7f2
TL
11391 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11392 tout(cct) << __func__ << std::endl;
7c673cae
FG
11393 tout(cct) << vino.ino.val << std::endl;
11394 tout(cct) << name << std::endl;
11395
11fdf7f2 11396 if (!fuse_default_permissions) {
7c673cae
FG
11397 int r = xattr_permission(in, name, MAY_READ, perms);
11398 if (r < 0)
11399 return r;
11400 }
11401
11402 return _getxattr(in, name, value, size, perms);
11403}
11404
11405int Client::_listxattr(Inode *in, char *name, size_t size,
11406 const UserPerm& perms)
11407{
81eedcae 11408 bool len_only = (size == 0);
7c673cae 11409 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
81eedcae
TL
11410 if (r != 0) {
11411 goto out;
11412 }
7c673cae 11413
81eedcae
TL
11414 r = 0;
11415 for (const auto& p : in->xattrs) {
11416 size_t this_len = p.first.length() + 1;
11417 r += this_len;
11418 if (len_only)
11419 continue;
7c673cae 11420
81eedcae
TL
11421 if (this_len > size) {
11422 r = -ERANGE;
11423 goto out;
11424 }
11425
11426 memcpy(name, p.first.c_str(), this_len);
11427 name += this_len;
11428 size -= this_len;
11429 }
81eedcae 11430out:
11fdf7f2 11431 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
11432 return r;
11433}
11434
11435int Client::ll_listxattr(Inode *in, char *names, size_t size,
11436 const UserPerm& perms)
11437{
11fdf7f2 11438 std::lock_guard lock(client_lock);
7c673cae 11439
181888fb
FG
11440 if (unmounting)
11441 return -ENOTCONN;
11442
7c673cae
FG
11443 vinodeno_t vino = _get_vino(in);
11444
11fdf7f2
TL
11445 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11446 tout(cct) << __func__ << std::endl;
7c673cae
FG
11447 tout(cct) << vino.ino.val << std::endl;
11448 tout(cct) << size << std::endl;
11449
11450 return _listxattr(in, names, size, perms);
11451}
11452
11453int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11454 size_t size, int flags, const UserPerm& perms)
11455{
11456
11457 int xattr_flags = 0;
11458 if (!value)
11459 xattr_flags |= CEPH_XATTR_REMOVE;
11460 if (flags & XATTR_CREATE)
11461 xattr_flags |= CEPH_XATTR_CREATE;
11462 if (flags & XATTR_REPLACE)
11463 xattr_flags |= CEPH_XATTR_REPLACE;
11464
11465 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11466 filepath path;
11467 in->make_nosnap_relative_path(path);
11468 req->set_filepath(path);
11469 req->set_string2(name);
11470 req->set_inode(in);
11471 req->head.args.setxattr.flags = xattr_flags;
11472
11473 bufferlist bl;
11fdf7f2 11474 assert (value || size == 0);
7c673cae
FG
11475 bl.append((const char*)value, size);
11476 req->set_data(bl);
11477
11478 int res = make_request(req, perms);
11479
11480 trim_cache();
11fdf7f2 11481 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11482 res << dendl;
11483 return res;
11484}
11485
11486int Client::_setxattr(Inode *in, const char *name, const void *value,
11487 size_t size, int flags, const UserPerm& perms)
11488{
11489 if (in->snapid != CEPH_NOSNAP) {
11490 return -EROFS;
11491 }
11492
11493 bool posix_acl_xattr = false;
11494 if (acl_type == POSIX_ACL)
11495 posix_acl_xattr = !strncmp(name, "system.", 7);
11496
11497 if (strncmp(name, "user.", 5) &&
11498 strncmp(name, "security.", 9) &&
11499 strncmp(name, "trusted.", 8) &&
11500 strncmp(name, "ceph.", 5) &&
11501 !posix_acl_xattr)
11502 return -EOPNOTSUPP;
11503
11fdf7f2
TL
11504 bool check_realm = false;
11505
7c673cae
FG
11506 if (posix_acl_xattr) {
11507 if (!strcmp(name, ACL_EA_ACCESS)) {
11508 mode_t new_mode = in->mode;
11509 if (value) {
11510 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11511 if (ret < 0)
11512 return ret;
11513 if (ret == 0) {
11514 value = NULL;
11515 size = 0;
11516 }
11517 if (new_mode != in->mode) {
11518 struct ceph_statx stx;
11519 stx.stx_mode = new_mode;
11520 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11521 if (ret < 0)
11522 return ret;
11523 }
11524 }
11525 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11526 if (value) {
11527 if (!S_ISDIR(in->mode))
11528 return -EACCES;
11529 int ret = posix_acl_check(value, size);
11530 if (ret < 0)
11531 return -EINVAL;
11532 if (ret == 0) {
11533 value = NULL;
11534 size = 0;
11535 }
11536 }
11537 } else {
11538 return -EOPNOTSUPP;
11539 }
11540 } else {
11541 const VXattr *vxattr = _match_vxattr(in, name);
11fdf7f2
TL
11542 if (vxattr) {
11543 if (vxattr->readonly)
11544 return -EOPNOTSUPP;
11545 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11546 check_realm = true;
11547 }
7c673cae
FG
11548 }
11549
11fdf7f2
TL
11550 int ret = _do_setxattr(in, name, value, size, flags, perms);
11551 if (ret >= 0 && check_realm) {
11552 // check if snaprealm was created for quota inode
11553 if (in->quota.is_enable() &&
11554 !(in->snaprealm && in->snaprealm->ino == in->ino))
11555 ret = -EOPNOTSUPP;
11556 }
11557
11558 return ret;
7c673cae
FG
11559}
11560
11561int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11562 size_t size, int flags, const UserPerm& perms)
11563{
11564 if (cct->_conf->client_permissions) {
11565 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11566 if (r < 0)
11567 return r;
11568 }
11569 return _setxattr(in.get(), name, value, size, flags, perms);
11570}
11571
11572int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11573{
11574 string tmp;
11575 if (name == "layout") {
11576 string::iterator begin = value.begin();
11577 string::iterator end = value.end();
11578 keys_and_values<string::iterator> p; // create instance of parser
11579 std::map<string, string> m; // map to receive results
11580 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11581 return -EINVAL;
11582 }
11583 if (begin != end)
11584 return -EINVAL;
11585 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11586 if (q->first == "pool") {
11587 tmp = q->second;
11588 break;
11589 }
11590 }
11591 } else if (name == "layout.pool") {
11592 tmp = value;
11593 }
11594
11595 if (tmp.length()) {
11596 int64_t pool;
11597 try {
11598 pool = boost::lexical_cast<unsigned>(tmp);
11599 if (!osdmap->have_pg_pool(pool))
11600 return -ENOENT;
11601 } catch (boost::bad_lexical_cast const&) {
11602 pool = osdmap->lookup_pg_pool_name(tmp);
11603 if (pool < 0) {
11604 return -ENOENT;
11605 }
11606 }
11607 }
11608
11609 return 0;
11610}
11611
11612void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11613{
11614 // For setting pool of layout, MetaRequest need osdmap epoch.
11615 // There is a race which create a new data pool but client and mds both don't have.
11616 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11617 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11618 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11619 string rest(strstr(name, "layout"));
11620 string v((const char*)value, size);
11621 int r = objecter->with_osdmap([&](const OSDMap& o) {
11622 return _setxattr_check_data_pool(rest, v, &o);
11623 });
11624
11625 if (r == -ENOENT) {
11626 C_SaferCond ctx;
11627 objecter->wait_for_latest_osdmap(&ctx);
11628 ctx.wait();
11629 }
11630 }
11631}
11632
11633int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11634 size_t size, int flags, const UserPerm& perms)
11635{
11636 _setxattr_maybe_wait_for_osdmap(name, value, size);
11637
11fdf7f2 11638 std::lock_guard lock(client_lock);
7c673cae 11639
181888fb
FG
11640 if (unmounting)
11641 return -ENOTCONN;
11642
7c673cae
FG
11643 vinodeno_t vino = _get_vino(in);
11644
11fdf7f2
TL
11645 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11646 tout(cct) << __func__ << std::endl;
7c673cae
FG
11647 tout(cct) << vino.ino.val << std::endl;
11648 tout(cct) << name << std::endl;
11649
11fdf7f2 11650 if (!fuse_default_permissions) {
7c673cae
FG
11651 int r = xattr_permission(in, name, MAY_WRITE, perms);
11652 if (r < 0)
11653 return r;
11654 }
11655 return _setxattr(in, name, value, size, flags, perms);
11656}
11657
11658int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11659{
11660 if (in->snapid != CEPH_NOSNAP) {
11661 return -EROFS;
11662 }
11663
11664 // same xattrs supported by kernel client
11665 if (strncmp(name, "user.", 5) &&
11666 strncmp(name, "system.", 7) &&
11667 strncmp(name, "security.", 9) &&
11668 strncmp(name, "trusted.", 8) &&
11669 strncmp(name, "ceph.", 5))
11670 return -EOPNOTSUPP;
11671
11672 const VXattr *vxattr = _match_vxattr(in, name);
11673 if (vxattr && vxattr->readonly)
11674 return -EOPNOTSUPP;
11675
11676 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11677 filepath path;
11678 in->make_nosnap_relative_path(path);
11679 req->set_filepath(path);
11680 req->set_filepath2(name);
11681 req->set_inode(in);
11682
11683 int res = make_request(req, perms);
11684
11685 trim_cache();
1adf2230 11686 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11687 return res;
11688}
11689
11690int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11691{
11692 if (cct->_conf->client_permissions) {
11693 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11694 if (r < 0)
11695 return r;
11696 }
11697 return _removexattr(in.get(), name, perms);
11698}
11699
11700int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11701{
11fdf7f2 11702 std::lock_guard lock(client_lock);
7c673cae 11703
181888fb
FG
11704 if (unmounting)
11705 return -ENOTCONN;
11706
7c673cae
FG
11707 vinodeno_t vino = _get_vino(in);
11708
11709 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11710 tout(cct) << "ll_removexattr" << std::endl;
11711 tout(cct) << vino.ino.val << std::endl;
11712 tout(cct) << name << std::endl;
11713
11fdf7f2 11714 if (!fuse_default_permissions) {
7c673cae
FG
11715 int r = xattr_permission(in, name, MAY_WRITE, perms);
11716 if (r < 0)
11717 return r;
11718 }
11719
11720 return _removexattr(in, name, perms);
11721}
11722
11723bool Client::_vxattrcb_quota_exists(Inode *in)
11724{
11fdf7f2
TL
11725 return in->quota.is_enable() &&
11726 in->snaprealm && in->snaprealm->ino == in->ino;
7c673cae
FG
11727}
11728size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11729{
11730 return snprintf(val, size,
11731 "max_bytes=%lld max_files=%lld",
11732 (long long int)in->quota.max_bytes,
11733 (long long int)in->quota.max_files);
11734}
11735size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11736{
11737 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11738}
11739size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11740{
11741 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11742}
11743
11744bool Client::_vxattrcb_layout_exists(Inode *in)
11745{
11746 return in->layout != file_layout_t();
11747}
11748size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11749{
11750 int r = snprintf(val, size,
11fdf7f2 11751 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11752 (unsigned long long)in->layout.stripe_unit,
11753 (unsigned long long)in->layout.stripe_count,
11754 (unsigned long long)in->layout.object_size);
11755 objecter->with_osdmap([&](const OSDMap& o) {
11756 if (o.have_pg_pool(in->layout.pool_id))
11757 r += snprintf(val + r, size - r, "%s",
11758 o.get_pool_name(in->layout.pool_id).c_str());
11759 else
11760 r += snprintf(val + r, size - r, "%" PRIu64,
11761 (uint64_t)in->layout.pool_id);
11762 });
11763 if (in->layout.pool_ns.length())
11764 r += snprintf(val + r, size - r, " pool_namespace=%s",
11765 in->layout.pool_ns.c_str());
11766 return r;
11767}
11768size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11769{
11fdf7f2 11770 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
7c673cae
FG
11771}
11772size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11773{
11fdf7f2 11774 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
7c673cae
FG
11775}
11776size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11777{
11fdf7f2 11778 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
7c673cae
FG
11779}
11780size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11781{
11782 size_t r;
11783 objecter->with_osdmap([&](const OSDMap& o) {
11784 if (o.have_pg_pool(in->layout.pool_id))
11785 r = snprintf(val, size, "%s", o.get_pool_name(
11786 in->layout.pool_id).c_str());
11787 else
11788 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11789 });
11790 return r;
11791}
11792size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11793{
11794 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11795}
11796size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11797{
11fdf7f2 11798 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
7c673cae
FG
11799}
11800size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11801{
11fdf7f2 11802 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
7c673cae
FG
11803}
11804size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11805{
11fdf7f2 11806 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
7c673cae
FG
11807}
11808size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11809{
11fdf7f2 11810 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
7c673cae
FG
11811}
11812size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11813{
11fdf7f2 11814 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
7c673cae
FG
11815}
11816size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11817{
11fdf7f2 11818 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
7c673cae
FG
11819}
11820size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11821{
11fdf7f2 11822 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
7c673cae
FG
11823}
11824size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11825{
81eedcae 11826 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
7c673cae
FG
11827 (long)in->rstat.rctime.nsec());
11828}
11fdf7f2
TL
11829bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11830{
11831 return in->dir_pin != -ENODATA;
11832}
11833size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11834{
11835 return snprintf(val, size, "%ld", (long)in->dir_pin);
11836}
7c673cae 11837
81eedcae
TL
11838bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11839{
11840 return !in->snap_btime.is_zero();
11841}
11842
11843size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11844{
11845 return snprintf(val, size, "%llu.%09lu",
11846 (long long unsigned)in->snap_btime.sec(),
11847 (long unsigned)in->snap_btime.nsec());
11848}
11849
7c673cae
FG
// Helpers for building the virtual-xattr name strings ("ceph.<type>.<name>").
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only vxattr entry whose getter is _vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  exists_cb: NULL, \
  flags: 0, \
}
// Same as XATTR_NAME_CEPH, but with explicit flags (e.g. VXATTR_RSTAT to
// force an rstat-refreshing getattr before reading).
#define XATTR_NAME_CEPH2(_type, _name, _flags) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  exists_cb: NULL, \
  flags: _flags, \
}
// Writable layout-field entry ("ceph.<type>.layout.<field>"), gated on
// the inode actually having a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
  flags: 0, \
}
// Writable quota-field entry, gated on a quota being set on the inode.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
  flags: 0, \
}
11885
// Virtual xattrs available on directories.  Matched linearly by
// _match_vxattr(); the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // recursive stats need a fresh rstat from the MDS (VXATTR_RSTAT)
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11932
// Virtual xattrs exposed on regular-file inodes.  Same sentinel
// convention as _dir_vxattrs: the empty-name entry terminates the
// table for _match_vxattr()'s linear scan.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    // Snapshot birth time is immutable, hence readonly.
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11955
11956const Client::VXattr *Client::_get_vxattrs(Inode *in)
11957{
11958 if (in->is_dir())
11959 return _dir_vxattrs;
11960 else if (in->is_file())
11961 return _file_vxattrs;
11962 return NULL;
11963}
11964
11965const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11966{
11967 if (strncmp(name, "ceph.", 5) == 0) {
11968 const VXattr *vxattr = _get_vxattrs(in);
11969 if (vxattr) {
11970 while (!vxattr->name.empty()) {
11971 if (vxattr->name == name)
11972 return vxattr;
11973 vxattr++;
11974 }
11975 }
11976 }
11977 return NULL;
11978}
11979
7c673cae
FG
// Low-level readlink: copy the symlink target of *in into buf (up to
// buflen bytes).  Returns the number of bytes copied or a negative
// errno.  Takes client_lock for the whole call.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Refresh LRU position of every dentry pointing at this inode so a
  // recently-read symlink is not trimmed from the cache prematurely.
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
12001
12002int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12003 const UserPerm& perms, InodeRef *inp)
12004{
1adf2230 12005 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
12006 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12007 << ", gid " << perms.gid() << ")" << dendl;
12008
12009 if (strlen(name) > NAME_MAX)
12010 return -ENAMETOOLONG;
12011
12012 if (dir->snapid != CEPH_NOSNAP) {
12013 return -EROFS;
12014 }
12015 if (is_quota_files_exceeded(dir, perms)) {
12016 return -EDQUOT;
12017 }
12018
12019 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12020
12021 filepath path;
12022 dir->make_nosnap_relative_path(path);
12023 path.push_dentry(name);
12024 req->set_filepath(path);
12025 req->set_inode(dir);
12026 req->head.args.mknod.rdev = rdev;
12027 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12028 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12029
12030 bufferlist xattrs_bl;
12031 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12032 if (res < 0)
12033 goto fail;
12034 req->head.args.mknod.mode = mode;
12035 if (xattrs_bl.length() > 0)
12036 req->set_data(xattrs_bl);
12037
12038 Dentry *de;
12039 res = get_or_create(dir, name, &de);
12040 if (res < 0)
12041 goto fail;
12042 req->set_dentry(de);
12043
12044 res = make_request(req, perms, inp);
12045
12046 trim_cache();
12047
1adf2230 12048 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
12049 return res;
12050
12051 fail:
12052 put_request(req);
12053 return res;
12054}
12055
// Low-level mknod wrapper: optional permission check, then _mknod(),
// then fill *attr and take an ll reference on the new inode.
// NOTE(review): on failure *out is still assigned (from an empty
// InodeRef, i.e. NULL) and attr->st_ino is traced without having been
// filled — callers are expected to check r first; confirm against
// libcephfs usage.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // Only enforce client-side permissions when FUSE isn't already
  // doing the checks (fuse_default_permissions).
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12092
// statx variant of ll_mknod: same flow, but fills a ceph_statx with
// the fields requested via want/flags instead of a struct stat.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // Translate statx want/flags into an internal caps mask before
  // taking the lock.
  unsigned caps = statx_to_mask(flags, want);
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12131
// Create (and optionally open) a regular file 'name' in 'dir' via
// CEPH_MDS_OP_CREATE.  Layout parameters (stripe_unit/count,
// object_size, data_pool) are passed through to the MDS; *created is
// set by make_request() if the file was newly created.  If fhp is
// non-NULL the file is also opened and *fhp receives the handle.
// Returns 0 or a negative errno.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve an explicit data pool name to its id; reject unknown
  // pools and ids that don't fit the 32-bit wire field.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Default ACLs on the parent may adjust 'mode' and contribute
  // initial xattrs for the new file.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Early failure: the request was never submitted, so drop our ref.
  put_request(req);
  return res;
}
12226
12227
// Create directory 'name' under 'dir'.  When 'dir' is the special
// snapdir this instead creates a snapshot (CEPH_MDS_OP_MKSNAP).
// On success *inp refers to the new inode.  Returns 0 or -errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable only in the live tree or in the snapdir (for mksnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Default ACLs on the parent may adjust 'mode' and contribute
  // initial xattrs for the new directory.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12283
// Low-level mkdir wrapper: optional permission check, then _mkdir(),
// then fill *attr and take an ll reference on the new inode.
// NOTE(review): like ll_mknod, attr/st_ino is traced and *out is
// assigned even on failure — callers must check r first.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12318
// statx variant of ll_mkdir.  Unlike the stat variant, this one
// explicitly zeroes stx_ino/stx_mask on failure before tracing them.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // caller owns an ll reference on success
  } else {
    // Make the trace/debug output well-defined on failure.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12357
12358int Client::_symlink(Inode *dir, const char *name, const char *target,
12359 const UserPerm& perms, InodeRef *inp)
12360{
1adf2230 12361 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
12362 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12363 << dendl;
12364
12365 if (strlen(name) > NAME_MAX)
12366 return -ENAMETOOLONG;
12367
12368 if (dir->snapid != CEPH_NOSNAP) {
12369 return -EROFS;
12370 }
12371 if (is_quota_files_exceeded(dir, perms)) {
12372 return -EDQUOT;
12373 }
12374
12375 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12376
12377 filepath path;
12378 dir->make_nosnap_relative_path(path);
12379 path.push_dentry(name);
12380 req->set_filepath(path);
12381 req->set_inode(dir);
12382 req->set_string2(target);
12383 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12384 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12385
12386 Dentry *de;
12387 int res = get_or_create(dir, name, &de);
12388 if (res < 0)
12389 goto fail;
12390 req->set_dentry(de);
12391
12392 res = make_request(req, perms, inp);
12393
12394 trim_cache();
1adf2230 12395 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
12396 res << dendl;
12397 return res;
12398
12399 fail:
12400 put_request(req);
12401 return res;
12402}
12403
// Low-level symlink wrapper: optional permission check, then
// _symlink(), then fill *attr and take an ll reference on the new
// inode.  Same failure-path caveat as ll_mknod/ll_mkdir.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12439
// statx variant of ll_symlink: fills a ceph_statx with the fields
// requested via want/flags instead of a struct stat.
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // caller owns an ll reference on success
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12476
12477int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12478{
1adf2230 12479 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12480 << " uid " << perm.uid() << " gid " << perm.gid()
12481 << ")" << dendl;
12482
12483 if (dir->snapid != CEPH_NOSNAP) {
12484 return -EROFS;
12485 }
12486
12487 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12488
12489 filepath path;
12490 dir->make_nosnap_relative_path(path);
12491 path.push_dentry(name);
12492 req->set_filepath(path);
12493
12494 InodeRef otherin;
b32b8144 12495 Inode *in;
7c673cae 12496 Dentry *de;
b32b8144 12497
7c673cae
FG
12498 int res = get_or_create(dir, name, &de);
12499 if (res < 0)
12500 goto fail;
12501 req->set_dentry(de);
12502 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12503 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12504
12505 res = _lookup(dir, name, 0, &otherin, perm);
12506 if (res < 0)
12507 goto fail;
b32b8144
FG
12508
12509 in = otherin.get();
12510 req->set_other_inode(in);
12511 in->break_all_delegs();
7c673cae
FG
12512 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12513
12514 req->set_inode(dir);
12515
12516 res = make_request(req, perm);
12517
12518 trim_cache();
1adf2230 12519 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12520 return res;
12521
12522 fail:
12523 put_request(req);
12524 return res;
12525}
12526
// Low-level unlink: trace, optional client-side permission check
// (may_delete), then delegate to _unlink().
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
12548
12549int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12550{
1adf2230 12551 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12552 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12553
12554 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12555 return -EROFS;
12556 }
b32b8144
FG
12557
12558 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12559 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12560 filepath path;
12561 dir->make_nosnap_relative_path(path);
12562 path.push_dentry(name);
12563 req->set_filepath(path);
11fdf7f2 12564 req->set_inode(dir);
7c673cae
FG
12565
12566 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12567 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12568 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12569
12570 InodeRef in;
12571
12572 Dentry *de;
12573 int res = get_or_create(dir, name, &de);
12574 if (res < 0)
12575 goto fail;
b32b8144
FG
12576 if (op == CEPH_MDS_OP_RMDIR)
12577 req->set_dentry(de);
12578 else
12579 de->get();
12580
7c673cae
FG
12581 res = _lookup(dir, name, 0, &in, perms);
12582 if (res < 0)
12583 goto fail;
11fdf7f2
TL
12584
12585 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 12586 unlink(de, true, true);
b32b8144 12587 de->put();
7c673cae 12588 }
11fdf7f2 12589 req->set_other_inode(in.get());
7c673cae
FG
12590
12591 res = make_request(req, perms);
12592
12593 trim_cache();
1adf2230 12594 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12595 return res;
12596
12597 fail:
12598 put_request(req);
12599 return res;
12600}
12601
// Low-level rmdir: trace, optional client-side permission check
// (may_delete), then delegate to _rmdir().
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
12624
12625int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12626{
1adf2230 12627 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12628 << todir->ino << " " << toname
12629 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12630 << dendl;
12631
12632 if (fromdir->snapid != todir->snapid)
12633 return -EXDEV;
12634
12635 int op = CEPH_MDS_OP_RENAME;
12636 if (fromdir->snapid != CEPH_NOSNAP) {
12637 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12638 op = CEPH_MDS_OP_RENAMESNAP;
12639 else
12640 return -EROFS;
12641 }
7c673cae
FG
12642
12643 InodeRef target;
12644 MetaRequest *req = new MetaRequest(op);
12645
12646 filepath from;
12647 fromdir->make_nosnap_relative_path(from);
12648 from.push_dentry(fromname);
12649 filepath to;
12650 todir->make_nosnap_relative_path(to);
12651 to.push_dentry(toname);
12652 req->set_filepath(to);
12653 req->set_filepath2(from);
12654
12655 Dentry *oldde;
12656 int res = get_or_create(fromdir, fromname, &oldde);
12657 if (res < 0)
12658 goto fail;
12659 Dentry *de;
12660 res = get_or_create(todir, toname, &de);
12661 if (res < 0)
12662 goto fail;
12663
12664 if (op == CEPH_MDS_OP_RENAME) {
12665 req->set_old_dentry(oldde);
12666 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12667 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12668
12669 req->set_dentry(de);
12670 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12671 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12672
12673 InodeRef oldin, otherin;
9f95a23c
TL
12674 Inode *fromdir_root = nullptr;
12675 Inode *todir_root = nullptr;
12676 int mask = 0;
12677 bool quota_check = false;
12678 if (fromdir != todir) {
12679 fromdir_root =
12680 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12681 todir_root =
12682 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12683
12684 if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
12685 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12686 // to auth MDS to get latest rstat for todir_root and source dir
12687 // even if their dentry caches and inode caps are satisfied.
12688 res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
12689 if (res < 0)
12690 goto fail;
12691
12692 quota_check = true;
12693 if (oldde->inode && oldde->inode->is_dir()) {
12694 mask |= CEPH_STAT_RSTAT;
12695 }
12696 }
12697 }
12698
12699 res = _lookup(fromdir, fromname, mask, &oldin, perm);
7c673cae
FG
12700 if (res < 0)
12701 goto fail;
b32b8144
FG
12702
12703 Inode *oldinode = oldin.get();
12704 oldinode->break_all_delegs();
12705 req->set_old_inode(oldinode);
7c673cae
FG
12706 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12707
9f95a23c
TL
12708 if (quota_check) {
12709 int64_t old_bytes, old_files;
12710 if (oldinode->is_dir()) {
12711 old_bytes = oldinode->rstat.rbytes;
12712 old_files = oldinode->rstat.rsize();
12713 } else {
12714 old_bytes = oldinode->size;
12715 old_files = 1;
12716 }
12717
12718 bool quota_exceed = false;
12719 if (todir_root && todir_root->quota.max_bytes &&
12720 (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
12721 ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
12722 << old_bytes << ") to (" << todir->ino
12723 << ") will exceed quota on " << *todir_root << dendl;
12724 quota_exceed = true;
12725 }
12726
12727 if (todir_root && todir_root->quota.max_files &&
12728 (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
12729 ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
12730 << old_files << ") to (" << todir->ino
12731 << ") will exceed quota on " << *todir_root << dendl;
12732 quota_exceed = true;
12733 }
12734
12735 if (quota_exceed) {
12736 res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
12737 goto fail;
12738 }
12739 }
12740
7c673cae 12741 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12742 switch (res) {
12743 case 0:
12744 {
12745 Inode *in = otherin.get();
12746 req->set_other_inode(in);
12747 in->break_all_delegs();
12748 }
7c673cae 12749 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12750 break;
12751 case -ENOENT:
12752 break;
12753 default:
12754 goto fail;
7c673cae
FG
12755 }
12756
12757 req->set_inode(todir);
12758 } else {
12759 // renamesnap reply contains no tracedn, so we need to invalidate
12760 // dentry manually
12761 unlink(oldde, true, true);
12762 unlink(de, true, true);
11fdf7f2
TL
12763
12764 req->set_inode(todir);
7c673cae
FG
12765 }
12766
12767 res = make_request(req, perm, &target);
12768 ldout(cct, 10) << "rename result is " << res << dendl;
12769
12770 // renamed item from our cache
12771
12772 trim_cache();
1adf2230 12773 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12774 return res;
12775
12776 fail:
12777 put_request(req);
12778 return res;
12779}
12780
// Low-level rename: optional permission checks on both the source
// entry and the (possibly absent) destination entry, then _rename().
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
	    << vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    // A missing destination is not an error — rename may create it.
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
12811
12812int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12813{
1adf2230 12814 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12815 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12816
12817 if (strlen(newname) > NAME_MAX)
12818 return -ENAMETOOLONG;
12819
12820 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12821 return -EROFS;
12822 }
12823 if (is_quota_files_exceeded(dir, perm)) {
12824 return -EDQUOT;
12825 }
12826
b32b8144 12827 in->break_all_delegs();
7c673cae
FG
12828 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12829
12830 filepath path(newname, dir->ino);
12831 req->set_filepath(path);
12832 filepath existing(in->ino);
12833 req->set_filepath2(existing);
12834
12835 req->set_inode(dir);
12836 req->inode_drop = CEPH_CAP_FILE_SHARED;
12837 req->inode_unless = CEPH_CAP_FILE_EXCL;
12838
12839 Dentry *de;
12840 int res = get_or_create(dir, newname, &de);
12841 if (res < 0)
12842 goto fail;
12843 req->set_dentry(de);
12844
12845 res = make_request(req, perm, inp);
12846 ldout(cct, 10) << "link result is " << res << dendl;
12847
12848 trim_cache();
1adf2230 12849 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12850 return res;
12851
12852 fail:
12853 put_request(req);
12854 return res;
12855}
12856
// Low-level hard link: refuse directories, check hardlink and create
// permissions when the client enforces them, then delegate to _link().
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  InodeRef target;

  if (!fuse_default_permissions) {
    // POSIX forbids hard links to directories.
    if (S_ISDIR(in->mode))
      return -EPERM;

    int r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
12892
12893int Client::ll_num_osds(void)
12894{
11fdf7f2 12895 std::lock_guard lock(client_lock);
7c673cae
FG
12896 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12897}
12898
12899int Client::ll_osdaddr(int osd, uint32_t *addr)
12900{
11fdf7f2 12901 std::lock_guard lock(client_lock);
181888fb 12902
7c673cae
FG
12903 entity_addr_t g;
12904 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12905 if (!o.exists(osd))
12906 return false;
11fdf7f2 12907 g = o.get_addrs(osd).front();
7c673cae
FG
12908 return true;
12909 });
12910 if (!exists)
12911 return -1;
12912 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12913 *addr = ntohl(nb_addr);
12914 return 0;
12915}
181888fb 12916
7c673cae
FG
12917uint32_t Client::ll_stripe_unit(Inode *in)
12918{
11fdf7f2 12919 std::lock_guard lock(client_lock);
7c673cae
FG
12920 return in->layout.stripe_unit;
12921}
12922
12923uint64_t Client::ll_snap_seq(Inode *in)
12924{
11fdf7f2 12925 std::lock_guard lock(client_lock);
7c673cae
FG
12926 return in->snaprealm->seq;
12927}
12928
12929int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12930{
11fdf7f2 12931 std::lock_guard lock(client_lock);
7c673cae
FG
12932 *layout = in->layout;
12933 return 0;
12934}
12935
// Fh overload: delegate to the Inode* variant via the handle's inode.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12940
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that. An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */

// Map a logical block number of the file to the primary OSD that
// holds it, using the supplied layout.  Returns the primary OSD id
// (as reported by pg_to_acting_osds).
// NOTE(review): stripe_count == 0 is guarded, but object_size / su
// divides by the layout's stripe_unit unchecked — presumably layouts
// always carry a non-zero stripe unit; confirm against layout
// validation in the MDS/client before relying on this with
// caller-supplied layouts.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count;    // which horizontal stripe (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  // Compute the object name, map it to a PG, and ask the OSDMap for
  // that PG's acting set; the primary is what we return.
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12978
/* Return the offset of the block, internal to the object */

// Given a file block number, compute the byte offset of that block
// within the RADOS object that stores it (blocks cycle through
// stripes_per_object slots of stripe_unit bytes each).
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
12991
// Low-level opendir: permission-check (unless FUSE does it for us) and
// open a directory stream on 'in', returning it through *dirpp.
// Returns 0 or a negative errno (-ENOTCONN if unmounting).
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // When the FUSE layer is not enforcing permissions, do it here.
  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
13019
// Close a directory stream previously returned by ll_opendir.
// Always returns 0 once past the unmounting check.
int Client::ll_releasedir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  _closedir(dirp);
  return 0;
}
13033
// fsync a directory stream: flush the underlying directory inode
// (syncdataonly=false, i.e. metadata included).
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(dirp->inode.get(), false);
}
13046
// Low-level open of an existing inode (creation goes through
// ll_create/_ll_create — O_CREAT is asserted away here). On success the
// new Fh is tracked in ll_unclosed_fh_set for leak accounting.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Even on the permission-failure path, record any handle that was
  // produced so it is not leaked.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13082
// Shared implementation behind ll_create/ll_createx: look the name up,
// honour O_CREAT/O_EXCL semantics, create if needed, permission-check,
// and open a handle. Caller must hold client_lock.
//   in  - receives a ref to the (possibly new) inode on success
//   caps - caps mask requested for the lookup (statx-derived)
//   fhp - receives the open handle; may come back NULL if _open not run
// Returns 0 or negative errno (-EEXIST for O_CREAT|O_EXCL on existing).
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // Existing file + O_CREAT|O_EXCL -> fail per POSIX.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: check open permission and open a handle if
    // _create did not already supply one.
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Any handle produced (success or failure) is tracked for cleanup.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13164
// Public ll_create (struct stat flavor): create/open via _ll_create,
// fill *attr, and optionally hand back a referenced Inode via *outp.
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, struct stat *attr, Inode **outp, Fh **fhp,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
		      fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    // Signal "no inode" to the caller on failure.
    attr->st_ino = 0;
  }

  return r;
}
13192
// Public ll_create (statx flavor): like ll_create but fills a
// ceph_statx limited to the caps derived from want/lflags.
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
		       int oflags, Inode **outp, Fh **fhp,
		       struct ceph_statx *stx, unsigned want, unsigned lflags,
		       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  std::lock_guard lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    // Failure: make the statx clearly empty.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
13222
// Low-level lseek on an open handle; returns the new offset or a
// negative errno (-ENOTCONN while unmounting).
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _lseek(fh, offset, whence);
}
13235
// Low-level read into a bufferlist. Returns bytes read or negative
// errno; the request length is clamped so the int return can't overflow.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  return _read(fh, off, len, bl);
}
13252
// Read a whole (or partial) RADOS object belonging to the file,
// bypassing the page cache: issues a raw objecter read and copies the
// result into 'buf'. Returns bytes read or negative errno.
// The client_lock is dropped while waiting for the OSD reply.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Don't hold the client lock across the network round trip.
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    // NOTE(review): assumes 'buf' has room for the full reply
    // (<= 'length' bytes) — caller's contract.
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13289
13290/* It appears that the OSD doesn't return success unless the entire
13291 buffer was written, return the write length on success. */
13292
13293int Client::ll_write_block(Inode *in, uint64_t blockid,
13294 char* buf, uint64_t offset,
13295 uint64_t length, file_layout_t* layout,
13296 uint64_t snapseq, uint32_t sync)
13297{
7c673cae 13298 vinodeno_t vino = ll_get_vino(in);
7c673cae 13299 int r = 0;
11fdf7f2
TL
13300 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13301
7c673cae
FG
13302 if (length == 0) {
13303 return -EINVAL;
13304 }
13305 if (true || sync) {
13306 /* if write is stable, the epilogue is waiting on
13307 * flock */
11fdf7f2 13308 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
13309 }
13310 object_t oid = file_object_t(vino.ino, blockid);
13311 SnapContext fakesnap;
11fdf7f2
TL
13312 ceph::bufferlist bl;
13313 if (length > 0) {
13314 bl.push_back(buffer::copy(buf, length));
13315 }
7c673cae
FG
13316
13317 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13318 << dendl;
13319
13320 fakesnap.seq = snapseq;
13321
13322 /* lock just in time */
9f95a23c 13323 client_lock.lock();
181888fb 13324 if (unmounting) {
9f95a23c 13325 client_lock.unlock();
181888fb
FG
13326 return -ENOTCONN;
13327 }
7c673cae
FG
13328
13329 objecter->write(oid,
13330 object_locator_t(layout->pool_id),
13331 offset,
13332 length,
13333 fakesnap,
13334 bl,
13335 ceph::real_clock::now(),
13336 0,
11fdf7f2 13337 onsafe.get());
7c673cae 13338
9f95a23c 13339 client_lock.unlock();
11fdf7f2
TL
13340 if (nullptr != onsafe) {
13341 r = onsafe->wait();
7c673cae
FG
13342 }
13343
13344 if (r < 0) {
13345 return r;
13346 } else {
13347 return length;
13348 }
13349}
13350
// Stub: the barrier-based block-commit machinery is disabled (the whole
// body below is commented out), so this currently always returns 0.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13376
// Low-level write through the normal write path (_write). Returns bytes
// written or negative errno; len is clamped to INT_MAX so the int
// return cannot overflow.
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
13397
11fdf7f2
TL
// Vectored write at offset 'off' (write=true, no iovec clamping here;
// handled inside _preadv_pwritev_locked).
int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}
13405
// Vectored read at offset 'off' (write=false).
int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}
13413
7c673cae
FG
// Flush buffered data for an open handle (close-time flush semantics).
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flush(fh);
}
13426
// fsync an open handle. On error, the pending async error is consumed
// from the Fh so it is reported exactly once.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
13444
28e407b8
AA
// fsync by inode rather than by handle (used e.g. by NFS-Ganesha-style
// consumers that track inodes, not open files).
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(in, syncdataonly);
}
13457
7c673cae
FG
#ifdef FALLOC_FL_PUNCH_HOLE

// Core fallocate implementation. Supports only plain preallocation and
// FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE (hole punching must keep
// size, matching Linux semantics). Caller holds client_lock; it is
// dropped around blocking waits. Returns 0 or negative errno.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Reject any flag outside the two we understand.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // PUNCH_HOLE without KEEP_SIZE is not supported (mirrors Linux).
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation on a full pool fails fast; punching holes frees space,
  // so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota check only applies when the call can grow the file.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data + buffer cap: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);          // keep [0, offset)
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);                  // zero the hole
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // keep the tail
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise push inline data out first, then zero the range on
      // the OSDs via the filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero to commit without holding client_lock.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation mode: only the size needs to grow (CephFS is
    // thin-provisioned; no actual pre-write is issued).
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If inline data was being flushed out, wait for that too.
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // -ECANCELED means someone else already uninlined it; both cases
    // leave the inode with no inline data.
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13588
13589
// Low-level fallocate entry point: trace and delegate to _fallocate.
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}
13602
13603int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13604{
11fdf7f2
TL
13605 std::lock_guard lock(client_lock);
13606 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13607
181888fb
FG
13608 if (unmounting)
13609 return -ENOTCONN;
13610
7c673cae
FG
13611 Fh *fh = get_filehandle(fd);
13612 if (!fh)
13613 return -EBADF;
13614#if defined(__linux__) && defined(O_PATH)
13615 if (fh->flags & O_PATH)
13616 return -EBADF;
13617#endif
13618 return _fallocate(fh, mode, offset, length);
13619}
13620
// Release an open handle obtained via the ll_* API: drop it from the
// unclosed-handle tracking set and close it.
int Client::ll_release(Fh *fh)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
    dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
13637
13638int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13639{
11fdf7f2 13640 std::lock_guard lock(client_lock);
7c673cae
FG
13641
13642 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13643 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13644
181888fb
FG
13645 if (unmounting)
13646 return -ENOTCONN;
13647
7c673cae
FG
13648 return _getlk(fh, fl, owner);
13649}
13650
// POSIX advisory-lock set (F_SETLK/F_SETLKW via 'sleep') on a handle.
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _setlk(fh, fl, owner, sleep);
}
13663
// BSD flock(2)-style whole-file lock on a handle.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}
13676
b32b8144
FG
// Set the delegation-recall timeout. Must be strictly shorter than the
// MDS session autoclose interval so a stuck recall cannot outlive the
// session and get the client blacklisted.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13691
// Acquire or drop a file delegation on an open handle.
//   cmd  - CEPH_DELEGATION_NONE to drop; otherwise the delegation type
//   cb   - recall callback invoked when the MDS wants the deleg back
// Returns 0, -ENOTCONN if not mounted, -ENOMEM on allocation failure,
// or whatever Inode::set_deleg reports (-EINVAL for unknown cmds).
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg allocates tracking state; map OOM to errno.
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13718
7c673cae
FG
// Finisher context that interrupts an in-flight SETFILELOCK request.
// Holds a ref on the MetaRequest from construction until finish() runs
// so the request cannot be freed underneath the finisher thread.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request until finish()
  }
  void finish(int r) override {
    // Runs on the finisher thread; take the client lock ourselves.
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the ctor
  }
};
13734
// Entry point for FUSE interrupt handling: queue the actual interrupt
// work onto the finisher (we may be called from a context where taking
// client_lock directly is not safe).
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13742
13743// =========================================
13744// layout
13745
13746// expose file layouts
13747
// Resolve a path and copy its file layout into *lp.
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
13767
// fd-based variant of describe_layout: copy the open file's layout
// into *lp. Returns 0, -EBADF, or -ENOTCONN.
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}
13785
d2e6a577
FG
// Return the id of the filesystem's default data pool (the first data
// pool in the MDSMap), or -ENOTCONN while unmounting.
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
13796
13797// expose osdmap
13798
// Look up a pool id by name in the current OSDMap. Returns the id, a
// negative errno from the lookup, or -ENOTCONN while unmounting.
int64_t Client::get_pool_id(const char *pool_name)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}
13809
13810string Client::get_pool_name(int64_t pool)
13811{
11fdf7f2 13812 std::lock_guard lock(client_lock);
181888fb
FG
13813
13814 if (unmounting)
13815 return string();
13816
7c673cae
FG
13817 return objecter->with_osdmap([pool](const OSDMap& o) {
13818 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13819 });
13820}
13821
13822int Client::get_pool_replication(int64_t pool)
13823{
11fdf7f2 13824 std::lock_guard lock(client_lock);
181888fb
FG
13825
13826 if (unmounting)
13827 return -ENOTCONN;
13828
7c673cae
FG
13829 return objecter->with_osdmap([pool](const OSDMap& o) {
13830 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13831 });
13832}
13833
// For the byte at 'off' in the open file 'fd', report the acting OSD
// set holding it, and (optionally, via *len) how many bytes remain in
// the same stripe unit. Returns 0, -EBADF, -EINVAL (no acting OSDs),
// or -ENOTCONN.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map exactly one byte so we get exactly one extent back.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13879
// Report an OSD's full CRUSH location as an ordered list of
// (bucket-type, name) pairs. Returns 0 or negative errno.
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  if (id < 0)
    return -EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
13893
// For the byte at 'offset' of open file 'fd', append the addresses of
// all acting OSDs holding that stripe to 'address'. Returns 0, -EBADF,
// -EINVAL (no acting OSDs), or -ENOTCONN.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  Map exactly one byte to get a single extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
13927
13928int Client::get_osd_addr(int osd, entity_addr_t& addr)
13929{
11fdf7f2 13930 std::lock_guard lock(client_lock);
181888fb
FG
13931
13932 if (unmounting)
13933 return -ENOTCONN;
13934
7c673cae
FG
13935 return objecter->with_osdmap([&](const OSDMap& o) {
13936 if (!o.exists(osd))
13937 return -ENOENT;
13938
11fdf7f2 13939 addr = o.get_addrs(osd).front();
7c673cae
FG
13940 return 0;
13941 });
13942}
13943
// Map a byte range of an open file to its list of object extents.
// Returns 0, -EBADF, or -ENOTCONN.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
13963
13964
/* find an osd with the same ip.  -ENXIO if none. */

// The lookup result is cached per OSDMap epoch (local_osd /
// local_osd_epoch) and only recomputed when the map changes.
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13981
13982
13983
13984
13985
13986
13987// ===============================
13988
// Messenger callback: a connection was established. Purely
// informational — no client state to update here.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
13993
// Messenger callback: our side of a connection was reset. Returning
// false tells the messenger we did not handle it (no session cleanup
// happens here; see ms_handle_remote_reset for the peer-reset case).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
13999
// Messenger callback: the peer reset the connection. For MDS peers,
// locate the affected session by address and react according to its
// state (close, retry open, or mark stale / reconnect).
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // Carry the open-waiters over to the retried session so
	    // their contexts still fire when it opens.
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
14059
14060bool Client::ms_handle_refused(Connection *con)
14061{
11fdf7f2 14062 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
14063 return false;
14064}
14065
7c673cae
FG
14066Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14067{
11fdf7f2
TL
14068 Inode *quota_in = root_ancestor;
14069 SnapRealm *realm = in->snaprealm;
14070 while (realm) {
14071 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14072 if (realm->ino != in->ino) {
14073 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14074 if (p == inode_map.end())
14075 break;
7c673cae 14076
11fdf7f2
TL
14077 if (p->second->quota.is_enable()) {
14078 quota_in = p->second;
14079 break;
7c673cae 14080 }
7c673cae 14081 }
11fdf7f2 14082 realm = realm->pparent;
7c673cae 14083 }
11fdf7f2
TL
14084 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14085 return quota_in;
7c673cae
FG
14086}
14087
14088/**
14089 * Traverse quota ancestors of the Inode, return true
14090 * if any of them passes the passed function
14091 */
14092bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14093 std::function<bool (const Inode &in)> test)
14094{
14095 while (true) {
11fdf7f2 14096 ceph_assert(in != NULL);
7c673cae
FG
14097 if (test(*in)) {
14098 return true;
14099 }
14100
14101 if (in == root_ancestor) {
14102 // We're done traversing, drop out
14103 return false;
14104 } else {
14105 // Continue up the tree
14106 in = get_quota_root(in, perms);
14107 }
14108 }
14109
14110 return false;
14111}
14112
14113bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14114{
14115 return check_quota_condition(in, perms,
14116 [](const Inode &in) {
14117 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14118 });
14119}
14120
14121bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14122 const UserPerm& perms)
7c673cae
FG
14123{
14124 return check_quota_condition(in, perms,
11fdf7f2 14125 [&new_bytes](const Inode &in) {
7c673cae
FG
14126 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14127 > in.quota.max_bytes;
14128 });
14129}
14130
/**
 * Return true if any quota ancestor of *in is close to its byte quota:
 * either already at/over max_bytes, or the bytes written locally since
 * the last size report to the MDS exceed 1/16 of the remaining headroom.
 * Used to decide when to report size changes eagerly.
 */
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  ceph_assert(in->size >= in->reported_size);
  // bytes the MDS does not know about yet
  const uint64_t size = in->size - in->reported_size;
  return check_quota_condition(in, perms,
      [&size](const Inode &in) {
        if (in.quota.max_bytes) {
          if (in.rstat.rbytes >= in.quota.max_bytes) {
            return true;
          }

          // "approaching" = unreported growth exceeds 1/16 of the
          // remaining quota headroom
          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
          return (space >> 4) < size;
        } else {
          return false;
        }
      });
}
14149
// Bit states for the per-pool permission cache (pool_perms), used by
// check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe completed for this pool
  POOL_CHECKING = 2,  // probe in flight; others wait on waiting_for_pool_perm
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14156
/**
 * Verify the client's OSD caps allow the access in `need` (a mask of
 * CEPH_CAP_FILE_RD / CEPH_CAP_FILE_WR) on the data pool of inode *in.
 *
 * Results are cached per (pool id, pool namespace) in pool_perms.  The
 * first caller probes the pool by issuing a stat (read probe) and an
 * exclusive create (write probe) on the inode's first object; concurrent
 * callers block on waiting_for_pool_perm until the probe finishes.
 *
 * Called with client_lock held; the lock is dropped while waiting on the
 * OSD round trips.  Returns 0 when permitted, -EPERM when the needed
 * access is denied, -EIO when the probe failed indeterminately.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the probe as in progress so concurrent callers wait above
    pool_perms[perm_key] = POOL_CHECKING;

    // first object of the file: "<ino-in-hex>.00000000"
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: a stat on the first object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: an exclusive create (-EEXIST also proves write perm)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop client_lock across the OSD round trips
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // -ENOENT means the op executed (object missing), so read is allowed
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // cache the result and wake up anyone who waited on the probe
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14259
14260int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14261{
14262 if (acl_type == POSIX_ACL) {
14263 if (in->xattrs.count(ACL_EA_ACCESS)) {
14264 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14265
14266 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14267 }
14268 }
14269 return -EAGAIN;
14270}
14271
/**
 * Propagate a chmod into the inode's cached POSIX access ACL, rewriting
 * the ACL_EA_ACCESS xattr so its permission entries stay consistent with
 * the new mode bits.  Returns 0 on success (or when there is nothing to
 * do) and a negative error code otherwise.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // refresh cached xattrs; force a fetch if they were never loaded
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a copy: posix_acl_access_chmod edits the buffer in place
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;  // no access ACL cached; nothing to rewrite
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14297
/**
 * Compute the initial ACL xattrs for a new entry created under *dir,
 * applying the directory's default POSIX ACL (if any) and adjusting
 * *mode accordingly; the encoded xattr map (if non-empty) is appended
 * to xattrs_bl.  Returns the number of xattrs encoded (0, 1 or 2) or a
 * negative error code.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // refresh the directory's cached xattrs; force a fetch if never loaded
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // inherit from the default ACL; may also adjust the mode bits
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// keep the inherited ACL as the access ACL only when it is not
	// fully representable in the mode bits (posix_acl_equiv_mode
	// returns > 0 in that case)
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also propagate the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: apply the umask callback instead, if registered
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14345
14346void Client::set_filer_flags(int flags)
14347{
11fdf7f2
TL
14348 std::lock_guard l(client_lock);
14349 ceph_assert(flags == 0 ||
7c673cae
FG
14350 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14351 objecter->add_global_op_flags(flags);
14352}
14353
14354void Client::clear_filer_flags(int flags)
14355{
11fdf7f2
TL
14356 std::lock_guard l(client_lock);
14357 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
14358 objecter->clear_global_op_flag(flags);
14359}
14360
11fdf7f2
TL
14361// called before mount
14362void Client::set_uuid(const std::string& uuid)
14363{
14364 std::lock_guard l(client_lock);
14365 assert(initialized);
14366 assert(!uuid.empty());
14367
14368 metadata["uuid"] = uuid;
14369 _close_sessions();
14370}
14371
14372// called before mount. 0 means infinite
14373void Client::set_session_timeout(unsigned timeout)
14374{
14375 std::lock_guard l(client_lock);
14376 assert(initialized);
14377
14378 metadata["timeout"] = stringify(timeout);
14379}
14380
// called before mount
/**
 * Reclaim the MDS sessions of a dead client instance identified by
 * `uuid` (e.g. after an NFS-Ganesha failover) so this client can take
 * over its state.  flags may include CEPH_RECLAIM_RESET.  Blocks while
 * waiting for mdsmap updates, session opens and reclaim replies.
 * Returns 0 on success or a negative error code.
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // refuse to reclaim our own identity
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // wait for an initial mdsmap
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  // Ask every in-map MDS to reclaim the old session.  Only advance to the
  // next rank once the current one answered (OK moves on, FAIL aborts).
  reclaim_errno = 0;
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // GNU `?:` — keep reclaim_errno when it is nonzero
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    // drop client_lock across the osdmap wait
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember what we are reclaiming; finish_reclaim() promotes it to "uuid"
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14479
14480void Client::finish_reclaim()
14481{
14482 auto it = metadata.find("reclaiming_uuid");
14483 if (it == metadata.end()) {
14484 for (auto &p : mds_sessions)
14485 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14486 return;
14487 }
14488
14489 for (auto &p : mds_sessions) {
14490 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
9f95a23c 14491 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
11fdf7f2
TL
14492 p.second.con->send_message2(std::move(m));
14493 }
14494
14495 metadata["uuid"] = it->second;
14496 metadata.erase(it);
14497}
14498
14499void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14500{
14501 mds_rank_t from = mds_rank_t(reply->get_source().num());
14502 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14503
14504 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14505 if (!session) {
14506 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14507 return;
14508 }
14509
14510 if (reply->get_result() >= 0) {
14511 session->reclaim_state = MetaSession::RECLAIM_OK;
14512 if (reply->get_epoch() > reclaim_osd_epoch)
14513 reclaim_osd_epoch = reply->get_epoch();
14514 if (!reply->get_addrs().empty())
14515 reclaim_target_addrs = reply->get_addrs();
14516 } else {
14517 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14518 reclaim_errno = reply->get_result();
14519 }
14520
14521 signal_cond_list(waiting_for_reclaim);
14522}
14523
7c673cae
FG
14524/**
14525 * This is included in cap release messages, to cause
14526 * the MDS to wait until this OSD map epoch. It is necessary
14527 * in corner cases where we cancel RADOS ops, so that
14528 * nobody else tries to do IO to the same objects in
14529 * the same epoch as the cancelled ops.
14530 */
14531void Client::set_cap_epoch_barrier(epoch_t e)
14532{
14533 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14534 cap_epoch_barrier = e;
14535}
14536
// Config options whose runtime changes should be delivered to
// handle_conf_change().  The array must stay NULL-terminated.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14549
// Observer callback for runtime changes of the options listed in
// get_tracked_conf_keys().  Only the cache midpoint and ACL type need
// action here; the other tracked keys are presumably read directly at
// their point of use — verify against their call sites.
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // any value other than "posix_acl" disables ACL handling
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}
14564
7c673cae
FG
// boost::intrusive_ptr support: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14569
// boost::intrusive_ptr support: drop a reference; the owning client
// frees the inode when the count reaches zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14574
14575mds_rank_t Client::_get_random_up_mds() const
14576{
9f95a23c 14577 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
14578
14579 std::set<mds_rank_t> up;
14580 mdsmap->get_up_mds_set(up);
14581
14582 if (up.empty())
14583 return MDS_RANK_NONE;
14584 std::set<mds_rank_t>::const_iterator p = up.begin();
14585 for (int n = rand() % up.size(); n; n--)
14586 ++p;
14587 return *p;
14588}
14589
14590
// A Client that owns its own Objecter (built on the caller's messenger
// and monclient) instead of sharing one with another component.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14597
StandaloneClient::~StandaloneClient()
{
  // the objecter was allocated in our constructor, so we own and free it
  delete objecter;
  objecter = nullptr;
}
14603
/**
 * Bring up a standalone client: start the timer, object cacher and
 * objecter, wire up dispatchers, then the monitor client, and finally
 * complete generic Client initialization.  On monclient failure the
 * partially-initialized state is unwound and its error returned.
 */
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    // drop the lock before tearing down subsystems that may take it
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14634
// Tear down in reverse dependency order: generic client state first,
// then the objecter, then the monitor client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}