]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
buildsys: use download.ceph.com to download source tar ball
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
73#include "common/Mutex.h"
74#include "common/perf_counters.h"
75#include "common/admin_socket.h"
76#include "common/errno.h"
77#include "include/str_list.h"
78
79#define dout_subsys ceph_subsys_client
80
81#include "include/lru.h"
82#include "include/compat.h"
83#include "include/stringify.h"
84
85#include "Client.h"
86#include "Inode.h"
87#include "Dentry.h"
b32b8144 88#include "Delegation.h"
7c673cae
FG
89#include "Dir.h"
90#include "ClientSnapRealm.h"
91#include "Fh.h"
92#include "MetaSession.h"
93#include "MetaRequest.h"
94#include "ObjecterWriteback.h"
95#include "posix_acl.h"
96
11fdf7f2 97#include "include/ceph_assert.h"
7c673cae
FG
98#include "include/stat.h"
99
100#include "include/cephfs/ceph_statx.h"
101
102#if HAVE_GETGROUPLIST
103#include <grp.h>
104#include <pwd.h>
105#include <unistd.h>
106#endif
107
108#undef dout_prefix
109#define dout_prefix *_dout << "client." << whoami << " "
110
111#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112
113// FreeBSD fails to define this
114#ifndef O_DSYNC
115#define O_DSYNC 0x0
116#endif
117// Darwin fails to define this
118#ifndef O_RSYNC
119#define O_RSYNC 0x0
120#endif
121
122#ifndef O_DIRECT
123#define O_DIRECT 0x0
124#endif
125
126#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129{
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132}
133
134
135// -------------
136
// Admin-socket command hook: stores a back-pointer to the owning Client
// so registered commands can be dispatched to it in call().
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
141
11fdf7f2
TL
142bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
7c673cae 145{
11fdf7f2 146 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae
FG
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
11fdf7f2 150 m_client->dump_mds_requests(f.get());
7c673cae 151 else if (command == "mds_sessions")
11fdf7f2 152 m_client->dump_mds_sessions(f.get());
7c673cae 153 else if (command == "dump_cache")
11fdf7f2 154 m_client->dump_cache(f.get());
7c673cae
FG
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
11fdf7f2 158 m_client->dump_status(f.get());
7c673cae 159 else
11fdf7f2 160 ceph_abort_msg("bad command registered");
7c673cae
FG
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
7c673cae
FG
164 return true;
165}
166
167
168// -------------
169
// Readdir cursor state for an open directory: starts at offset 0 with
// next_offset 2 (the first two slots are reserved for "." and "..").
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
175
176void Client::_reset_faked_inos()
177{
178 ino_t start = 1024;
179 free_faked_inos.clear();
180 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
181 last_used_faked_ino = 0;
11fdf7f2 182 last_used_faked_root = 0;
7c673cae
FG
183 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
184}
185
// Allocate the next free fake ino for *in, record the fake->real
// mapping, and remove the fake ino from the free pool.
void Client::_assign_faked_ino(Inode *in)
{
  // Skip the 1024~2048 band on first use; that range is reserved for
  // mount-point roots (see _assign_faked_root).
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Wrapped past the top of the free set: restart the scan just
    // above the reserved root band.
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // The next free interval starts beyond our cursor: jump to it.
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Cursor is inside the free interval: take the next number.
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
207
11fdf7f2
TL
208/*
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
214*/
215void Client::_assign_faked_root(Inode *in)
216{
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232}
233
7c673cae
FG
// Return an inode's fake ino to the free pool and drop its
// fake->real mapping.
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}
239
240vinodeno_t Client::_map_faked_ino(ino_t ino)
241{
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
250 return vino;
251}
252
// Public, locked wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}
258
259// cons/des
260
// Construct a client bound to the given messenger/monitor/objecter.
// Wires up the faked-ino allocator, vxattr tables, identity and ACL
// config, the dentry LRU, the fd allocator, and the object cacher
// before enabling blacklist events on the objecter.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // precomputed total name sizes used when listing virtual xattrs
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
311
312
// Destructor: must be called with client_lock NOT held; takes it
// internally for the duration of cache teardown.
Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that it's lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
324
325void Client::tear_down_cache()
326{
327 // fd's
328 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
329 it != fd_map.end();
330 ++it) {
331 Fh *fh = it->second;
11fdf7f2 332 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
333 _release_fh(fh);
334 }
335 fd_map.clear();
336
337 while (!opened_dirs.empty()) {
338 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 339 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
340 _closedir(dirp);
341 }
342
343 // caps!
344 // *** FIXME ***
345
346 // empty lru
7c673cae 347 trim_cache();
11fdf7f2 348 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
349
350 // close root ino
11fdf7f2 351 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
352 if (root && inode_map.size() == 1 + root_parents.size()) {
353 delete root;
354 root = 0;
355 root_ancestor = 0;
356 while (!root_parents.empty())
357 root_parents.erase(root_parents.begin());
358 inode_map.clear();
359 _reset_faked_inos();
360 }
361
11fdf7f2 362 ceph_assert(inode_map.empty());
7c673cae
FG
363}
364
// Return the root inode number — the fake ino when faked inos are in
// use, the real one otherwise.  NOTE(review): dereferences root
// unconditionally; presumably only called after mount — confirm.
inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}
373
// Return the root inode with an ll reference taken on behalf of the
// caller (caller is responsible for the matching put).
Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}
380
381
382// debug crapola
383
384void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
385{
386 filepath path;
387 in->make_long_path(path);
388 ldout(cct, 1) << "dump_inode: "
389 << (disconnected ? "DISCONNECTED ":"")
390 << "inode " << in->ino
391 << " " << path
392 << " ref " << in->get_num_ref()
393 << *in << dendl;
394
395 if (f) {
396 f->open_object_section("inode");
397 f->dump_stream("path") << path;
398 if (disconnected)
399 f->dump_int("disconnected", 1);
400 in->dump(f);
401 f->close_section();
402 }
403
404 did.insert(in);
405 if (in->dir) {
406 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
407 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
408 it != in->dir->dentries.end();
409 ++it) {
410 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
411 if (f) {
412 f->open_object_section("dentry");
413 it->second->dump(f);
414 f->close_section();
415 }
416 if (it->second->inode)
417 dump_inode(f, it->second->inode.get(), did, false);
418 }
419 }
420}
421
422void Client::dump_cache(Formatter *f)
423{
424 set<Inode*> did;
425
11fdf7f2 426 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
427
428 if (f)
429 f->open_array_section("cache");
430
431 if (root)
432 dump_inode(f, root, did, true);
433
434 // make a second pass to catch anything disconnected
435 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
436 it != inode_map.end();
437 ++it) {
438 if (did.count(it->second))
439 continue;
440 dump_inode(f, it->second, did, true);
441 }
442
443 if (f)
444 f->close_section();
445}
446
// Dump overall client status (identity, cache counters, map epochs,
// blacklist state) to the formatter.  Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // client metadata key/value pairs sent to the MDS
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
477
478int Client::init()
479{
480 timer.init();
481 objectcacher->start();
482
483 client_lock.Lock();
11fdf7f2 484 ceph_assert(!initialized);
7c673cae
FG
485
486 messenger->add_dispatcher_tail(this);
487 client_lock.Unlock();
488
489 _finish_init();
490 return 0;
491}
492
493void Client::_finish_init()
494{
495 client_lock.Lock();
496 // logger
497 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
498 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
499 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
500 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
11fdf7f2
TL
501 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
502 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
7c673cae
FG
503 logger.reset(plb.create_perf_counters());
504 cct->get_perfcounters_collection()->add(logger.get());
505
506 client_lock.Unlock();
507
11fdf7f2 508 cct->_conf.add_observer(this);
7c673cae
FG
509
510 AdminSocket* admin_socket = cct->get_admin_socket();
511 int ret = admin_socket->register_command("mds_requests",
512 "mds_requests",
513 &m_command_hook,
514 "show in-progress mds requests");
515 if (ret < 0) {
516 lderr(cct) << "error registering admin socket command: "
517 << cpp_strerror(-ret) << dendl;
518 }
519 ret = admin_socket->register_command("mds_sessions",
520 "mds_sessions",
521 &m_command_hook,
522 "show mds session state");
523 if (ret < 0) {
524 lderr(cct) << "error registering admin socket command: "
525 << cpp_strerror(-ret) << dendl;
526 }
527 ret = admin_socket->register_command("dump_cache",
528 "dump_cache",
529 &m_command_hook,
530 "show in-memory metadata cache contents");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("kick_stale_sessions",
536 "kick_stale_sessions",
537 &m_command_hook,
538 "kick sessions that were remote reset");
539 if (ret < 0) {
540 lderr(cct) << "error registering admin socket command: "
541 << cpp_strerror(-ret) << dendl;
542 }
543 ret = admin_socket->register_command("status",
544 "status",
545 &m_command_hook,
546 "show overall client status");
547 if (ret < 0) {
548 lderr(cct) << "error registering admin socket command: "
549 << cpp_strerror(-ret) << dendl;
550 }
551
552 client_lock.Lock();
553 initialized = true;
554 client_lock.Unlock();
555}
556
// Tear the client down: close MDS sessions, unregister observers and
// admin-socket commands, stop all finishers, stop the object cacher
// (outside client_lock — it joins threads), shut down the timer, and
// remove the perf counters.  Order matters throughout.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher is only running if its callback was ever registered;
  // drain before stopping so queued events complete.
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
611
612
613// ===================
614// metadata cache stuff
615
// Trim the dentry LRU down to client_cache_size (or to empty while
// unmounting), optionally asking the kernel to drop its dcache too,
// and free the root inode once nothing else references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Loop until a pass makes no progress (trim_dentry may cascade).
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
649
650void Client::trim_cache_for_reconnect(MetaSession *s)
651{
652 mds_rank_t mds = s->mds_num;
11fdf7f2 653 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
654
655 int trimmed = 0;
656 list<Dentry*> skipped;
657 while (lru.lru_get_size() > 0) {
658 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
659 if (!dn)
660 break;
661
662 if ((dn->inode && dn->inode->caps.count(mds)) ||
663 dn->dir->parent_inode->caps.count(mds)) {
664 trim_dentry(dn);
665 trimmed++;
666 } else
667 skipped.push_back(dn);
668 }
669
670 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
671 lru.lru_insert_mid(*p);
672
11fdf7f2 673 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
674 << " trimmed " << trimmed << " dentries" << dendl;
675
676 if (s->caps.size() > 0)
677 _invalidate_kernel_dcache();
678}
679
// Unlink a single dentry from the cache (dropping both its Dir ref and
// the dentry itself).  If the dentry had an inode, the parent dir's
// completeness is invalidated first.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
693
694
1adf2230
AA
// Apply size/truncation state reported by the MDS to an inode, but
// only when the MDS truncate_seq is not older than ours (seq ties are
// broken by taking the larger size).  Invalidates any cached data past
// the truncation point and trims inline data to match.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
736
737void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
738 utime_t ctime, utime_t mtime, utime_t atime)
739{
740 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
741 << " ctime " << ctime << " mtime " << mtime << dendl;
742
743 if (time_warp_seq > in->time_warp_seq)
744 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
745 << " is higher than local time_warp_seq "
746 << in->time_warp_seq << dendl;
747
748 int warn = false;
7c673cae
FG
749 // be careful with size, mtime, atime
750 if (issued & (CEPH_CAP_FILE_EXCL|
751 CEPH_CAP_FILE_WR|
752 CEPH_CAP_FILE_BUFFER|
753 CEPH_CAP_AUTH_EXCL|
754 CEPH_CAP_XATTR_EXCL)) {
755 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
756 if (ctime > in->ctime)
757 in->ctime = ctime;
758 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
759 //the mds updated times, so take those!
760 in->mtime = mtime;
761 in->atime = atime;
762 in->time_warp_seq = time_warp_seq;
763 } else if (time_warp_seq == in->time_warp_seq) {
764 //take max times
765 if (mtime > in->mtime)
766 in->mtime = mtime;
767 if (atime > in->atime)
768 in->atime = atime;
769 } else if (issued & CEPH_CAP_FILE_EXCL) {
770 //ignore mds values as we have a higher seq
771 } else warn = true;
772 } else {
773 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
774 if (time_warp_seq >= in->time_warp_seq) {
775 in->ctime = ctime;
776 in->mtime = mtime;
777 in->atime = atime;
778 in->time_warp_seq = time_warp_seq;
779 } else warn = true;
780 }
781 if (warn) {
782 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
783 << time_warp_seq << " is lower than local time_warp_seq "
784 << in->time_warp_seq
785 << dendl;
786 }
787}
788
789void Client::_fragmap_remove_non_leaves(Inode *in)
790{
791 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
792 if (!in->dirfragtree.is_leaf(p->first))
793 in->fragmap.erase(p++);
794 else
795 ++p;
796}
797
798void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
799{
800 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
801 if (p->second == mds)
802 in->fragmap.erase(p++);
803 else
804 ++p;
805}
806
// Create or refresh a cached Inode from an MDS InodeStat.  Fields are
// only overwritten when the stat is strictly newer (version) or grants
// caps we didn't already hold, and never when we hold the matching
// EXCL cap (our local state is authoritative then).  Finally the cap
// itself is installed (or snap_caps merged for snapped inodes).
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever seen becomes the root / cwd
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path walking: remember parents above the mount root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold (issued or dirty) gate which fields the MDS
  // may overwrite below
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapped inodes don't get real caps; just accumulate them
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
961
962
963/*
964 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
965 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
// If a dentry of that name already exists with a different inode it is
// unlinked first; linking a new dentry bumps the parent's ordered count
// and clears its I_DIR_ORDERED flag.  Always refreshes the dentry lease.
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
	       << " with correct vino " << dn->inode->vino()
	       << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
	       << " with WRONG vino " << dn->inode->vino()
	       << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a temporary ref so the inode can't vanish while we relink
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1011
// Refresh a dentry's MDS lease from a LeaseStat: extend the ttl (only
// forward) and record which MDS/seq/gen issued it.  Also syncs the
// dentry's cap_shared_gen with its parent dir.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
	       << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1031
1032
/*
 * update MDS location cache for a single inode
 *
 * Records which MDS rank is authoritative for the given dirfrag so
 * later requests can be routed directly (see choose_target_mds).
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // Negative auth means "unknown": forget any stale mapping.
    in->fragmap.erase(dst->frag);
  }
  // The MDS reported this frag as a leaf; force our cached fragtree to
  // agree and drop fragmap entries that are no longer leaves.
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1067
1068void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1069{
1070 if (diri->flags & I_COMPLETE) {
1071 if (complete) {
1072 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1073 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1074 } else {
1075 if (diri->flags & I_DIR_ORDERED) {
1076 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1077 diri->flags &= ~I_DIR_ORDERED;
1078 }
1079 }
1080 if (diri->dir)
1081 diri->dir->readdir_cache.clear();
1082 }
1083}
1084
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra bufferlist (DirStat + dentry/lease/inode
 * triples), links each entry into the Dir, assigns readdir offsets, and
 * maintains both the per-dir readdir_cache and the dir_result_t buffer
 * the caller iterates over.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // Newer MDSs negotiate a versioned reply encoding; otherwise fall back
  // to the feature bits of the connection itself.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // Resume point: frag we asked for, plus where the previous chunk of
    // this readdir left off.
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // Offset 2 is the first real entry (0/1 are . and ..), so an empty
    // start name implies we are at the beginning of the frag.
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // The MDS may answer for a different (e.g. refragmented) frag than
    // the one we requested; adopt it and restart offsets if unhashed.
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // Starting a full listing from the very beginning: snapshot the
    // dir's generation counters so we can tell later whether the cached
    // listing is still valid.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // Each entry is (name, dentry lease, inode stat).
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// In hash order, the per-entry counter restarts whenever the
	// name hash changes.
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      // (only while the dir hasn't changed since this listing started)
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // Remember where to resume the next chunk of this listing.
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1246
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A "trace" describes the path the MDS touched: optionally a parent
 * directory + dentry, and optionally the target inode.  Returns the
 * target Inode* (also stored in request->target), or NULL for unsafe
 * duplicates and traceless replies.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
		 << " is_target=" << (int)reply->head.is_target
		 << " is_dentry=" << (int)reply->head.is_dentry
		 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // The unsafe (early) reply already carried the trace; the safe
    // follow-up must not re-apply it.
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // Traceless reply: we can't update the cache precisely, so
    // conservatively invalidate what this request may have touched.
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // Pick the decode feature set: versioned encoding if the MDS supports
  // it, else the raw connection features.
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
		 << " is_target=" << (int)reply->head.is_target
		 << " is_dentry=" << (int)reply->head.is_dentry
		 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    // Dentry trace: parent inode stat, dir stat, name, dentry lease.
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // Debug check: if we asked for xattrs, the reply must carry them.
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // Dentry without a target inode: this is a null (negative) dentry.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    // No lease for the fabricated snapdir dentry.
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1422
1423// -------
1424
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend target, then (unless
 * client_use_random_mds is set) the rank derived from the cached dirfrag
 * map for the hashed dentry name, then the auth/any cap holder of the
 * relevant inode, and finally a random up MDS.
 *
 * phash_diri, if non-NULL, receives the dir inode whose fragmap was used
 * (so the caller can invalidate it if the chosen rank turns out to be
 * stopped).
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // A forward/resend overrides everything.
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // Derive an anchor inode (and, if possible, a dentry-name hash for
  // dirfrag-based routing) from the request.
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
		     << " on " << req->path[0]
		     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
		     << " on " << de->name
		     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // Snapped inodes carry no caps of their own; walk up to the nearest
    // non-snap ancestor and route based on that.
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
		   << " hash=" << hash << dendl;

    // First choice: the MDS that is auth for the dirfrag the hashed
    // name falls into, per our cached fragmap.
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // Second choice: whoever holds our caps on the inode (auth cap if
    // the op needs the auth MDS, otherwise any cap will do).
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1527
1528
1529void Client::connect_mds_targets(mds_rank_t mds)
1530{
11fdf7f2
TL
1531 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1532 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1533 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1534 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1535 q != info.export_targets.end();
1536 ++q) {
1537 if (mds_sessions.count(*q) == 0 &&
1538 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1539 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1540 << " export target mds." << *q << dendl;
1541 _open_mds_session(*q);
1542 }
1543 }
1544}
1545
1546void Client::dump_mds_sessions(Formatter *f)
1547{
1548 f->dump_int("id", get_nodeid().v);
11fdf7f2 1549 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1550 f->dump_object("inst", inst);
1551 f->dump_stream("inst_str") << inst;
1552 f->dump_stream("addr_str") << inst.addr;
7c673cae 1553 f->open_array_section("sessions");
11fdf7f2 1554 for (const auto &p : mds_sessions) {
7c673cae 1555 f->open_object_section("session");
11fdf7f2 1556 p.second.dump(f);
7c673cae
FG
1557 f->close_section();
1558 }
1559 f->close_section();
1560 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1561}
1562void Client::dump_mds_requests(Formatter *f)
1563{
1564 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1565 p != mds_requests.end();
1566 ++p) {
1567 f->open_object_section("request");
1568 p->second->dump(f);
1569 f->close_section();
1570 }
1571}
1572
/*
 * After a successful MDS reply, resolve the target inode the caller
 * asked for (ptarget/pcreated), compensating for traceless replies by
 * issuing a follow-up lookup or getattr.  Returns r, possibly replaced
 * by the follow-up's result or -EINTR on a create/lookup mismatch.
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us.
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // The created inode happens to already be in our cache.
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// No dentry to look up by name; refresh the request's inode
	// directly instead.
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  // Someone replaced the name between our create and lookup.
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1645
1646
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests can block indefinitely, so they are excluded
  // from the oldest-tid watermark.
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // Retry loop: keep (re)choosing an MDS and (re)sending until we get a
  // reply or the request is aborted.
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// Chosen rank no longer exists: forget the routing hint that led
	// us there, or just pick a random rank next time around.
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // Aborted without ever getting a reply: clean up our bookkeeping.
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1810
1811void Client::unregister_request(MetaRequest *req)
1812{
1813 mds_requests.erase(req->tid);
1814 if (req->tid == oldest_tid) {
1815 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1816 while (true) {
1817 if (p == mds_requests.end()) {
1818 oldest_tid = 0;
1819 break;
1820 }
1821 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1822 oldest_tid = p->first;
1823 break;
1824 }
1825 ++p;
1826 }
1827 }
1828 put_request(req);
1829}
1830
1831void Client::put_request(MetaRequest *request)
1832{
1833 if (request->_put()) {
1834 int op = -1;
1835 if (request->success)
1836 op = request->get_op();
1837 InodeRef other_in;
1838 request->take_other_inode(&other_in);
1839 delete request;
1840
1841 if (other_in &&
1842 (op == CEPH_MDS_OP_RMDIR ||
1843 op == CEPH_MDS_OP_RENAME ||
1844 op == CEPH_MDS_OP_RMSNAP)) {
1845 _try_to_trim_inode(other_in.get(), false);
1846 }
1847 }
1848}
1849
/*
 * Possibly encode a cap release for this inode into the request.
 *
 * drop   - cap bits we are willing to give up (dirty/in-use bits are
 *          masked out first)
 * unless - if we hold any of these bits, do not drop
 * force  - encode a (no-op) release record even if nothing was dropped
 *
 * Returns nonzero iff a release record was appended to req.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  // NOTE(review): the "have:" field below prints no value — looks like a
  // dropped log argument; confirm what was intended before relying on it.
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", have:" << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or currently in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      // Append the release record; dname fields may be filled in later
      // by encode_dentry_release().
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1890
/*
 * Possibly encode a dentry (lease) release into the request: release
 * caps on the parent dir inode, and if that produced a release record
 * and we hold a lease on this dentry from the same MDS, attach the
 * dentry name/seq to it so the lease is released too.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
				   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  // force=1: we want a release record for the parent dir even if no cap
  // bits were actually dropped, so the dentry lease can be attached.
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // Fill the dname fields of the record just appended above.
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
1910
1911
1912/*
1913 * This requires the MClientRequest *request member to be set.
1914 * It will error out horribly without one.
1915 * Additionally, if you set any *drop member, you'd better have
1916 * set the corresponding dentry!
1917 */
1918void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1919{
11fdf7f2 1920 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
1921 << req << ", mds: " << mds << ")" << dendl;
1922 if (req->inode_drop && req->inode())
1923 encode_inode_release(req->inode(), req,
1924 mds, req->inode_drop,
1925 req->inode_unless);
1926
1927 if (req->old_inode_drop && req->old_inode())
1928 encode_inode_release(req->old_inode(), req,
1929 mds, req->old_inode_drop,
1930 req->old_inode_unless);
1931 if (req->other_inode_drop && req->other_inode())
1932 encode_inode_release(req->other_inode(), req,
1933 mds, req->other_inode_drop,
1934 req->other_inode_unless);
1935
1936 if (req->dentry_drop && req->dentry())
1937 encode_dentry_release(req->dentry(), req,
1938 mds, req->dentry_drop,
1939 req->dentry_unless);
1940
1941 if (req->old_dentry_drop && req->old_dentry())
1942 encode_dentry_release(req->old_dentry(), req,
1943 mds, req->old_dentry_drop,
1944 req->old_dentry_unless);
11fdf7f2 1945 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
1946 << req << ", mds " << mds <<dendl;
1947}
1948
1949bool Client::have_open_session(mds_rank_t mds)
1950{
11fdf7f2
TL
1951 const auto &it = mds_sessions.find(mds);
1952 return it != mds_sessions.end() &&
1953 (it->second.state == MetaSession::STATE_OPEN ||
1954 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1955}
1956
1957MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1958{
11fdf7f2
TL
1959 const auto &it = mds_sessions.find(mds);
1960 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1961 return NULL;
11fdf7f2
TL
1962 } else {
1963 return &it->second;
1964 }
7c673cae
FG
1965}
1966
1967MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1968{
11fdf7f2
TL
1969 auto it = mds_sessions.find(mds);
1970 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1971}
1972
1973/**
1974 * Populate a map of strings with client-identifying metadata,
1975 * such as the hostname. Call this once at initialization.
1976 */
1977void Client::populate_metadata(const std::string &mount_root)
1978{
1979 // Hostname
1980 struct utsname u;
1981 int r = uname(&u);
1982 if (r >= 0) {
1983 metadata["hostname"] = u.nodename;
1984 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1985 } else {
1986 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1987 }
1988
1989 metadata["pid"] = stringify(getpid());
1990
1991 // Ceph entity id (the '0' in "client.0")
1992 metadata["entity_id"] = cct->_conf->name.get_id();
1993
1994 // Our mount position
1995 if (!mount_root.empty()) {
1996 metadata["root"] = mount_root;
1997 }
1998
1999 // Ceph version
2000 metadata["ceph_version"] = pretty_version_to_str();
2001 metadata["ceph_sha1"] = git_version_to_str();
2002
2003 // Apply any metadata from the user's configured overrides
2004 std::vector<std::string> tokens;
2005 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2006 for (const auto &i : tokens) {
2007 auto eqpos = i.find("=");
2008 // Throw out anything that isn't of the form "<str>=<str>"
2009 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2010 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2011 continue;
2012 }
2013 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2014 }
2015}
2016
2017/**
2018 * Optionally add or override client metadata fields.
2019 */
2020void Client::update_metadata(std::string const &k, std::string const &v)
2021{
11fdf7f2
TL
2022 std::lock_guard l(client_lock);
2023 ceph_assert(initialized);
7c673cae 2024
11fdf7f2
TL
2025 auto it = metadata.find(k);
2026 if (it != metadata.end()) {
7c673cae 2027 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2028 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2029 }
2030
2031 metadata[k] = v;
2032}
2033
2034MetaSession *Client::_open_mds_session(mds_rank_t mds)
2035{
11fdf7f2
TL
2036 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2037 auto addrs = mdsmap->get_addrs(mds);
2038 auto em = mds_sessions.emplace(std::piecewise_construct,
2039 std::forward_as_tuple(mds),
2040 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2041 ceph_assert(em.second); /* not already present */
2042 MetaSession *session = &em.first->second;
7c673cae
FG
2043
2044 // Maybe skip sending a request to open if this MDS daemon
2045 // has previously sent us a REJECT.
2046 if (rejected_by_mds.count(mds)) {
11fdf7f2
TL
2047 if (rejected_by_mds[mds] == session->addrs) {
2048 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
7c673cae
FG
2049 "because we were rejected" << dendl;
2050 return session;
2051 } else {
11fdf7f2 2052 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
7c673cae
FG
2053 "rejected us, trying with new inst" << dendl;
2054 rejected_by_mds.erase(mds);
2055 }
2056 }
2057
11fdf7f2
TL
2058 auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
2059 m->metadata = metadata;
2060 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2061 session->con->send_message2(std::move(m));
7c673cae
FG
2062 return session;
2063}
2064
2065void Client::_close_mds_session(MetaSession *s)
2066{
11fdf7f2 2067 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2068 s->state = MetaSession::STATE_CLOSING;
11fdf7f2 2069 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2070}
2071
2072void Client::_closed_mds_session(MetaSession *s)
2073{
11fdf7f2 2074 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae
FG
2075 s->state = MetaSession::STATE_CLOSED;
2076 s->con->mark_down();
2077 signal_context_list(s->waiting_for_open);
2078 mount_cond.Signal();
2079 remove_session_caps(s);
2080 kick_requests_closed(s);
2081 mds_sessions.erase(s->mds_num);
7c673cae
FG
2082}
2083
11fdf7f2 2084void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2085{
2086 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2087 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae
FG
2088
2089 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2090 if (!session) {
2091 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2092 return;
2093 }
2094
2095 switch (m->get_op()) {
2096 case CEPH_SESSION_OPEN:
11fdf7f2
TL
2097 {
2098 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2099 missing_features -= m->supported_features;
2100 if (!missing_features.empty()) {
2101 lderr(cct) << "mds." << from << " lacks required features '"
2102 << missing_features << "', closing session " << dendl;
2103 rejected_by_mds[session->mds_num] = session->addrs;
2104 _close_mds_session(session);
2105 _closed_mds_session(session);
2106 break;
2107 }
2108 session->mds_features = std::move(m->supported_features);
2109
2110 renew_caps(session);
2111 session->state = MetaSession::STATE_OPEN;
2112 if (unmounting)
2113 mount_cond.Signal();
2114 else
2115 connect_mds_targets(from);
2116 signal_context_list(session->waiting_for_open);
2117 break;
2118 }
7c673cae
FG
2119
2120 case CEPH_SESSION_CLOSE:
2121 _closed_mds_session(session);
2122 break;
2123
2124 case CEPH_SESSION_RENEWCAPS:
2125 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2126 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2127 session->cap_ttl =
2128 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298
TL
2129 if (was_stale)
2130 wake_up_session_caps(session, false);
7c673cae
FG
2131 }
2132 break;
2133
2134 case CEPH_SESSION_STALE:
28e407b8
AA
2135 // invalidate session caps/leases
2136 session->cap_gen++;
2137 session->cap_ttl = ceph_clock_now();
2138 session->cap_ttl -= 1;
7c673cae
FG
2139 renew_caps(session);
2140 break;
2141
2142 case CEPH_SESSION_RECALL_STATE:
2143 trim_caps(session, m->get_max_caps());
2144 break;
2145
2146 case CEPH_SESSION_FLUSHMSG:
a8e16298 2147 /* flush cap release */
11fdf7f2
TL
2148 if (auto& m = session->release; m) {
2149 session->con->send_message2(std::move(m));
a8e16298 2150 }
11fdf7f2 2151 session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2152 break;
2153
2154 case CEPH_SESSION_FORCE_RO:
2155 force_session_readonly(session);
2156 break;
2157
2158 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2159 {
2160 std::string_view error_str;
2161 auto it = m->metadata.find("error_string");
2162 if (it != m->metadata.end())
2163 error_str = it->second;
2164 else
2165 error_str = "unknown error";
2166 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2167
11fdf7f2
TL
2168 rejected_by_mds[session->mds_num] = session->addrs;
2169 _closed_mds_session(session);
2170 }
7c673cae
FG
2171 break;
2172
2173 default:
2174 ceph_abort();
2175 }
7c673cae
FG
2176}
2177
2178bool Client::_any_stale_sessions() const
2179{
11fdf7f2 2180 ceph_assert(client_lock.is_locked_by_me());
7c673cae 2181
11fdf7f2
TL
2182 for (const auto &p : mds_sessions) {
2183 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2184 return true;
2185 }
2186 }
2187
2188 return false;
2189}
2190
2191void Client::_kick_stale_sessions()
2192{
11fdf7f2 2193 ldout(cct, 1) << __func__ << dendl;
7c673cae 2194
11fdf7f2
TL
2195 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2196 MetaSession &s = it->second;
2197 ++it;
2198 if (s.state == MetaSession::STATE_STALE)
2199 _closed_mds_session(&s);
7c673cae
FG
2200 }
2201}
2202
2203void Client::send_request(MetaRequest *request, MetaSession *session,
2204 bool drop_cap_releases)
2205{
2206 // make the request
2207 mds_rank_t mds = session->mds_num;
11fdf7f2 2208 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
7c673cae 2209 << " for mds." << mds << dendl;
11fdf7f2 2210 auto r = build_client_request(request);
7c673cae
FG
2211 if (request->dentry()) {
2212 r->set_dentry_wanted();
2213 }
2214 if (request->got_unsafe) {
2215 r->set_replayed_op();
2216 if (request->target)
2217 r->head.ino = request->target->ino;
2218 } else {
2219 encode_cap_releases(request, mds);
2220 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2221 request->cap_releases.clear();
2222 else
2223 r->releases.swap(request->cap_releases);
2224 }
2225 r->set_mdsmap_epoch(mdsmap->get_epoch());
2226 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2227 objecter->with_osdmap([r](const OSDMap& o) {
2228 r->set_osdmap_epoch(o.get_epoch());
2229 });
2230 }
2231
2232 if (request->mds == -1) {
2233 request->sent_stamp = ceph_clock_now();
11fdf7f2 2234 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
7c673cae
FG
2235 }
2236 request->mds = mds;
2237
2238 Inode *in = request->inode();
11fdf7f2
TL
2239 if (in) {
2240 auto it = in->caps.find(mds);
2241 if (it != in->caps.end()) {
2242 request->sent_on_mseq = it->second.mseq;
2243 }
2244 }
7c673cae
FG
2245
2246 session->requests.push_back(&request->item);
2247
11fdf7f2
TL
2248 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2249 session->con->send_message2(std::move(r));
7c673cae
FG
2250}
2251
11fdf7f2 2252MClientRequest::ref Client::build_client_request(MetaRequest *request)
7c673cae 2253{
11fdf7f2 2254 auto req = MClientRequest::create(request->get_op());
7c673cae
FG
2255 req->set_tid(request->tid);
2256 req->set_stamp(request->op_stamp);
2257 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2258
2259 // if the filepath's haven't been set, set them!
2260 if (request->path.empty()) {
2261 Inode *in = request->inode();
2262 Dentry *de = request->dentry();
2263 if (in)
2264 in->make_nosnap_relative_path(request->path);
2265 else if (de) {
2266 if (de->inode)
2267 de->inode->make_nosnap_relative_path(request->path);
2268 else if (de->dir) {
2269 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2270 request->path.push_dentry(de->name);
2271 }
2272 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2273 << " No path, inode, or appropriately-endowed dentry given!"
2274 << dendl;
2275 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2276 << " No path, inode, or dentry given!"
2277 << dendl;
2278 }
2279 req->set_filepath(request->get_filepath());
2280 req->set_filepath2(request->get_filepath2());
2281 req->set_data(request->data);
2282 req->set_retry_attempt(request->retry_attempt++);
2283 req->head.num_fwd = request->num_fwd;
2284 const gid_t *_gids;
2285 int gid_count = request->perms.get_gids(&_gids);
2286 req->set_gid_list(gid_count, _gids);
2287 return req;
2288}
2289
2290
2291
11fdf7f2 2292void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2293{
2294 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2295 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2296 if (!session) {
7c673cae
FG
2297 return;
2298 }
2299 ceph_tid_t tid = fwd->get_tid();
2300
2301 if (mds_requests.count(tid) == 0) {
11fdf7f2 2302 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2303 return;
2304 }
2305
2306 MetaRequest *request = mds_requests[tid];
11fdf7f2 2307 ceph_assert(request);
7c673cae
FG
2308
2309 // reset retry counter
2310 request->retry_attempt = 0;
2311
2312 // request not forwarded, or dest mds has no session.
2313 // resend.
11fdf7f2 2314 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2315 << " fwd " << fwd->get_num_fwd()
2316 << " to mds." << fwd->get_dest_mds()
2317 << ", resending to " << fwd->get_dest_mds()
2318 << dendl;
2319
2320 request->mds = -1;
2321 request->item.remove_myself();
2322 request->num_fwd = fwd->get_num_fwd();
2323 request->resend_mds = fwd->get_dest_mds();
2324 request->caller_cond->Signal();
7c673cae
FG
2325}
2326
2327bool Client::is_dir_operation(MetaRequest *req)
2328{
2329 int op = req->get_op();
2330 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2331 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2332 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2333 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2334 return true;
2335 return false;
2336}
2337
11fdf7f2 2338void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2339{
2340 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2341 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2342 if (!session) {
7c673cae
FG
2343 return;
2344 }
2345
2346 ceph_tid_t tid = reply->get_tid();
2347 bool is_safe = reply->is_safe();
2348
2349 if (mds_requests.count(tid) == 0) {
11fdf7f2 2350 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2351 << " safe is:" << is_safe << dendl;
7c673cae
FG
2352 return;
2353 }
2354 MetaRequest *request = mds_requests.at(tid);
2355
11fdf7f2 2356 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2357 << " tid " << tid << dendl;
2358
2359 if (request->got_unsafe && !is_safe) {
2360 //duplicate response
2361 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2362 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2363 return;
2364 }
2365
2366 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2367 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2368 << " from mds." << request->mds << dendl;
2369 request->send_to_auth = true;
2370 request->resend_mds = choose_target_mds(request);
2371 Inode *in = request->inode();
11fdf7f2 2372 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2373 if (request->resend_mds >= 0 &&
2374 request->resend_mds == request->mds &&
2375 (in == NULL ||
11fdf7f2
TL
2376 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2377 request->sent_on_mseq == it->second.mseq)) {
2378 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae
FG
2379 } else {
2380 request->caller_cond->Signal();
7c673cae
FG
2381 return;
2382 }
7c673cae
FG
2383 }
2384
11fdf7f2 2385 ceph_assert(!request->reply);
7c673cae
FG
2386 request->reply = reply;
2387 insert_trace(request, session);
2388
2389 // Handle unsafe reply
2390 if (!is_safe) {
2391 request->got_unsafe = true;
2392 session->unsafe_requests.push_back(&request->unsafe_item);
2393 if (is_dir_operation(request)) {
2394 Inode *dir = request->inode();
11fdf7f2 2395 ceph_assert(dir);
7c673cae
FG
2396 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2397 }
2398 if (request->target) {
2399 InodeRef &in = request->target;
2400 in->unsafe_ops.push_back(&request->unsafe_target_item);
2401 }
2402 }
2403
2404 // Only signal the caller once (on the first reply):
2405 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2406 if (!is_safe || !request->got_unsafe) {
2407 Cond cond;
2408 request->dispatch_cond = &cond;
2409
2410 // wake up waiter
11fdf7f2 2411 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
7c673cae
FG
2412 request->caller_cond->Signal();
2413
2414 // wake for kick back
2415 while (request->dispatch_cond) {
11fdf7f2 2416 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
7c673cae
FG
2417 cond.Wait(client_lock);
2418 }
2419 }
2420
2421 if (is_safe) {
2422 // the filesystem change is committed to disk
2423 // we're done, clean up
2424 if (request->got_unsafe) {
2425 request->unsafe_item.remove_myself();
2426 request->unsafe_dir_item.remove_myself();
2427 request->unsafe_target_item.remove_myself();
2428 signal_cond_list(request->waitfor_safe);
2429 }
2430 request->item.remove_myself();
2431 unregister_request(request);
2432 }
2433 if (unmounting)
2434 mount_cond.Signal();
2435}
2436
2437void Client::_handle_full_flag(int64_t pool)
2438{
2439 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2440 << "on " << pool << dendl;
2441 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2442 // to do this rather than blocking, because otherwise when we fill up we
2443 // potentially lock caps forever on files with dirty pages, and we need
2444 // to be able to release those caps to the MDS so that it can delete files
2445 // and free up space.
2446 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2447
2448 // For all inodes with layouts in this pool and a pending flush write op
2449 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2450 // from ObjectCacher so that it doesn't re-issue the write in response to
2451 // the ENOSPC error.
2452 // Fortunately since we're cancelling everything in a given pool, we don't
2453 // need to know which ops belong to which ObjectSet, we can just blow all
2454 // the un-flushed cached data away and mark any dirty inodes' async_err
2455 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2456 // affecting this pool, and all the objectsets we're purging were also
2457 // in this pool.
2458 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2459 i != inode_map.end(); ++i)
2460 {
2461 Inode *inode = i->second;
2462 if (inode->oset.dirty_or_tx
2463 && (pool == -1 || inode->layout.pool_id == pool)) {
2464 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2465 << " has dirty objects, purging and setting ENOSPC" << dendl;
2466 objectcacher->purge_set(&inode->oset);
2467 inode->set_async_err(-ENOSPC);
2468 }
2469 }
2470
2471 if (cancelled_epoch != (epoch_t)-1) {
2472 set_cap_epoch_barrier(cancelled_epoch);
2473 }
2474}
2475
11fdf7f2 2476void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2477{
31f18b77
FG
2478 std::set<entity_addr_t> new_blacklists;
2479 objecter->consume_blacklist_events(&new_blacklists);
2480
11fdf7f2
TL
2481 const auto myaddrs = messenger->get_myaddrs();
2482 bool new_blacklist = false;
2483 bool prenautilus = objecter->with_osdmap(
2484 [&](const OSDMap& o) {
2485 return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
2486 });
2487 if (!blacklisted) {
2488 for (auto a : myaddrs.v) {
2489 // blacklist entries are always TYPE_ANY for nautilus+
2490 a.set_type(entity_addr_t::TYPE_ANY);
2491 if (new_blacklists.count(a)) {
2492 new_blacklist = true;
2493 break;
2494 }
2495 if (prenautilus) {
2496 // ...except pre-nautilus, they were TYPE_LEGACY
2497 a.set_type(entity_addr_t::TYPE_LEGACY);
2498 if (new_blacklists.count(a)) {
2499 new_blacklist = true;
2500 break;
2501 }
2502 }
2503 }
2504 }
2505 if (new_blacklist) {
31f18b77
FG
2506 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2507 return o.get_epoch();
2508 });
2509 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2510 blacklisted = true;
31f18b77 2511
11fdf7f2 2512 _abort_mds_sessions(-EBLACKLISTED);
31f18b77
FG
2513
2514 // Since we know all our OSD ops will fail, cancel them all preemtively,
2515 // so that on an unhealthy cluster we can umount promptly even if e.g.
2516 // some PGs were inaccessible.
2517 objecter->op_cancel_writes(-EBLACKLISTED);
2518
2519 } else if (blacklisted) {
2520 // Handle case where we were blacklisted but no longer are
11fdf7f2
TL
2521 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2522 return o.is_blacklisted(myaddrs);});
31f18b77
FG
2523 }
2524
f64942e4
AA
2525 // Always subscribe to next osdmap for blacklisted client
2526 // until this client is not blacklisted.
2527 if (blacklisted) {
2528 objecter->maybe_request_map();
2529 }
2530
7c673cae
FG
2531 if (objecter->osdmap_full_flag()) {
2532 _handle_full_flag(-1);
2533 } else {
2534 // Accumulate local list of full pools so that I can drop
2535 // the objecter lock before re-entering objecter in
2536 // cancel_writes
2537 std::vector<int64_t> full_pools;
2538
2539 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2540 for (const auto& kv : o.get_pools()) {
2541 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2542 full_pools.push_back(kv.first);
2543 }
2544 }
2545 });
2546
2547 for (auto p : full_pools)
2548 _handle_full_flag(p);
2549
2550 // Subscribe to subsequent maps to watch for the full flag going
2551 // away. For the global full flag objecter does this for us, but
2552 // it pays no attention to the per-pool full flag so in this branch
2553 // we do it ourselves.
2554 if (!full_pools.empty()) {
2555 objecter->maybe_request_map();
2556 }
2557 }
7c673cae
FG
2558}
2559
2560
2561// ------------------------
2562// incoming messages
2563
2564
11fdf7f2 2565bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2566{
11fdf7f2 2567 std::lock_guard l(client_lock);
7c673cae
FG
2568 if (!initialized) {
2569 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2570 return true;
2571 }
2572
2573 switch (m->get_type()) {
2574 // mounting and mds sessions
2575 case CEPH_MSG_MDS_MAP:
11fdf7f2 2576 handle_mds_map(MMDSMap::msgref_cast(m));
7c673cae
FG
2577 break;
2578 case CEPH_MSG_FS_MAP:
11fdf7f2 2579 handle_fs_map(MFSMap::msgref_cast(m));
7c673cae
FG
2580 break;
2581 case CEPH_MSG_FS_MAP_USER:
11fdf7f2 2582 handle_fs_map_user(MFSMapUser::msgref_cast(m));
7c673cae
FG
2583 break;
2584 case CEPH_MSG_CLIENT_SESSION:
11fdf7f2 2585 handle_client_session(MClientSession::msgref_cast(m));
7c673cae
FG
2586 break;
2587
2588 case CEPH_MSG_OSD_MAP:
11fdf7f2 2589 handle_osd_map(MOSDMap::msgref_cast(m));
7c673cae
FG
2590 break;
2591
2592 // requests
2593 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
11fdf7f2 2594 handle_client_request_forward(MClientRequestForward::msgref_cast(m));
7c673cae
FG
2595 break;
2596 case CEPH_MSG_CLIENT_REPLY:
11fdf7f2
TL
2597 handle_client_reply(MClientReply::msgref_cast(m));
2598 break;
2599
2600 // reclaim reply
2601 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2602 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
7c673cae
FG
2603 break;
2604
2605 case CEPH_MSG_CLIENT_SNAP:
11fdf7f2 2606 handle_snap(MClientSnap::msgref_cast(m));
7c673cae
FG
2607 break;
2608 case CEPH_MSG_CLIENT_CAPS:
11fdf7f2 2609 handle_caps(MClientCaps::msgref_cast(m));
7c673cae
FG
2610 break;
2611 case CEPH_MSG_CLIENT_LEASE:
11fdf7f2 2612 handle_lease(MClientLease::msgref_cast(m));
7c673cae
FG
2613 break;
2614 case MSG_COMMAND_REPLY:
2615 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
11fdf7f2 2616 handle_command_reply(MCommandReply::msgref_cast(m));
7c673cae
FG
2617 } else {
2618 return false;
2619 }
2620 break;
2621 case CEPH_MSG_CLIENT_QUOTA:
11fdf7f2 2622 handle_quota(MClientQuota::msgref_cast(m));
7c673cae
FG
2623 break;
2624
2625 default:
2626 return false;
2627 }
2628
2629 // unmounting?
2630 if (unmounting) {
2631 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2632 << "+" << inode_map.size() << dendl;
2633 long unsigned size = lru.lru_get_size() + inode_map.size();
2634 trim_cache();
2635 if (size < lru.lru_get_size() + inode_map.size()) {
2636 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2637 mount_cond.Signal();
2638 } else {
2639 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2640 << "+" << inode_map.size() << dendl;
2641 }
2642 }
2643
2644 return true;
2645}
2646
11fdf7f2 2647void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2648{
2649 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2650
2651 signal_cond_list(waiting_for_fsmap);
2652
2653 monclient->sub_got("fsmap", fsmap->get_epoch());
2654}
2655
11fdf7f2 2656void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2657{
2658 fsmap_user.reset(new FSMapUser);
2659 *fsmap_user = m->get_fsmap();
7c673cae
FG
2660
2661 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2662 signal_cond_list(waiting_for_fsmap);
2663}
2664
11fdf7f2 2665void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
7c673cae 2666{
f64942e4 2667 mds_gid_t old_inc, new_inc;
7c673cae 2668 if (m->get_epoch() <= mdsmap->get_epoch()) {
11fdf7f2 2669 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
7c673cae
FG
2670 << " is identical to or older than our "
2671 << mdsmap->get_epoch() << dendl;
7c673cae 2672 return;
f64942e4 2673 }
7c673cae 2674
11fdf7f2 2675 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
7c673cae
FG
2676
2677 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2678 oldmap.swap(mdsmap);
2679
2680 mdsmap->decode(m->get_encoded());
2681
2682 // Cancel any commands for missing or laggy GIDs
2683 std::list<ceph_tid_t> cancel_ops;
2684 auto &commands = command_table.get_commands();
2685 for (const auto &i : commands) {
2686 auto &op = i.second;
2687 const mds_gid_t op_mds_gid = op.mds_gid;
2688 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2689 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2690 cancel_ops.push_back(i.first);
2691 if (op.outs) {
2692 std::ostringstream ss;
2693 ss << "MDS " << op_mds_gid << " went away";
2694 *(op.outs) = ss.str();
2695 }
2696 op.con->mark_down();
2697 if (op.on_finish) {
2698 op.on_finish->complete(-ETIMEDOUT);
2699 }
2700 }
2701 }
2702
2703 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2704 i != cancel_ops.end(); ++i) {
2705 command_table.erase(*i);
2706 }
2707
2708 // reset session
11fdf7f2 2709 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 2710 mds_rank_t mds = p->first;
11fdf7f2 2711 MetaSession *session = &p->second;
7c673cae
FG
2712 ++p;
2713
2714 int oldstate = oldmap->get_state(mds);
2715 int newstate = mdsmap->get_state(mds);
2716 if (!mdsmap->is_up(mds)) {
2717 session->con->mark_down();
11fdf7f2 2718 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f64942e4
AA
2719 old_inc = oldmap->get_incarnation(mds);
2720 new_inc = mdsmap->get_incarnation(mds);
2721 if (old_inc != new_inc) {
2722 ldout(cct, 1) << "mds incarnation changed from "
2723 << old_inc << " to " << new_inc << dendl;
2724 oldstate = MDSMap::STATE_NULL;
2725 }
7c673cae 2726 session->con->mark_down();
11fdf7f2 2727 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
2728 // When new MDS starts to take over, notify kernel to trim unused entries
2729 // in its dcache/icache. Hopefully, the kernel will release some unused
2730 // inodes before the new MDS enters reconnect state.
2731 trim_cache_for_reconnect(session);
2732 } else if (oldstate == newstate)
2733 continue; // no change
2734
2735 session->mds_state = newstate;
f64942e4
AA
2736 if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
2737 // missed reconnect close the session so that it can be reopened
2738 _closed_mds_session(session);
2739 continue;
2740 }
7c673cae 2741 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 2742 session->con = messenger->connect_to_mds(session->addrs);
7c673cae
FG
2743 send_reconnect(session);
2744 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2745 if (oldstate < MDSMap::STATE_ACTIVE) {
2746 // kick new requests
2747 kick_requests(session);
2748 kick_flushing_caps(session);
2749 signal_context_list(session->waiting_for_open);
a8e16298 2750 wake_up_session_caps(session, true);
7c673cae
FG
2751 }
2752 connect_mds_targets(mds);
2753 } else if (newstate == MDSMap::STATE_NULL &&
2754 mds >= mdsmap->get_max_mds()) {
2755 _closed_mds_session(session);
2756 }
2757 }
2758
2759 // kick any waiting threads
2760 signal_cond_list(waiting_for_mdsmap);
2761
7c673cae
FG
2762 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2763}
2764
2765void Client::send_reconnect(MetaSession *session)
2766{
2767 mds_rank_t mds = session->mds_num;
11fdf7f2 2768 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
7c673cae
FG
2769
2770 // trim unused caps to reduce MDS's cache rejoin time
2771 trim_cache_for_reconnect(session);
2772
2773 session->readonly = false;
2774
11fdf7f2 2775 session->release.reset();
7c673cae
FG
2776
2777 // reset my cap seq number
2778 session->seq = 0;
2779 //connect to the mds' offload targets
2780 connect_mds_targets(mds);
2781 //make sure unsafe requests get saved
2782 resend_unsafe_requests(session);
2783
11fdf7f2
TL
2784 early_kick_flushing_caps(session);
2785
2786 auto m = MClientReconnect::create();
2787 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
7c673cae
FG
2788
2789 // i have an open session.
2790 ceph::unordered_set<inodeno_t> did_snaprealm;
2791 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2792 p != inode_map.end();
2793 ++p) {
2794 Inode *in = p->second;
11fdf7f2
TL
2795 auto it = in->caps.find(mds);
2796 if (it != in->caps.end()) {
2797 if (allow_multi &&
2798 m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
2799 m->mark_more();
2800 session->con->send_message2(std::move(m));
2801
2802 m = MClientReconnect::create();
2803 }
2804
2805 Cap &cap = it->second;
7c673cae 2806 ldout(cct, 10) << " caps on " << p->first
11fdf7f2 2807 << " " << ccap_string(cap.issued)
7c673cae
FG
2808 << " wants " << ccap_string(in->caps_wanted())
2809 << dendl;
2810 filepath path;
2811 in->make_long_path(path);
2812 ldout(cct, 10) << " path " << path << dendl;
2813
2814 bufferlist flockbl;
2815 _encode_filelocks(in, flockbl);
2816
11fdf7f2
TL
2817 cap.seq = 0; // reset seq.
2818 cap.issue_seq = 0; // reset seq.
2819 cap.mseq = 0; // reset seq.
2820 // cap gen should catch up with session cap_gen
2821 if (cap.gen < session->cap_gen) {
2822 cap.gen = session->cap_gen;
2823 cap.issued = cap.implemented = CEPH_CAP_PIN;
2824 } else {
2825 cap.issued = cap.implemented;
2826 }
7c673cae
FG
2827 snapid_t snap_follows = 0;
2828 if (!in->cap_snaps.empty())
2829 snap_follows = in->cap_snaps.begin()->first;
2830
2831 m->add_cap(p->first.ino,
11fdf7f2 2832 cap.cap_id,
7c673cae
FG
2833 path.get_ino(), path.get_path(), // ino
2834 in->caps_wanted(), // wanted
11fdf7f2 2835 cap.issued, // issued
7c673cae
FG
2836 in->snaprealm->ino,
2837 snap_follows,
2838 flockbl);
2839
2840 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2841 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2842 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2843 did_snaprealm.insert(in->snaprealm->ino);
2844 }
2845 }
2846 }
2847
11fdf7f2
TL
2848 if (!allow_multi)
2849 m->set_encoding_version(0); // use connection features to choose encoding
2850 session->con->send_message2(std::move(m));
7c673cae
FG
2851
2852 mount_cond.Signal();
11fdf7f2
TL
2853
2854 if (session->reclaim_state == MetaSession::RECLAIMING)
2855 signal_cond_list(waiting_for_reclaim);
7c673cae
FG
2856}
2857
2858
2859void Client::kick_requests(MetaSession *session)
2860{
11fdf7f2 2861 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2862 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2863 p != mds_requests.end();
2864 ++p) {
31f18b77
FG
2865 MetaRequest *req = p->second;
2866 if (req->got_unsafe)
2867 continue;
2868 if (req->aborted()) {
2869 if (req->caller_cond) {
2870 req->kick = true;
2871 req->caller_cond->Signal();
2872 }
7c673cae 2873 continue;
31f18b77
FG
2874 }
2875 if (req->retry_attempt > 0)
7c673cae 2876 continue; // new requests only
31f18b77 2877 if (req->mds == session->mds_num) {
7c673cae
FG
2878 send_request(p->second, session);
2879 }
2880 }
2881}
2882
2883void Client::resend_unsafe_requests(MetaSession *session)
2884{
2885 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2886 !iter.end();
2887 ++iter)
2888 send_request(*iter, session);
2889
2890 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2891 // process completed requests in clientreplay stage.
2892 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2893 p != mds_requests.end();
2894 ++p) {
2895 MetaRequest *req = p->second;
2896 if (req->got_unsafe)
2897 continue;
31f18b77
FG
2898 if (req->aborted())
2899 continue;
7c673cae
FG
2900 if (req->retry_attempt == 0)
2901 continue; // old requests only
2902 if (req->mds == session->mds_num)
2903 send_request(req, session, true);
2904 }
2905}
2906
// Block until every outstanding unsafe request has been committed by its
// MDS. Waiting on the newest unsafe request per session suffices, since
// each MDS commits its requests in order.
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (const auto &p : mds_sessions) {
    const MetaSession &s = p.second;
    if (!s.unsafe_requests.empty()) {
      MetaRequest *req = s.unsafe_requests.back();
      req->get();  // hold a ref so the request outlives its list linkage
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    // Still on an unsafe list => not yet committed; wait for the safe reply.
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2928
// The session to this MDS is permanently closed: wake any blocked callers
// and tear down unsafe requests that this MDS can now never commit.
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() below may erase this entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      req->item.remove_myself();  // detach from the session's request list
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        req->unsafe_dir_item.remove_myself();
        req->unsafe_target_item.remove_myself();
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  // Nothing may remain linked to the closed session.
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
2955
2956
2957
2958
2959/************
2960 * leases
2961 */
2962
// Account for an unsolicited (MDS-initiated) message on this session:
// every push bumps the session sequence number we echo back to the MDS.
void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    // We are mid-close: re-request the close so the MDS sees the new seq.
    s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}
2971
// Handle a lease revocation from an MDS: drop our cached dentry lease (if
// we still hold it) and always acknowledge with a RELEASE so the MDS can
// clean up its state.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // Revoke is the only lease action the MDS sends to clients.
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // Message from a connection we no longer track; drop it.
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;  // nothing cached; just ack
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    // Dentry lease: invalidate it if we still cache that dentry.
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark the lease as no longer held
  }

 revoke:
  // Unconditionally acknowledge the revocation.
  {
    auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3012
// Drop n references on the inode; on the last reference, fully tear it
// down: release caps, purge the object cacher set, and remove it from the
// client-wide inode map.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);  // dirty buffered data here would be lost
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // Dropping the root: clear the root bookkeeping as well.
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3038
// Destroy an (empty) Dir object and release the pins it held on its
// parent inode and that inode's dentry.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in); // unpin inode
}
3053
/**
 * Link inode `in` under `dir` with the given name, creating a Dentry if
 * `dn` is NULL or filling in the pre-created one otherwise.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // Pre-created dentry must not already point at an inode.
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // The old parent dir's contents changed; drop its completeness.
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3094
// Detach a dentry from its inode and, unless keepdentry, remove the
// dentry itself (closing the containing Dir if it becomes empty and
// keepdir is false).
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in(dn->inode);  // keep the inode alive for logging below
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;  // dentry survives but its lease is invalidated
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3124
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pinned so the inode outlives the async flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // Completion is expected to run with client_lock held.
    ceph_assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // Record the error so a later fsync/close can report it.
      inode->set_async_err(r);
    }
  }
};
3146
3147
3148/****
3149 * caps
3150 */
3151
3152void Client::get_cap_ref(Inode *in, int cap)
3153{
3154 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3155 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3156 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3157 in->get();
3158 }
3159 if ((cap & CEPH_CAP_FILE_CACHE) &&
3160 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3161 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3162 in->get();
3163 }
3164 in->get_cap_ref(cap);
3165}
3166
// Drop a reference on the given cap bits. When the last reference on a
// bit goes away, finish any pending cap snap, wake waiters, re-check caps
// with the MDS, and release the inode pins taken in get_cap_ref().
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);  // bits whose last ref was just dropped
  if (last) {
    int put_nref = 0;
    int drop = last & ~in->caps_issued();  // bits the MDS has since revoked
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        // Last writer is gone: the newest cap snap can now be finalized.
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // Buffered data is no longer dirty anywhere.
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);  // release the pins taken on first refs
  }
}
3200
// Acquire cap references for an I/O: block until we hold `need` (and as
// much of `want` as is not being revoked). On success *phave is set to the
// held bits and a cap ref is taken; caller must put_cap_ref() later.
// Returns 0, or a negative error (-EBADF if the file is no longer open in
// a mode that wants these caps, -EROFS on a readonly session, or an error
// from pool-permission checking / cap renewal).
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // No open file handle wants these caps any more.
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // Writer: ask the MDS to grow max_size ahead of the write endpoint.
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        // Cannot write past max_size until the MDS extends it.
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        // Any cap snap with dirty buffered data must be flushed first.
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // Only succeed if none of the wanted bits are mid-revocation.
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // Our caps were dropped (e.g. session reset); renew before waiting.
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3293
3294int Client::get_caps_used(Inode *in)
3295{
3296 unsigned used = in->caps_used();
3297 if (!(used & CEPH_CAP_FILE_CACHE) &&
3298 !objectcacher->set_is_empty(&in->oset))
3299 used |= CEPH_CAP_FILE_CACHE;
3300 return used;
3301}
3302
3303void Client::cap_delay_requeue(Inode *in)
3304{
11fdf7f2 3305 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3306 in->hold_caps_until = ceph_clock_now();
3307 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3308 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3309}
3310
// Send a cap update for one cap to its MDS: report what we use/want/flush,
// drop everything outside `retain`, and carry the inode's current metadata
// (size, times, xattrs, etc.) so the MDS can journal it.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits that are being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << (sync ? " sync " : " async ")
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = MClientCaps::create(op,
                               in->ino,
                               0,
                               cap->cap_id, cap->seq,
                               cap->implemented,
                               want,
                               flush,
                               cap->mseq,
                               cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  // Current inode metadata for the MDS to record.
  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= MClientCaps::FLAG_SYNC;
  if (!in->cap_snaps.empty())
    m->flags |= MClientCaps::FLAG_PENDING_CAPSNAP;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth MDS manages max_size.
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3422
31f18b77
FG
3423static bool is_max_size_approaching(Inode *in)
3424{
3425 /* mds will adjust max size according to the reported size */
3426 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3427 return false;
3428 if (in->size >= in->max_size)
3429 return true;
3430 /* half of previous max_size increment has been used */
3431 if (in->max_size > in->reported_size &&
3432 (in->size << 1) >= in->max_size + in->reported_size)
3433 return true;
3434 return false;
3435}
7c673cae 3436
11fdf7f2
TL
3437static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3438{
3439 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3440 return used;
3441 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3442 return used;
3443
3444 if (issued & CEPH_CAP_FILE_LAZYIO) {
3445 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3446 used &= ~CEPH_CAP_FILE_CACHE;
3447 used |= CEPH_CAP_FILE_LAZYIO;
3448 }
3449 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3450 used &= ~CEPH_CAP_FILE_BUFFER;
3451 used |= CEPH_CAP_FILE_LAZYIO;
3452 }
3453 } else {
3454 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3455 used &= ~CEPH_CAP_FILE_CACHE;
3456 used |= CEPH_CAP_FILE_LAZYIO;
3457 }
3458 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3459 used &= ~CEPH_CAP_FILE_BUFFER;
3460 used |= CEPH_CAP_FILE_LAZYIO;
3461 }
3462 }
3463 return used;
3464}
3465
7c673cae
FG
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Compute which caps are worth keeping.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // Try to satisfy a pending Fc/LAZYIO revocation by dropping clean
  // cached data (only if no buffered writes are outstanding).
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  // Walk every cap (one per MDS) and decide whether to send an update.
  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    // Usage covered by the auth cap doesn't count against replica caps.
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap.issued)
             << " implemented " << ccap_string(cap.implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // Need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // On unmount, release idle caps immediately.
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      // Dirty metadata flushes only go to the auth MDS.
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3614
3615
// Capture the inode's dirty state for the snapshot context that is being
// superseded, so it can be flushed to the MDS as a FLUSHSNAP later.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // A writer is still filling the most recent cap snap; don't stack another.
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // Something is dirty or being written: snapshot the current metadata.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // Finalized later, when the last writer drops its WR ref.
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3657
// Finalize a cap snap with the inode's current sizes/times and flush it,
// unless buffered data still has to be written back first.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // Dirty buffers remain; the flush happens via _flushed_cap_snap().
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
             << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3688
3689void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3690{
11fdf7f2 3691 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
7c673cae
FG
3692 in->cap_snaps.at(seq).dirty_data = 0;
3693 flush_snaps(in);
3694}
3695
// Send FLUSHSNAP messages for this inode's cap snaps to the auth MDS.
// With all_again, re-send snaps that already have a flush tid (used when
// the MDS is replaying); otherwise each snap is flushed at most once.
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
        continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
             << " follows " << p.first
             << " size " << capsnap.size
             << " mtime " << capsnap.mtime
             << " dirty_data=" << capsnap.dirty_data
             << " writing=" << capsnap.writing
             << " on " << *in << dendl;
    // Not finalized yet (writer active or buffers dirty): skip for now.
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // First flush of this snap: register it with the session so the
      // oldest-flush-tid bookkeeping covers it.
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
        session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
                                 cap_epoch_barrier);
    m->caller_uid = capsnap.cap_dirtier_uid;
    m->caller_gid = capsnap.cap_dirtier_gid;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    // Metadata as captured at snapshot time.
    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    ceph_assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message2(std::move(m));
  }
}
3770
3771
3772
3773void Client::wait_on_list(list<Cond*>& ls)
3774{
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779}
3780
3781void Client::signal_cond_list(list<Cond*>& ls)
3782{
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785}
3786
// Queue a context on the given list and block (dropping client_lock while
// waiting) until someone completes it via signal_context_list().
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value unused here
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3796
3797void Client::signal_context_list(list<Context*>& ls)
3798{
3799 while (!ls.empty()) {
3800 ls.front()->complete(0);
3801 ls.pop_front();
3802 }
3803}
3804
// Wake all waiters on every inode that holds a cap from this session.
// On reconnect, also reset max_size negotiation; otherwise, downgrade
// caps the MDS failed to re-issue after the session went stale.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      // Force max_size to be re-requested from the new incarnation.
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3824
3825
3826// flush dirty data (from objectcache)
3827
// Completion that invokes the (FUSE/libcephfs) cache-invalidate callback
// for a byte range of an inode, outside of client_lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; inode may die later
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3847
// Deliver a cache invalidation to the registered callback. Runs without
// client_lock; skipped entirely once unmount has begun.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3855
// Queue an asynchronous invalidate of [off, off+len) for this inode, if an
// invalidate callback has been registered.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3862
// Drop all cached data for the inode (object cacher + kernel/FUSE cache
// via the async invalidate callback).
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set() only drops clean data; anything left is dirty/tx.
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // off=0, len=0 means "invalidate everything" to the callback.
  _schedule_invalidate_callback(in, 0, 0);
}
3876
// Range variant: drop cached data for [off, off+len) of *in, discarding
// any writeback in flight for that range, then notify the upper cache.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // Map the byte range onto object extents per the file layout.
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3890
3891bool Client::_release(Inode *in)
3892{
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899}
3900
3901bool Client::_flush(Inode *in, Context *onfinish)
3902{
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921}
3922
// Synchronously flush the byte range [offset, offset+size) of *in.
// Called with client_lock held; drops and re-takes it while waiting.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    // Must release client_lock: completion is delivered by the objecter
    // dispatch path, which itself takes the lock.
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3941
// ObjectCacher flush-completion hook: credit the inode whose object
// set finished flushing.  Runs on the dispatch path with client_lock
// already held, so it must not take the lock itself.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  // std::lock_guard l(client_lock);
  ceph_assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3950
// Buffered data for *in has reached the OSDs: drop the cap references
// that were pinning CACHE|BUFFER while the flush was in flight.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3957
3958
3959
3960// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3961void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3962{
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976}
3977
// Install or refresh the cap for *in held via mds_session.
//
//  issued/wanted  - cap bits granted / desired
//  seq/mseq       - cap sequence and migration sequence from the MDS
//  realm          - snap realm the inode belongs to (-1 = unchanged)
//  flags          - CEPH_CAP_FLAG_* (AUTH marks the auth MDS' cap)
//  cap_perms      - credentials under which the cap was obtained
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // First cap on this inode: attach it to its snap realm.
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // Auth MDS may move the inode to a different realm.
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  // Emplace finds the existing cap or constructs a fresh one in place.
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // Pre-existing cap: a stale generation means the session was lost,
    // so only the PIN bit can be trusted.
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // Adopt this cap as the auth cap if it is newer (by mseq) than the
    // current one, migrating any in-flight flushes to the new session.
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // A newer migration sequence resets `wanted`; otherwise accumulate.
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // Wake waiters if any new bits were granted.
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4072
// Remove a single cap from its inode and session.  When queue_release
// is set, a cap-release record is queued so the MDS learns about it.
// NOTE: *cap is destroyed here (erased from in.caps); callers must not
// touch it afterwards.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // Losing the auth cap: any flush bookkeeping tied to it goes too.
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // Erasing from the map destroys the Cap object itself.
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    // Last cap gone: detach the inode from its snap realm.
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4108
4109void Client::remove_all_caps(Inode *in)
4110{
4111 while (!in->caps.empty())
11fdf7f2 4112 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4113}
4114
// Tear down every cap held through session *s (session closed or
// evicted).  Dirty/flushing state is abandoned — there is no MDS to
// flush it to — and waiters are woken so they can observe the loss.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    // Hold a ref so the inode survives remove_cap() below.
    InodeRef in(&cap->inode);
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    // Record that caps were dropped so a later reconnect can tell the
    // MDS this client may have lost state.
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    if (cap_snaps) {
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // Drop the ref the dirty state was holding on the inode.
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  // Wake wait_sync_caps() waiters blocked on this session's flushes.
  sync_cond.Signal();
}
4150
91327a77 4151int Client::_do_remount(bool retry_on_error)
b32b8144 4152{
11fdf7f2 4153 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4154
b32b8144
FG
4155 errno = 0;
4156 int r = remount_cb(callback_handle);
91327a77
AA
4157 if (r == 0) {
4158 retries_on_invalidate = 0;
4159 } else {
b32b8144
FG
4160 int e = errno;
4161 client_t whoami = get_nodeid();
4162 if (r == -1) {
4163 lderr(cct) <<
4164 "failed to remount (to trim kernel dentries): "
4165 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4166 } else {
4167 lderr(cct) <<
4168 "failed to remount (to trim kernel dentries): "
4169 "return code = " << r << dendl;
4170 }
91327a77 4171 bool should_abort =
11fdf7f2
TL
4172 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4173 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4174 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4175 if (should_abort && !unmounting) {
4176 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4177 ceph_abort();
4178 }
4179 }
4180 return r;
4181}
4182
7c673cae
FG
// Finisher context that triggers a remount (with retries enabled) on
// behalf of _invalidate_kernel_dcache().
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};
4193
4194void Client::_invalidate_kernel_dcache()
4195{
4196 if (unmounting)
4197 return;
94b18763
FG
4198 if (can_invalidate_dentries) {
4199 if (dentry_invalidate_cb && root->dir) {
4200 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4201 p != root->dir->dentries.end();
4202 ++p) {
4203 if (p->second->inode)
4204 _schedule_invalidate_dentry_callback(p->second, false);
4205 }
7c673cae
FG
4206 }
4207 } else if (remount_cb) {
4208 // Hacky:
4209 // when remounting a file system, linux kernel trims all unused dentries in the fs
4210 remount_finisher.queue(new C_Client_Remount(this));
4211 }
4212}
4213
91327a77
AA
// Drop expireable negative (NULL-inode) dentries of a directory whose
// children are *all* negative, closing the dir if it empties; recurse
// into an open snapdir if present.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // Only act when every dentry in the dir is negative.
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;   // advance before unlink() may erase the entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4238
// Trim the session's cap count down toward `max` (MDS-requested).
// Non-auth duplicate caps are dropped outright; otherwise we try to
// expire the inode's dentries so its cap can be released.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // Pin the inode while we possibly remove its cap below.
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;   // all of this inode's dentries expireable?
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
	Dentry *dn = *q;
	++q;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  // Deferred dentry trimming, now that cap traversal is done.
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  // Still over budget: fall back to kernel dcache invalidation.
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4306
4307void Client::force_session_readonly(MetaSession *s)
4308{
4309 s->readonly = true;
4310 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4311 auto &in = (*p)->inode;
4312 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4313 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4314 }
4315}
4316
7c673cae
FG
// Transition *in's dirty caps into "flushing" state under a fresh
// flush tid.  Returns the cap bits being flushed and stores the tid in
// *ptid for the caller's send_cap/wait bookkeeping.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);   // caller must only flush when something is dirty

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // Dirty bits move into flushing; the inode is clean again.
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4344
4345void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4346{
4347 for (auto &p : in->cap_snaps) {
4348 CapSnap &capsnap = p.second;
4349 if (capsnap.flush_tid > 0) {
4350 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4351 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4352 }
4353 }
4354 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4355 it != in->flushing_cap_tids.end();
4356 ++it) {
4357 old_s->flushing_caps_tids.erase(it->first);
4358 new_s->flushing_caps_tids.insert(it->first);
4359 }
4360 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4361}
4362
4363/*
4364 * Flush all caps back to the MDS. Because the callers generally wait on the
4365 * result of this function (syncfs and umount cases), we set
4366 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4367 */
// Flush all dirty caps (delayed list first, then dirty list) back to
// the MDS.  The final check_caps call per list is made synchronous so
// callers (syncfs/umount) can wait on the result.
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    // Last delayed entry, and nothing dirty left: make it synchronous.
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4395
// (Re)send a cap-flush message for every outstanding flush tid of *in
// through its auth session.  With sync=true the last message asks the
// MDS to flush its journal.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << __func__ << " " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);   // only the auth session may flush

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4417
// Block until every cap flush of *in with tid <= want has been acked
// by the MDS (handle_cap_flush_ack signals waitfor_caps).
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    // Oldest outstanding tid is already newer than what we need.
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4431
// Block until all sessions have acked every cap flush with tid <=
// want.  Re-scans all sessions after each wakeup (hence the goto),
// since any ack may retire the oldest tid.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want  << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // sync_cond is signalled on flush acks / session teardown.
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
4450
// After session reconnect: resend flush messages for every inode that
// still has flushing caps or cap snaps on this session, skipping ones
// already resent by early_kick_flushing_caps().
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
4469
// Before sending the reconnect message: resend cap flushes whose
// flushing bits were (partially) revoked, so the MDS processes them
// before granting those caps to other clients.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;   // nothing revoked; the normal kick will handle it

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4504
7c673cae
FG
4505void SnapRealm::build_snap_context()
4506{
4507 set<snapid_t> snaps;
4508 snapid_t max_seq = seq;
4509
4510 // start with prior_parents?
4511 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4512 snaps.insert(prior_parent_snaps[i]);
4513
4514 // current parent's snaps
4515 if (pparent) {
4516 const SnapContext& psnapc = pparent->get_snap_context();
4517 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4518 if (psnapc.snaps[i] >= parent_since)
4519 snaps.insert(psnapc.snaps[i]);
4520 if (psnapc.seq > max_seq)
4521 max_seq = psnapc.seq;
4522 }
4523
4524 // my snaps
4525 for (unsigned i=0; i<my_snaps.size(); i++)
4526 snaps.insert(my_snaps[i]);
4527
4528 // ok!
4529 cached_snap_context.seq = max_seq;
4530 cached_snap_context.snaps.resize(0);
4531 cached_snap_context.snaps.reserve(snaps.size());
4532 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4533 cached_snap_context.snaps.push_back(*p);
4534}
4535
4536void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4537{
4538 list<SnapRealm*> q;
4539 q.push_back(realm);
4540
4541 while (!q.empty()) {
4542 realm = q.front();
4543 q.pop_front();
4544
11fdf7f2 4545 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4546 realm->invalidate_cache();
4547
4548 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4549 p != realm->pchildren.end();
4550 ++p)
4551 q.push_back(*p);
4552 }
4553}
4554
4555SnapRealm *Client::get_snap_realm(inodeno_t r)
4556{
4557 SnapRealm *realm = snap_realms[r];
4558 if (!realm)
4559 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4560 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4561 realm->nref++;
4562 return realm;
4563}
4564
4565SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4566{
4567 if (snap_realms.count(r) == 0) {
11fdf7f2 4568 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4569 return NULL;
4570 }
4571 SnapRealm *realm = snap_realms[r];
11fdf7f2 4572 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4573 realm->nref++;
4574 return realm;
4575}
4576
// Drop one reference on `realm`; on the last reference, unlink it from
// its parent (recursively releasing the parent ref) and delete it.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4590
// Re-parent `realm` under ino `parent` if it changed, fixing both the
// parent pointer and the pchildren back-links (with refcounting).
// Returns true when the parent actually changed.
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
	     << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
4607
4608static bool has_new_snaps(const SnapContext& old_snapc,
4609 const SnapContext& new_snapc)
4610{
4611 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4612}
4613
4614
11fdf7f2 4615void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4616{
4617 SnapRealm *first_realm = NULL;
11fdf7f2 4618 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4619
4620 map<SnapRealm*, SnapContext> dirty_realms;
4621
11fdf7f2 4622 auto p = bl.cbegin();
7c673cae
FG
4623 while (!p.end()) {
4624 SnapRealmInfo info;
11fdf7f2 4625 decode(info, p);
7c673cae
FG
4626 SnapRealm *realm = get_snap_realm(info.ino());
4627
4628 bool invalidate = false;
4629
4630 if (info.seq() > realm->seq) {
11fdf7f2 4631 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4632 << dendl;
4633
4634 if (flush) {
4635 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4636 // flush me + children
4637 list<SnapRealm*> q;
4638 q.push_back(realm);
4639 while (!q.empty()) {
4640 SnapRealm *realm = q.front();
4641 q.pop_front();
4642
4643 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4644 p != realm->pchildren.end();
4645 ++p)
4646 q.push_back(*p);
4647
4648 if (dirty_realms.count(realm) == 0) {
4649 realm->nref++;
4650 dirty_realms[realm] = realm->get_snap_context();
4651 }
4652 }
4653 }
4654
4655 // update
4656 realm->seq = info.seq();
4657 realm->created = info.created();
4658 realm->parent_since = info.parent_since();
4659 realm->prior_parent_snaps = info.prior_parent_snaps;
4660 realm->my_snaps = info.my_snaps;
4661 invalidate = true;
4662 }
4663
4664 // _always_ verify parent
4665 if (adjust_realm_parent(realm, info.parent()))
4666 invalidate = true;
4667
4668 if (invalidate) {
4669 invalidate_snaprealm_and_children(realm);
11fdf7f2 4670 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4671 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4672 } else {
11fdf7f2 4673 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4674 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4675 }
4676
4677 if (!first_realm)
4678 first_realm = realm;
4679 else
4680 put_snap_realm(realm);
4681 }
4682
4683 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4684 q != dirty_realms.end();
4685 ++q) {
4686 SnapRealm *realm = q->first;
4687 // if there are new snaps ?
4688 if (has_new_snaps(q->second, realm->get_snap_context())) {
4689 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4690 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4691 while (!r.end()) {
4692 Inode *in = *r;
4693 ++r;
4694 queue_cap_snap(in, q->second);
4695 }
4696 } else {
4697 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4698 }
4699 put_snap_realm(realm);
4700 }
4701
4702 if (realm_ret)
4703 *realm_ret = first_realm;
4704 else
4705 put_snap_realm(first_realm);
4706}
4707
// Handle an MClientSnap message.  For SPLIT ops, move the listed
// inodes (and child realms) into the newly created realm; always apply
// the attached snap trace.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;   // stale message from an unknown session
  }

  got_mds_push(session);

  // Inodes to re-home into the split realm, with their old snap
  // contexts (needed to decide whether to queue cap snaps).
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// Never move an inode backwards into an older realm.
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // DESTROY must not flush: the snaps are going away, not appearing.
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // Attach the moved inodes to the new realm and queue cap snaps
    // where the move exposed new snapshots.
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4778
11fdf7f2 4779void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4780{
4781 mds_rank_t mds = mds_rank_t(m->get_source().num());
4782 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4783 if (!session) {
7c673cae
FG
4784 return;
4785 }
4786
4787 got_mds_push(session);
4788
11fdf7f2 4789 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4790
4791 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4792 if (inode_map.count(vino)) {
4793 Inode *in = NULL;
4794 in = inode_map[vino];
4795
4796 if (in) {
4797 in->quota = m->quota;
4798 in->rstat = m->rstat;
4799 }
4800 }
7c673cae
FG
4801}
4802
/**
 * Entry point for all MClientCaps messages from an MDS.
 *
 * Validates the session, applies any OSD epoch barrier carried by the
 * message, resolves the inode, and dispatches to the per-operation
 * handler.  If we do not know the inode, an IMPORT is answered with an
 * immediate cap release so the MDS does not wait on us.
 */
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // message from a connection we no longer track; drop it
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      // we can't hold a cap on an inode we don't know; give it straight back
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // These ops do not require us to already hold a cap from this MDS.
  // IMPORT deliberately falls through: after the import is recorded the
  // grant logic below runs against the freshly added cap.
  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4866
/**
 * Handle CEPH_CAP_OP_IMPORT: a cap is being migrated to this MDS from a
 * peer MDS.  Install/refresh our cap under the importing session, remove
 * the now-stale peer cap, and re-flush anything pending if we just became
 * the auth cap holder.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // Remember the exporting MDS's cap (if any): its credentials carry over
  // to the imported cap, and it is removed once the new cap is in place.
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
                 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the peer cap only if it still matches the id named in the message
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4905
/**
 * Handle CEPH_CAP_OP_EXPORT: this MDS is migrating our cap to a peer MDS.
 * If the peer already holds a cap for the inode, merge the exported bits
 * into it; otherwise create a cap under the peer's session.  The local cap
 * is removed in either case.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // only act if the export names the cap we actually hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        const auto peer_mds = mds_rank_t(m->peer.mds);
        MetaSession *tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          Cap &tcap = it->second;
          // merge only if the peer cap matches and is older than the
          // exported state (seq comparison is wrap-aware)
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            // auth-ness follows the cap to the peer session
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession);
          }
        } else {
          // peer has no cap yet: create one carrying the exported bits
          add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the cap is simply being dropped; note that so we can
        // tell the MDS our 'wanted' state later
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4950
11fdf7f2 4951void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4952{
4953 mds_rank_t mds = session->mds_num;
11fdf7f2 4954 ceph_assert(in->caps.count(mds));
7c673cae 4955
11fdf7f2 4956 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
4957 << " size " << in->size << " -> " << m->get_size()
4958 << dendl;
4959
1adf2230
AA
4960 int issued;
4961 in->caps_issued(&issued);
4962 issued |= in->caps_dirty();
4963 update_inode_file_size(in, issued, m->get_size(),
4964 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4965}
4966
11fdf7f2 4967void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
4968{
4969 ceph_tid_t flush_ack_tid = m->get_client_tid();
4970 int dirty = m->get_dirty();
4971 int cleaned = 0;
4972 int flushed = 0;
4973
11fdf7f2
TL
4974 auto it = in->flushing_cap_tids.begin();
4975 if (it->first < flush_ack_tid) {
4976 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4977 << " got unexpected flush ack tid " << flush_ack_tid
4978 << " expected is " << it->first << dendl;
4979 }
4980 for (; it != in->flushing_cap_tids.end(); ) {
7c673cae
FG
4981 if (it->first == flush_ack_tid)
4982 cleaned = it->second;
4983 if (it->first <= flush_ack_tid) {
4984 session->flushing_caps_tids.erase(it->first);
4985 in->flushing_cap_tids.erase(it++);
4986 ++flushed;
4987 continue;
4988 }
4989 cleaned &= ~it->second;
4990 if (!cleaned)
4991 break;
4992 ++it;
4993 }
4994
11fdf7f2 4995 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
4996 << " cleaned " << ccap_string(cleaned) << " on " << *in
4997 << " with " << ccap_string(dirty) << dendl;
4998
4999 if (flushed) {
5000 signal_cond_list(in->waitfor_caps);
5001 if (session->flushing_caps_tids.empty() ||
5002 *session->flushing_caps_tids.begin() > flush_ack_tid)
5003 sync_cond.Signal();
5004 }
5005
5006 if (!dirty) {
5007 in->cap_dirtier_uid = -1;
5008 in->cap_dirtier_gid = -1;
5009 }
5010
5011 if (!cleaned) {
5012 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5013 } else {
5014 if (in->flushing_caps) {
5015 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5016 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5017 in->flushing_caps &= ~cleaned;
5018 if (in->flushing_caps == 0) {
5019 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5020 num_flushing_caps--;
5021 if (in->cap_snaps.empty())
5022 in->flushing_cap_item.remove_myself();
5023 }
5024 if (!in->caps_dirty())
5025 put_inode(in);
5026 }
5027 }
7c673cae
FG
5028}
5029
5030
/**
 * Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS persisted a snapped-cap flush.
 * Retires the matching CapSnap (by flush tid) and drops the inode's
 * flushing-list linkage when nothing remains in flight.  Duplicate acks
 * are tolerated and merely logged.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (m->get_client_tid() != capsnap.flush_tid) {
      // ack for a different flush of this snap; ignore
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
                    << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
        tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
        in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(it);
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
                  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5058
5059class C_Client_DentryInvalidate : public Context {
5060private:
5061 Client *client;
5062 vinodeno_t dirino;
5063 vinodeno_t ino;
5064 string name;
5065public:
5066 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5067 client(c), name(dn->name) {
5068 if (client->use_faked_inos()) {
5069 dirino.ino = dn->dir->parent_inode->faked_ino;
5070 if (del)
5071 ino.ino = dn->inode->faked_ino;
5072 } else {
5073 dirino = dn->dir->parent_inode->vino();
5074 if (del)
5075 ino = dn->inode->vino();
5076 }
5077 if (!del)
5078 ino.ino = inodeno_t();
5079 }
5080 void finish(int r) override {
5081 // _async_dentry_invalidate is responsible for its own locking
11fdf7f2 5082 ceph_assert(!client->client_lock.is_locked_by_me());
7c673cae
FG
5083 client->_async_dentry_invalidate(dirino, ino, name);
5084 }
5085};
5086
5087void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5088{
5089 if (unmounting)
5090 return;
11fdf7f2 5091 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae
FG
5092 << " in dir " << dirino << dendl;
5093 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5094}
5095
5096void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5097{
5098 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5099 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5100}
5101
/**
 * Try to release an inode's cached state: expire its child dentries,
 * close its Dir, trim a cached snapdir, and optionally schedule kernel
 * dcache invalidation for any remaining links.
 *
 * @param in inode to trim
 * @param sched_inval if true, queue async dentry-invalidate callbacks for
 *        links still pinned by the kernel (ll_ref > 0)
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // close_dir released the Dir's ref on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // drop the cached ".snap" dir as well
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() erases the current link
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dentries doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5145
11fdf7f2 5146void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5147{
5148 mds_rank_t mds = session->mds_num;
5149 int used = get_caps_used(in);
5150 int wanted = in->caps_wanted();
5151
a8e16298
TL
5152 const unsigned new_caps = m->get_caps();
5153 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5154 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5155 << " mds." << mds << " seq " << m->get_seq()
5156 << " caps now " << ccap_string(new_caps)
a8e16298
TL
5157 << " was " << ccap_string(cap->issued)
5158 << (was_stale ? "" : " (stale)") << dendl;
5159
5160 if (was_stale)
5161 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5162 cap->seq = m->get_seq();
28e407b8 5163 cap->gen = session->cap_gen;
7c673cae 5164
11fdf7f2 5165 check_cap_issue(in, new_caps);
a8e16298 5166
7c673cae 5167 // update inode
1adf2230
AA
5168 int issued;
5169 in->caps_issued(&issued);
5170 issued |= in->caps_dirty();
7c673cae 5171
1adf2230
AA
5172 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5173 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5174 in->mode = m->head.mode;
5175 in->uid = m->head.uid;
5176 in->gid = m->head.gid;
5177 in->btime = m->btime;
5178 }
5179 bool deleted_inode = false;
1adf2230
AA
5180 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5181 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5182 in->nlink = m->head.nlink;
5183 if (in->nlink == 0 &&
5184 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5185 deleted_inode = true;
5186 }
1adf2230 5187 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5188 m->xattrbl.length() &&
5189 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5190 auto p = m->xattrbl.cbegin();
5191 decode(in->xattrs, p);
7c673cae
FG
5192 in->xattr_version = m->head.xattr_version;
5193 }
28e407b8
AA
5194
5195 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5196 in->dirstat.nfiles = m->get_nfiles();
5197 in->dirstat.nsubdirs = m->get_nsubdirs();
5198 }
5199
1adf2230
AA
5200 if (new_caps & CEPH_CAP_ANY_RD) {
5201 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5202 m->get_ctime(), m->get_mtime(), m->get_atime());
5203 }
5204
5205 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5206 in->layout = m->get_layout();
5207 update_inode_file_size(in, issued, m->get_size(),
5208 m->get_truncate_seq(), m->get_truncate_size());
5209 }
5210
5211 if (m->inline_version > in->inline_version) {
5212 in->inline_data = m->inline_data;
5213 in->inline_version = m->inline_version;
5214 }
5215
5216 /* always take a newer change attr */
5217 if (m->get_change_attr() > in->change_attr)
5218 in->change_attr = m->get_change_attr();
7c673cae
FG
5219
5220 // max_size
5221 if (cap == in->auth_cap &&
1adf2230
AA
5222 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5223 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5224 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5225 in->max_size = m->get_max_size();
5226 if (in->max_size > in->wanted_max_size) {
5227 in->wanted_max_size = 0;
5228 in->requested_max_size = 0;
5229 }
5230 }
5231
5232 bool check = false;
a8e16298
TL
5233 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5234 (wanted & ~(cap->wanted | new_caps))) {
5235 // If mds is importing cap, prior cap messages that update 'wanted'
5236 // may get dropped by mds (migrate seq mismatch).
5237 //
5238 // We don't send cap message to update 'wanted' if what we want are
5239 // already issued. If mds revokes caps, cap message that releases caps
5240 // also tells mds what we want. But if caps got revoked by mds forcedly
5241 // (session stale). We may haven't told mds what we want.
7c673cae 5242 check = true;
a8e16298 5243 }
7c673cae 5244
7c673cae
FG
5245
5246 // update caps
a8e16298 5247 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5248 if (revoked) {
5249 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5250 cap->issued = new_caps;
5251 cap->implemented |= new_caps;
5252
b32b8144
FG
5253 // recall delegations if we're losing caps necessary for them
5254 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5255 in->recall_deleg(false);
5256 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5257 in->recall_deleg(true);
5258
11fdf7f2
TL
5259 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5260 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5261 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5262 // waitin' for flush
11fdf7f2 5263 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
7c673cae
FG
5264 if (_release(in))
5265 check = true;
5266 } else {
5267 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5268 check = true;
5269 }
a8e16298
TL
5270 } else if (cap->issued == new_caps) {
5271 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5272 } else {
a8e16298 5273 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5274 cap->issued = new_caps;
5275 cap->implemented |= new_caps;
5276
5277 if (cap == in->auth_cap) {
5278 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5279 for (const auto &p : in->caps) {
5280 if (&p.second == cap)
7c673cae 5281 continue;
11fdf7f2 5282 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5283 check = true;
5284 break;
5285 }
5286 }
5287 }
5288 }
5289
5290 if (check)
5291 check_caps(in, 0);
5292
5293 // wake up waiters
5294 if (new_caps)
5295 signal_cond_list(in->waitfor_caps);
5296
5297 // may drop inode's last ref
5298 if (deleted_inode)
5299 _try_to_trim_inode(in, true);
7c673cae
FG
5300}
5301
7c673cae
FG
5302int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5303{
5304 if (perms.uid() == 0)
5305 return 0;
5306
5307 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5308 int ret = _posix_acl_permission(in, perms, want);
5309 if (ret != -EAGAIN)
5310 return ret;
5311 }
5312
5313 // check permissions before doing anything else
5314 if (!in->check_mode(perms, want))
5315 return -EACCES;
5316 return 0;
5317}
5318
5319int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5320 const UserPerm& perms)
5321{
5322 int r = _getattr_for_perm(in, perms);
5323 if (r < 0)
5324 goto out;
5325
5326 r = 0;
5327 if (strncmp(name, "system.", 7) == 0) {
5328 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5329 r = -EPERM;
5330 } else {
5331 r = inode_permission(in, perms, want);
5332 }
5333out:
1adf2230 5334 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5335 return r;
5336}
5337
5338ostream& operator<<(ostream &out, const UserPerm& perm) {
5339 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5340 return out;
5341}
5342
/**
 * Permission check for setattr: verifies that the caller may apply each
 * field named in @p mask, mirroring the kernel's setattr_prepare rules
 * (chown/chgrp restricted to root; chmod to root/owner with setgid bit
 * stripping; timestamps to root/owner or anyone with write access when
 * only *_NOW forms are used).
 *
 * @return 0 if allowed, negative errno otherwise.
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner (a no-op chown by the owner is ok)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may change group only to a group they belong to
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
                             (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-members of the (possibly new) group cannot keep the setgid bit
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
              CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamps need ownership; "set to now" only needs
      // write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
        check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
        check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
        goto out;
      } else {
        r = inode_permission(in, perms, MAY_WRITE);
        if (r < 0)
          goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5399
5400int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5401{
181888fb 5402 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5403 unsigned want = 0;
5404
5405 if ((flags & O_ACCMODE) == O_WRONLY)
5406 want = MAY_WRITE;
5407 else if ((flags & O_ACCMODE) == O_RDWR)
5408 want = MAY_READ | MAY_WRITE;
5409 else if ((flags & O_ACCMODE) == O_RDONLY)
5410 want = MAY_READ;
5411 if (flags & O_TRUNC)
5412 want |= MAY_WRITE;
5413
5414 int r = 0;
5415 switch (in->mode & S_IFMT) {
5416 case S_IFLNK:
5417 r = -ELOOP;
5418 goto out;
5419 case S_IFDIR:
5420 if (want & MAY_WRITE) {
5421 r = -EISDIR;
5422 goto out;
5423 }
5424 break;
5425 }
5426
5427 r = _getattr_for_perm(in, perms);
5428 if (r < 0)
5429 goto out;
5430
5431 r = inode_permission(in, perms, want);
5432out:
5433 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5434 return r;
5435}
5436
5437int Client::may_lookup(Inode *dir, const UserPerm& perms)
5438{
181888fb 5439 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5440 int r = _getattr_for_perm(dir, perms);
5441 if (r < 0)
5442 goto out;
5443
5444 r = inode_permission(dir, perms, MAY_EXEC);
5445out:
5446 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5447 return r;
5448}
5449
5450int Client::may_create(Inode *dir, const UserPerm& perms)
5451{
181888fb 5452 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5453 int r = _getattr_for_perm(dir, perms);
5454 if (r < 0)
5455 goto out;
5456
5457 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5458out:
5459 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5460 return r;
5461}
5462
5463int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5464{
181888fb 5465 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5466 int r = _getattr_for_perm(dir, perms);
5467 if (r < 0)
5468 goto out;
5469
5470 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5471 if (r < 0)
5472 goto out;
5473
5474 /* 'name == NULL' means rmsnap */
5475 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5476 InodeRef otherin;
5477 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5478 if (r < 0)
5479 goto out;
5480 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5481 r = -EPERM;
5482 }
5483out:
5484 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5485 return r;
5486}
5487
5488int Client::may_hardlink(Inode *in, const UserPerm& perms)
5489{
181888fb 5490 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5491 int r = _getattr_for_perm(in, perms);
5492 if (r < 0)
5493 goto out;
5494
5495 if (perms.uid() == 0 || perms.uid() == in->uid) {
5496 r = 0;
5497 goto out;
5498 }
5499
5500 r = -EPERM;
5501 if (!S_ISREG(in->mode))
5502 goto out;
5503
5504 if (in->mode & S_ISUID)
5505 goto out;
5506
5507 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5508 goto out;
5509
5510 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5511out:
5512 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5513 return r;
5514}
5515
5516int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5517{
5518 int mask = CEPH_STAT_CAP_MODE;
5519 bool force = false;
5520 if (acl_type != NO_ACL) {
5521 mask |= CEPH_STAT_CAP_XATTR;
5522 force = in->xattr_version == 0;
5523 }
5524 return _getattr(in, mask, perms, force);
5525}
5526
5527vinodeno_t Client::_get_vino(Inode *in)
5528{
5529 /* The caller must hold the client lock */
5530 return vinodeno_t(in->ino, in->snapid);
5531}
5532
7c673cae
FG
5533/**
5534 * Resolve an MDS spec to a list of MDS daemon GIDs.
5535 *
5536 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5537 * It may be '*' in which case it matches all GIDs.
5538 *
5539 * If no error is returned, the `targets` vector will be populated with at least
5540 * one MDS.
5541 */
5542int Client::resolve_mds(
5543 const std::string &mds_spec,
5544 std::vector<mds_gid_t> *targets)
5545{
11fdf7f2
TL
5546 ceph_assert(fsmap);
5547 ceph_assert(targets != nullptr);
7c673cae
FG
5548
5549 mds_role_t role;
5550 std::stringstream ss;
5551 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5552 if (role_r == 0) {
5553 // We got a role, resolve it to a GID
5554 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5555 << role << "'" << dendl;
5556 targets->push_back(
5557 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5558 return 0;
5559 }
5560
5561 std::string strtol_err;
5562 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5563 if (strtol_err.empty()) {
5564 // It is a possible GID
5565 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5566 if (fsmap->gid_exists(mds_gid)) {
5567 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5568 targets->push_back(mds_gid);
5569 } else {
5570 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5571 << dendl;
5572 return -ENOENT;
5573 }
5574 } else if (mds_spec == "*") {
5575 // It is a wildcard: use all MDSs
5576 const auto mds_info = fsmap->get_mds_info();
5577
5578 if (mds_info.empty()) {
5579 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5580 return -ENOENT;
5581 }
5582
5583 for (const auto i : mds_info) {
5584 targets->push_back(i.first);
5585 }
5586 } else {
5587 // It did not parse as an integer, it is not a wildcard, it must be a name
5588 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5589 if (mds_gid == 0) {
5590 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5591
5592 lderr(cct) << "FSMap: " << *fsmap << dendl;
5593
5594 return -ENOENT;
5595 } else {
5596 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5597 << "' to GID " << mds_gid << dendl;
5598 targets->push_back(mds_gid);
5599 }
5600 }
5601
5602 return 0;
5603}
5604
5605
5606/**
5607 * Authenticate with mon and establish global ID
5608 */
5609int Client::authenticate()
5610{
11fdf7f2 5611 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
5612
5613 if (monclient->is_authenticated()) {
5614 return 0;
5615 }
5616
5617 client_lock.Unlock();
5618 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5619 client_lock.Lock();
5620 if (r < 0) {
5621 return r;
5622 }
5623
5624 whoami = monclient->get_global_id();
5625 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5626
5627 return 0;
5628}
5629
/**
 * Fetch the current FSMap (or FSMapUser) from the monitors and wait until
 * our cached copy is at least that recent.
 *
 * @param user true to fetch the trimmed-down "fsmap.user" map, false for
 *        the full FSMap (needed to address individual daemons).
 * @return 0 on success, negative errno on failure to learn the version.
 */
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the client lock while waiting on the monitor round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and block until the user map catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
                 << fsmap_latest << dendl;
  return 0;
}
5673
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Send an administrative command to the MDS daemon(s) matching @p mds_spec.
 * Laggy daemons are skipped; @p onfinish completes once every targeted
 * daemon has replied (replies are routed through handle_command_reply via
 * command_table).
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need a full FSMap to translate the spec into daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state; the table entry is matched up with the
    // reply in handle_command_reply by tid
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
                  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5760
11fdf7f2 5761void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5762{
5763 ceph_tid_t const tid = m->get_tid();
5764
5765 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5766
5767 if (!command_table.exists(tid)) {
5768 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5769 return;
5770 }
5771
5772 auto &op = command_table.get_command(tid);
5773 if (op.outbl) {
11fdf7f2 5774 *op.outbl = m->get_data();
7c673cae
FG
5775 }
5776 if (op.outs) {
5777 *op.outs = m->rs;
5778 }
5779
5780 if (op.on_finish) {
5781 op.on_finish->complete(m->r);
5782 }
5783
5784 command_table.erase(tid);
7c673cae
FG
5785}
5786
5787// -------------------
5788// MOUNT
5789
11fdf7f2 5790int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 5791{
7c673cae
FG
5792 int r = authenticate();
5793 if (r < 0) {
5794 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5795 return r;
5796 }
5797
11fdf7f2
TL
5798 std::string resolved_fs_name;
5799 if (fs_name.empty()) {
5800 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5801 } else {
5802 resolved_fs_name = fs_name;
5803 }
5804
7c673cae 5805 std::string want = "mdsmap";
11fdf7f2 5806 if (!resolved_fs_name.empty()) {
7c673cae
FG
5807 r = fetch_fsmap(true);
5808 if (r < 0)
5809 return r;
11fdf7f2
TL
5810 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5811 if (fscid == FS_CLUSTER_ID_NONE) {
7c673cae 5812 return -ENOENT;
11fdf7f2 5813 }
7c673cae
FG
5814
5815 std::ostringstream oss;
11fdf7f2 5816 oss << want << "." << fscid;
7c673cae
FG
5817 want = oss.str();
5818 }
5819 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5820
5821 monclient->sub_want(want, 0, 0);
5822 monclient->renew_subs();
5823
11fdf7f2
TL
5824 return 0;
5825}
5826
/**
 * Mount the filesystem: subscribe to the MDS map, optionally wait for
 * an available MDS cluster, then walk from the mount root up to "/"
 * issuing GETATTRs so the root (and its ancestry, for quota purposes)
 * is pinned in cache.
 *
 * @param mount_root  path to mount (empty means "/")
 * @param perms       credentials used for the GETATTR walk
 * @param require_mds fail with CEPH_FUSE_NO_MDS_UP if the MDS cluster
 *                    is stuck unavailable instead of proceeding
 * @param fs_name     filesystem name passed to subscribe_mdsmap()
 * @return 0 on success (also when already mounted), negative errno or
 *         CEPH_FUSE_NO_MDS_UP on failure.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  // Mounting twice is a no-op, not an error.
  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root.  The
  // ancestor pass exists so quota information above the mount point is
  // available; EACCES there is tolerated once the root itself is in.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);   // hold a reference on the root for the mount's lifetime

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5920
5921// UNMOUNT
5922
5923void Client::_close_sessions()
5924{
5925 while (!mds_sessions.empty()) {
5926 // send session closes!
11fdf7f2
TL
5927 for (auto &p : mds_sessions) {
5928 if (p.second.state != MetaSession::STATE_CLOSING) {
5929 _close_mds_session(&p.second);
7c673cae
FG
5930 }
5931 }
5932
5933 // wait for sessions to close
5934 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5935 mount_cond.Wait(client_lock);
5936 }
5937}
5938
31f18b77
FG
5939void Client::flush_mdlog_sync()
5940{
5941 if (mds_requests.empty())
5942 return;
11fdf7f2
TL
5943 for (auto &p : mds_sessions) {
5944 flush_mdlog(&p.second);
31f18b77
FG
5945 }
5946}
5947
5948void Client::flush_mdlog(MetaSession *session)
5949{
5950 // Only send this to Luminous or newer MDS daemons, older daemons
5951 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5952 const uint64_t features = session->con->get_features();
5953 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
11fdf7f2
TL
5954 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5955 session->con->send_message2(std::move(m));
31f18b77
FG
5956 }
5957}
5958
5959
11fdf7f2
TL
/**
 * Abort all outstanding MDS requests with the given error and
 * force-close every MDS session.
 *
 * @param err negative errno propagated to waiting callers
 *            (e.g. -ENOTCONN on connection teardown).
 */
void Client::_abort_mds_sessions(int err)
{
  // Advance the iterator before touching the request: aborting may
  // lead to the entry being erased.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // Wake the thread blocked waiting for this request's reply.
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
5987
/**
 * Tear down the mount.
 *
 * In the normal case (abort == false) everything is flushed out first:
 * the MDS journal, dirty caps, and buffered file data.  In the abort
 * (or blacklisted) case pending work is cancelled/discarded instead,
 * since the cluster will no longer accept it.
 *
 * The ordering below matters: requests are drained before files and
 * dirs are torn down, buffered data is flushed/purged before dirty
 * caps are handled, and sessions are closed only after the cache is
 * fully emptied.
 *
 * @param abort true to abandon in-flight work instead of flushing it.
 */
void Client::_unmount(bool abort)
{
  // Idempotent: a second caller just returns.
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    // Cancel in-flight OSD writes too; no one will ack them.
    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for every outstanding MDS request to complete or abort.
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // Stop the periodic tick.
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // Same for low-level (ll_) handles the application never closed.
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // Let any in-flight unsafe (unacked) sync writes land first.
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	// Data can no longer be written back; just drop it.
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // Dirty caps cannot be flushed to a cluster we're cut off from:
    // mark them clean and drop the references they held.
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // Wait (with periodic cache dumps on timeout) until every cached
  // inode has been released, e.g. once the MDS acks our cap releases.
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6119
b32b8144
FG
6120void Client::unmount()
6121{
11fdf7f2
TL
6122 std::lock_guard lock(client_lock);
6123 _unmount(false);
6124}
6125
6126void Client::abort_conn()
6127{
6128 std::lock_guard lock(client_lock);
6129 _unmount(true);
b32b8144
FG
6130}
6131
7c673cae
FG
6132void Client::flush_cap_releases()
6133{
6134 // send any cap releases
11fdf7f2
TL
6135 for (auto &p : mds_sessions) {
6136 auto &session = p.second;
6137 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6138 p.first)) {
7c673cae
FG
6139 if (cct->_conf->client_inject_release_failure) {
6140 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6141 } else {
11fdf7f2 6142 session.con->send_message2(std::move(session.release));
7c673cae 6143 }
11fdf7f2 6144 session.release.reset();
7c673cae
FG
6145 }
6146 }
6147}
6148
/**
 * Periodic housekeeping, re-armed via the timer on every invocation:
 * times out mount-phase requests, renews caps, flushes cap releases,
 * checks delayed caps, and trims the cache.
 *
 * Runs under client_lock (the timer callback asserts this).
 */
void Client::tick()
{
  // Test hook: inject an artificial delay once, then reset the knob.
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // Re-arm ourselves before doing any work.
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  // While not yet mounted, abort the oldest pending request once it
  // exceeds the mount timeout and wake everyone who could be waiting.
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: the list is ordered by hold time, so stop at the
  // first inode whose hold has not yet expired.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6204
6205void Client::renew_caps()
6206{
6207 ldout(cct, 10) << "renew_caps()" << dendl;
6208 last_cap_renew = ceph_clock_now();
6209
11fdf7f2
TL
6210 for (auto &p : mds_sessions) {
6211 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6212 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6213 renew_caps(&p.second);
7c673cae
FG
6214 }
6215}
6216
6217void Client::renew_caps(MetaSession *session)
6218{
6219 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6220 session->last_cap_renew_request = ceph_clock_now();
6221 uint64_t seq = ++session->cap_renew_seq;
11fdf7f2 6222 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6223}
6224
6225
6226// ===============================================================
6227// high level (POSIXy) interface
6228
6229int Client::_do_lookup(Inode *dir, const string& name, int mask,
6230 InodeRef *target, const UserPerm& perms)
6231{
6232 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6233 MetaRequest *req = new MetaRequest(op);
6234 filepath path;
6235 dir->make_nosnap_relative_path(path);
6236 path.push_dentry(name);
6237 req->set_filepath(path);
6238 req->set_inode(dir);
6239 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6240 mask |= DEBUG_GETATTR_CAPS;
6241 req->head.args.getattr.mask = mask;
6242
11fdf7f2 6243 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6244
6245 int r = make_request(req, perms, target);
11fdf7f2 6246 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6247 return r;
6248}
6249
/**
 * Resolve one name within a directory, preferring cached dentries
 * whose leases (dentry lease or parent-dir shared cap) are still
 * valid, and falling back to an MDS lookup (_do_lookup) otherwise.
 *
 * Special names: "." returns dir itself, ".." the parent (via a
 * LOOKUPPARENT request when no parent dentry is cached), and the
 * configured snapdir name opens the virtual snapshot directory.
 *
 * @param dir    parent directory
 * @param dname  component name
 * @param mask   cap mask the cached inode must satisfy to be trusted
 * @param target result inode on success
 * @param perms  caller credentials
 * @return 0 on success, negative errno on failure (-ENOENT may be
 *         concluded locally when the dir is known complete).
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No cached parent linkage; ask an arbitrary in-MDS for it.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// Fall back to the directory itself (e.g. at the root).
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // The magic snapdir name (e.g. ".snap") opens the virtual snapshot
  // directory rather than doing a real lookup.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    // Only trust the cached dentry if its inode's caps cover `mask`
    // (a null inode is a cached negative entry and needs no caps).
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	// The lease is only valid while the issuing session itself is
	// alive and in the same cap generation.
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// A cached negative dentry in a complete directory proves the
	// name does not exist — no MDS round-trip needed.
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // Cache miss or stale lease: go to the MDS.
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // Valid cached negative dentry.
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6366
/**
 * Find the dentry for `name` in `dir`, creating a null (unlinked)
 * dentry if none is cached.
 *
 * @param dir         parent directory (its Dir is opened if needed)
 * @param name        component name
 * @param pdn         receives the found/created dentry
 * @param expect_null when true, fail with -EEXIST if a cached dentry
 *                    with a valid lease already points at an inode
 *                    (used by create-type ops that require absence).
 * @return 0 on success, -EEXIST as described above.
 */
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      // Lease only counts while the issuing session is alive and in
      // the same cap generation.
      if (s.cap_ttl > now &&
	  s.cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6398
/**
 * Resolve a path to an inode, component by component, honouring
 * symlinks and (optionally) per-component permission checks.
 *
 * @param origpath  path to resolve (absolute paths start at root,
 *                  relative ones at cwd)
 * @param end       receives the final inode (may be NULL)
 * @param perms     caller credentials
 * @param followsym whether to follow a symlink in the *last* position;
 *                  intermediate (directory) symlinks are always followed
 * @param mask      extra caps requested on the final component's lookup
 * @return 0 on success, negative errno (-ELOOP after MAXSYMLINKS,
 *         -ENOENT if resolution ends nowhere, etc.).
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// An absolute target restarts resolution from the root;
	// otherwise continue from the current directory.
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6480
6481
6482// namespace ops
6483
6484int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6485{
11fdf7f2 6486 std::lock_guard lock(client_lock);
7c673cae
FG
6487 tout(cct) << "link" << std::endl;
6488 tout(cct) << relexisting << std::endl;
6489 tout(cct) << relpath << std::endl;
6490
181888fb
FG
6491 if (unmounting)
6492 return -ENOTCONN;
6493
7c673cae
FG
6494 filepath existing(relexisting);
6495
6496 InodeRef in, dir;
6497 int r = path_walk(existing, &in, perm, true);
6498 if (r < 0)
6499 return r;
6500 if (std::string(relpath) == "/") {
6501 r = -EEXIST;
6502 return r;
6503 }
6504 filepath path(relpath);
6505 string name = path.last_dentry();
6506 path.pop_dentry();
6507
6508 r = path_walk(path, &dir, perm, true);
6509 if (r < 0)
6510 return r;
6511 if (cct->_conf->client_permissions) {
6512 if (S_ISDIR(in->mode)) {
6513 r = -EPERM;
6514 return r;
6515 }
6516 r = may_hardlink(in.get(), perm);
6517 if (r < 0)
6518 return r;
6519 r = may_create(dir.get(), perm);
6520 if (r < 0)
6521 return r;
6522 }
6523 r = _link(in.get(), dir.get(), name.c_str(), perm);
6524 return r;
6525}
6526
6527int Client::unlink(const char *relpath, const UserPerm& perm)
6528{
11fdf7f2
TL
6529 std::lock_guard lock(client_lock);
6530 tout(cct) << __func__ << std::endl;
7c673cae
FG
6531 tout(cct) << relpath << std::endl;
6532
181888fb
FG
6533 if (unmounting)
6534 return -ENOTCONN;
6535
7c673cae
FG
6536 if (std::string(relpath) == "/")
6537 return -EISDIR;
6538
6539 filepath path(relpath);
6540 string name = path.last_dentry();
6541 path.pop_dentry();
6542 InodeRef dir;
6543 int r = path_walk(path, &dir, perm);
6544 if (r < 0)
6545 return r;
6546 if (cct->_conf->client_permissions) {
6547 r = may_delete(dir.get(), name.c_str(), perm);
6548 if (r < 0)
6549 return r;
6550 }
6551 return _unlink(dir.get(), name.c_str(), perm);
6552}
6553
6554int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6555{
11fdf7f2
TL
6556 std::lock_guard lock(client_lock);
6557 tout(cct) << __func__ << std::endl;
7c673cae
FG
6558 tout(cct) << relfrom << std::endl;
6559 tout(cct) << relto << std::endl;
6560
181888fb
FG
6561 if (unmounting)
6562 return -ENOTCONN;
6563
7c673cae
FG
6564 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6565 return -EBUSY;
6566
6567 filepath from(relfrom);
6568 filepath to(relto);
6569 string fromname = from.last_dentry();
6570 from.pop_dentry();
6571 string toname = to.last_dentry();
6572 to.pop_dentry();
6573
6574 InodeRef fromdir, todir;
6575 int r = path_walk(from, &fromdir, perm);
6576 if (r < 0)
6577 goto out;
6578 r = path_walk(to, &todir, perm);
6579 if (r < 0)
6580 goto out;
6581
6582 if (cct->_conf->client_permissions) {
6583 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6584 if (r < 0)
6585 return r;
6586 r = may_delete(todir.get(), toname.c_str(), perm);
6587 if (r < 0 && r != -ENOENT)
6588 return r;
6589 }
6590 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6591out:
6592 return r;
6593}
6594
6595// dirs
6596
6597int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6598{
11fdf7f2
TL
6599 std::lock_guard lock(client_lock);
6600 tout(cct) << __func__ << std::endl;
7c673cae
FG
6601 tout(cct) << relpath << std::endl;
6602 tout(cct) << mode << std::endl;
11fdf7f2 6603 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6604
181888fb
FG
6605 if (unmounting)
6606 return -ENOTCONN;
6607
7c673cae
FG
6608 if (std::string(relpath) == "/")
6609 return -EEXIST;
6610
6611 filepath path(relpath);
6612 string name = path.last_dentry();
6613 path.pop_dentry();
6614 InodeRef dir;
6615 int r = path_walk(path, &dir, perm);
6616 if (r < 0)
6617 return r;
6618 if (cct->_conf->client_permissions) {
6619 r = may_create(dir.get(), perm);
6620 if (r < 0)
6621 return r;
6622 }
6623 return _mkdir(dir.get(), name.c_str(), mode, perm);
6624}
6625
/**
 * mkdir -p equivalent: walk the existing prefix of relpath, then
 * create each missing directory level in turn.
 *
 * @return 0 on success, -EEXIST if the full path already exists,
 *         or the first error hit while walking/creating.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  // NOTE(review): the walk starts at cwd even for absolute-looking
  // paths; presumably callers pass paths relative to the mount or the
  // filepath base ino covers this — confirm against callers.
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // A race with another creator is fine for intermediate levels:
    // fall back to looking the component up instead.
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6682
6683int Client::rmdir(const char *relpath, const UserPerm& perms)
6684{
11fdf7f2
TL
6685 std::lock_guard lock(client_lock);
6686 tout(cct) << __func__ << std::endl;
7c673cae
FG
6687 tout(cct) << relpath << std::endl;
6688
181888fb
FG
6689 if (unmounting)
6690 return -ENOTCONN;
6691
7c673cae
FG
6692 if (std::string(relpath) == "/")
6693 return -EBUSY;
6694
6695 filepath path(relpath);
6696 string name = path.last_dentry();
6697 path.pop_dentry();
6698 InodeRef dir;
6699 int r = path_walk(path, &dir, perms);
6700 if (r < 0)
6701 return r;
6702 if (cct->_conf->client_permissions) {
6703 int r = may_delete(dir.get(), name.c_str(), perms);
6704 if (r < 0)
6705 return r;
6706 }
6707 return _rmdir(dir.get(), name.c_str(), perms);
6708}
6709
6710int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6711{
11fdf7f2
TL
6712 std::lock_guard lock(client_lock);
6713 tout(cct) << __func__ << std::endl;
7c673cae
FG
6714 tout(cct) << relpath << std::endl;
6715 tout(cct) << mode << std::endl;
6716 tout(cct) << rdev << std::endl;
6717
181888fb
FG
6718 if (unmounting)
6719 return -ENOTCONN;
6720
7c673cae
FG
6721 if (std::string(relpath) == "/")
6722 return -EEXIST;
6723
6724 filepath path(relpath);
6725 string name = path.last_dentry();
6726 path.pop_dentry();
6727 InodeRef dir;
6728 int r = path_walk(path, &dir, perms);
6729 if (r < 0)
6730 return r;
6731 if (cct->_conf->client_permissions) {
6732 int r = may_create(dir.get(), perms);
6733 if (r < 0)
6734 return r;
6735 }
6736 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6737}
6738
6739// symlinks
6740
6741int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6742{
11fdf7f2
TL
6743 std::lock_guard lock(client_lock);
6744 tout(cct) << __func__ << std::endl;
7c673cae
FG
6745 tout(cct) << target << std::endl;
6746 tout(cct) << relpath << std::endl;
6747
181888fb
FG
6748 if (unmounting)
6749 return -ENOTCONN;
6750
7c673cae
FG
6751 if (std::string(relpath) == "/")
6752 return -EEXIST;
6753
6754 filepath path(relpath);
6755 string name = path.last_dentry();
6756 path.pop_dentry();
6757 InodeRef dir;
6758 int r = path_walk(path, &dir, perms);
6759 if (r < 0)
6760 return r;
6761 if (cct->_conf->client_permissions) {
6762 int r = may_create(dir.get(), perms);
6763 if (r < 0)
6764 return r;
6765 }
6766 return _symlink(dir.get(), name.c_str(), target, perms);
6767}
6768
6769int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6770{
11fdf7f2
TL
6771 std::lock_guard lock(client_lock);
6772 tout(cct) << __func__ << std::endl;
7c673cae
FG
6773 tout(cct) << relpath << std::endl;
6774
181888fb
FG
6775 if (unmounting)
6776 return -ENOTCONN;
6777
7c673cae
FG
6778 filepath path(relpath);
6779 InodeRef in;
6780 int r = path_walk(path, &in, perms, false);
6781 if (r < 0)
6782 return r;
6783
6784 return _readlink(in.get(), buf, size);
6785}
6786
6787int Client::_readlink(Inode *in, char *buf, size_t size)
6788{
6789 if (!in->is_symlink())
6790 return -EINVAL;
6791
6792 // copy into buf (at most size bytes)
6793 int r = in->symlink.length();
6794 if (r > (int)size)
6795 r = size;
6796 memcpy(buf, in->symlink.c_str(), r);
6797 return r;
6798}
6799
6800
6801// inode stuff
6802
6803int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6804{
94b18763 6805 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6806
11fdf7f2 6807 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6808 if (yes && !force)
6809 return 0;
6810
6811 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6812 filepath path;
6813 in->make_nosnap_relative_path(path);
6814 req->set_filepath(path);
6815 req->set_inode(in);
6816 req->head.args.getattr.mask = mask;
6817
6818 int res = make_request(req, perms);
11fdf7f2 6819 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6820 return res;
6821}
6822
6823int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6824 const UserPerm& perms, InodeRef *inp)
6825{
6826 int issued = in->caps_issued();
6827
11fdf7f2 6828 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6829 ccap_string(issued) << dendl;
6830
6831 if (in->snapid != CEPH_NOSNAP) {
6832 return -EROFS;
6833 }
6834 if ((mask & CEPH_SETATTR_SIZE) &&
6835 (unsigned long)stx->stx_size > in->size &&
6836 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6837 perms)) {
6838 return -EDQUOT;
6839 }
6840
6841 // make the change locally?
6842 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6843 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6844 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6845 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6846 << in->cap_dirtier_gid << ", forcing sync setattr"
6847 << dendl;
6848 /*
6849 * This works because we implicitly flush the caps as part of the
6850 * request, so the cap update check will happen with the writeback
6851 * cap context, and then the setattr check will happen with the
6852 * caller's context.
6853 *
6854 * In reality this pattern is likely pretty rare (different users
6855 * setattr'ing the same file). If that turns out not to be the
6856 * case later, we can build a more complex pipelined cap writeback
6857 * infrastructure...
6858 */
6859 if (!mask)
6860 mask |= CEPH_SETATTR_CTIME;
6861 goto force_request;
6862 }
6863
6864 if (!mask) {
6865 // caller just needs us to bump the ctime
6866 in->ctime = ceph_clock_now();
6867 in->cap_dirtier_uid = perms.uid();
6868 in->cap_dirtier_gid = perms.gid();
6869 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 6870 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 6871 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 6872 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 6873 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 6874 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
6875 else
6876 mask |= CEPH_SETATTR_CTIME;
6877 }
6878
6879 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6880 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6881
6882 mask &= ~CEPH_SETATTR_KILL_SGUID;
6883
6884 if (mask & CEPH_SETATTR_UID) {
6885 in->ctime = ceph_clock_now();
6886 in->cap_dirtier_uid = perms.uid();
6887 in->cap_dirtier_gid = perms.gid();
6888 in->uid = stx->stx_uid;
28e407b8 6889 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6890 mask &= ~CEPH_SETATTR_UID;
6891 kill_sguid = true;
6892 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6893 }
6894 if (mask & CEPH_SETATTR_GID) {
6895 in->ctime = ceph_clock_now();
6896 in->cap_dirtier_uid = perms.uid();
6897 in->cap_dirtier_gid = perms.gid();
6898 in->gid = stx->stx_gid;
28e407b8 6899 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6900 mask &= ~CEPH_SETATTR_GID;
6901 kill_sguid = true;
6902 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6903 }
6904
6905 if (mask & CEPH_SETATTR_MODE) {
6906 in->ctime = ceph_clock_now();
6907 in->cap_dirtier_uid = perms.uid();
6908 in->cap_dirtier_gid = perms.gid();
6909 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 6910 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6911 mask &= ~CEPH_SETATTR_MODE;
6912 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6913 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6914 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6915 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 6916 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6917 }
6918
6919 if (mask & CEPH_SETATTR_BTIME) {
6920 in->ctime = ceph_clock_now();
6921 in->cap_dirtier_uid = perms.uid();
6922 in->cap_dirtier_gid = perms.gid();
6923 in->btime = utime_t(stx->stx_btime);
28e407b8 6924 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6925 mask &= ~CEPH_SETATTR_BTIME;
6926 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6927 }
6928 } else if (mask & CEPH_SETATTR_SIZE) {
6929 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6930 mask |= CEPH_SETATTR_KILL_SGUID;
6931 }
6932
6933 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6934 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6935 if (mask & CEPH_SETATTR_MTIME)
6936 in->mtime = utime_t(stx->stx_mtime);
6937 if (mask & CEPH_SETATTR_ATIME)
6938 in->atime = utime_t(stx->stx_atime);
6939 in->ctime = ceph_clock_now();
6940 in->cap_dirtier_uid = perms.uid();
6941 in->cap_dirtier_gid = perms.gid();
6942 in->time_warp_seq++;
28e407b8 6943 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
6944 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6945 }
6946 }
6947 if (!mask) {
6948 in->change_attr++;
6949 return 0;
6950 }
6951
6952force_request:
6953 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6954
6955 filepath path;
6956
6957 in->make_nosnap_relative_path(path);
6958 req->set_filepath(path);
6959 req->set_inode(in);
6960
6961 if (mask & CEPH_SETATTR_KILL_SGUID) {
6962 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6963 }
6964 if (mask & CEPH_SETATTR_MODE) {
6965 req->head.args.setattr.mode = stx->stx_mode;
6966 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6967 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6968 }
6969 if (mask & CEPH_SETATTR_UID) {
6970 req->head.args.setattr.uid = stx->stx_uid;
6971 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6972 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6973 }
6974 if (mask & CEPH_SETATTR_GID) {
6975 req->head.args.setattr.gid = stx->stx_gid;
6976 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6977 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6978 }
6979 if (mask & CEPH_SETATTR_BTIME) {
6980 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6981 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6982 }
6983 if (mask & CEPH_SETATTR_MTIME) {
6984 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 6985 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
6986 CEPH_CAP_FILE_WR;
6987 }
6988 if (mask & CEPH_SETATTR_ATIME) {
6989 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6990 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6991 CEPH_CAP_FILE_WR;
6992 }
6993 if (mask & CEPH_SETATTR_SIZE) {
6994 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6995 req->head.args.setattr.size = stx->stx_size;
6996 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6997 } else { //too big!
6998 put_request(req);
6999 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7000 return -EFBIG;
7001 }
94b18763 7002 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7003 CEPH_CAP_FILE_WR;
7004 }
7005 req->head.args.setattr.mask = mask;
7006
7007 req->regetattr_mask = mask;
7008
7009 int res = make_request(req, perms, inp);
7010 ldout(cct, 10) << "_setattr result=" << res << dendl;
7011 return res;
7012}
7013
7014/* Note that we only care about attrs that setattr cares about */
7015void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7016{
7017 stx->stx_size = st->st_size;
7018 stx->stx_mode = st->st_mode;
7019 stx->stx_uid = st->st_uid;
7020 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7021#ifdef __APPLE__
7022 stx->stx_mtime = st->st_mtimespec;
7023 stx->stx_atime = st->st_atimespec;
7024#else
7c673cae
FG
7025 stx->stx_mtime = st->st_mtim;
7026 stx->stx_atime = st->st_atim;
11fdf7f2 7027#endif
7c673cae
FG
7028}
7029
7030int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7031 const UserPerm& perms, InodeRef *inp)
7032{
7033 int ret = _do_setattr(in, stx, mask, perms, inp);
7034 if (ret < 0)
7035 return ret;
7036 if (mask & CEPH_SETATTR_MODE)
7037 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7038 return ret;
7039}
7040
7041int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7042 const UserPerm& perms)
7043{
7044 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7045 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7046 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7047 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7048 if (cct->_conf->client_permissions) {
7049 int r = may_setattr(in.get(), stx, mask, perms);
7050 if (r < 0)
7051 return r;
7052 }
7053 return __setattrx(in.get(), stx, mask, perms);
7054}
7055
7056int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7057 const UserPerm& perms)
7058{
7059 struct ceph_statx stx;
7060
7061 stat_to_statx(attr, &stx);
7062 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7063
7064 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7065 mask &= ~CEPH_SETATTR_UID;
7066 }
7067 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7068 mask &= ~CEPH_SETATTR_GID;
7069 }
7070
7c673cae
FG
7071 return _setattrx(in, &stx, mask, perms);
7072}
7073
7074int Client::setattr(const char *relpath, struct stat *attr, int mask,
7075 const UserPerm& perms)
7076{
11fdf7f2
TL
7077 std::lock_guard lock(client_lock);
7078 tout(cct) << __func__ << std::endl;
7c673cae
FG
7079 tout(cct) << relpath << std::endl;
7080 tout(cct) << mask << std::endl;
7081
181888fb
FG
7082 if (unmounting)
7083 return -ENOTCONN;
7084
7c673cae
FG
7085 filepath path(relpath);
7086 InodeRef in;
7087 int r = path_walk(path, &in, perms);
7088 if (r < 0)
7089 return r;
7090 return _setattr(in, attr, mask, perms);
7091}
7092
7093int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7094 const UserPerm& perms, int flags)
7095{
11fdf7f2
TL
7096 std::lock_guard lock(client_lock);
7097 tout(cct) << __func__ << std::endl;
7c673cae
FG
7098 tout(cct) << relpath << std::endl;
7099 tout(cct) << mask << std::endl;
7100
181888fb
FG
7101 if (unmounting)
7102 return -ENOTCONN;
7103
7c673cae
FG
7104 filepath path(relpath);
7105 InodeRef in;
7106 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7107 if (r < 0)
7108 return r;
7109 return _setattrx(in, stx, mask, perms);
7110}
7111
7112int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7113{
11fdf7f2
TL
7114 std::lock_guard lock(client_lock);
7115 tout(cct) << __func__ << std::endl;
7c673cae
FG
7116 tout(cct) << fd << std::endl;
7117 tout(cct) << mask << std::endl;
7118
181888fb
FG
7119 if (unmounting)
7120 return -ENOTCONN;
7121
7c673cae
FG
7122 Fh *f = get_filehandle(fd);
7123 if (!f)
7124 return -EBADF;
7125#if defined(__linux__) && defined(O_PATH)
7126 if (f->flags & O_PATH)
7127 return -EBADF;
7128#endif
7129 return _setattr(f->inode, attr, mask, perms);
7130}
7131
7132int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7133{
11fdf7f2
TL
7134 std::lock_guard lock(client_lock);
7135 tout(cct) << __func__ << std::endl;
7c673cae
FG
7136 tout(cct) << fd << std::endl;
7137 tout(cct) << mask << std::endl;
7138
181888fb
FG
7139 if (unmounting)
7140 return -ENOTCONN;
7141
7c673cae
FG
7142 Fh *f = get_filehandle(fd);
7143 if (!f)
7144 return -EBADF;
7145#if defined(__linux__) && defined(O_PATH)
7146 if (f->flags & O_PATH)
7147 return -EBADF;
7148#endif
7149 return _setattrx(f->inode, stx, mask, perms);
7150}
7151
7152int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7153 frag_info_t *dirstat, int mask)
7154{
11fdf7f2
TL
7155 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7156 std::lock_guard lock(client_lock);
7c673cae
FG
7157 tout(cct) << "stat" << std::endl;
7158 tout(cct) << relpath << std::endl;
181888fb
FG
7159
7160 if (unmounting)
7161 return -ENOTCONN;
7162
7c673cae
FG
7163 filepath path(relpath);
7164 InodeRef in;
7165 int r = path_walk(path, &in, perms, true, mask);
7166 if (r < 0)
7167 return r;
7168 r = _getattr(in, mask, perms);
7169 if (r < 0) {
11fdf7f2 7170 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7171 return r;
7172 }
7173 fill_stat(in, stbuf, dirstat);
11fdf7f2 7174 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7175 return r;
7176}
7177
7178unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7179{
7180 unsigned mask = 0;
7181
7182 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7183 if (flags & AT_NO_ATTR_SYNC)
7184 goto out;
7185
7186 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7187 mask |= CEPH_CAP_PIN;
7188 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7189 mask |= CEPH_CAP_AUTH_SHARED;
7190 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7191 mask |= CEPH_CAP_LINK_SHARED;
7192 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7193 mask |= CEPH_CAP_FILE_SHARED;
7194 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7195 mask |= CEPH_CAP_XATTR_SHARED;
7196out:
7197 return mask;
7198}
7199
7200int Client::statx(const char *relpath, struct ceph_statx *stx,
7201 const UserPerm& perms,
7202 unsigned int want, unsigned int flags)
7203{
11fdf7f2
TL
7204 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7205 std::lock_guard lock(client_lock);
7c673cae
FG
7206 tout(cct) << "statx" << std::endl;
7207 tout(cct) << relpath << std::endl;
181888fb
FG
7208
7209 if (unmounting)
7210 return -ENOTCONN;
7211
7c673cae
FG
7212 filepath path(relpath);
7213 InodeRef in;
7214
7215 unsigned mask = statx_to_mask(flags, want);
7216
7217 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7218 if (r < 0)
7219 return r;
7220
7221 r = _getattr(in, mask, perms);
7222 if (r < 0) {
11fdf7f2 7223 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7224 return r;
7225 }
7226
7227 fill_statx(in, mask, stx);
11fdf7f2 7228 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7229 return r;
7230}
7231
7232int Client::lstat(const char *relpath, struct stat *stbuf,
7233 const UserPerm& perms, frag_info_t *dirstat, int mask)
7234{
11fdf7f2
TL
7235 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7236 std::lock_guard lock(client_lock);
7237 tout(cct) << __func__ << std::endl;
7c673cae 7238 tout(cct) << relpath << std::endl;
181888fb
FG
7239
7240 if (unmounting)
7241 return -ENOTCONN;
7242
7c673cae
FG
7243 filepath path(relpath);
7244 InodeRef in;
7245 // don't follow symlinks
7246 int r = path_walk(path, &in, perms, false, mask);
7247 if (r < 0)
7248 return r;
7249 r = _getattr(in, mask, perms);
7250 if (r < 0) {
11fdf7f2 7251 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7252 return r;
7253 }
7254 fill_stat(in, stbuf, dirstat);
11fdf7f2 7255 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7256 return r;
7257}
7258
7259int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7260{
11fdf7f2 7261 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7262 << " mode 0" << oct << in->mode << dec
7263 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7264 memset(st, 0, sizeof(struct stat));
7265 if (use_faked_inos())
7266 st->st_ino = in->faked_ino;
7267 else
7268 st->st_ino = in->ino;
7269 st->st_dev = in->snapid;
7270 st->st_mode = in->mode;
7271 st->st_rdev = in->rdev;
28e407b8
AA
7272 if (in->is_dir()) {
7273 switch (in->nlink) {
7274 case 0:
7275 st->st_nlink = 0; /* dir is unlinked */
7276 break;
7277 case 1:
7278 st->st_nlink = 1 /* parent dentry */
7279 + 1 /* <dir>/. */
7280 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7281 break;
7282 default:
7283 ceph_abort();
7284 }
7285 } else {
7286 st->st_nlink = in->nlink;
7287 }
7c673cae
FG
7288 st->st_uid = in->uid;
7289 st->st_gid = in->gid;
7290 if (in->ctime > in->mtime) {
7291 stat_set_ctime_sec(st, in->ctime.sec());
7292 stat_set_ctime_nsec(st, in->ctime.nsec());
7293 } else {
7294 stat_set_ctime_sec(st, in->mtime.sec());
7295 stat_set_ctime_nsec(st, in->mtime.nsec());
7296 }
7297 stat_set_atime_sec(st, in->atime.sec());
7298 stat_set_atime_nsec(st, in->atime.nsec());
7299 stat_set_mtime_sec(st, in->mtime.sec());
7300 stat_set_mtime_nsec(st, in->mtime.nsec());
7301 if (in->is_dir()) {
7302 if (cct->_conf->client_dirsize_rbytes)
7303 st->st_size = in->rstat.rbytes;
7304 else
7305 st->st_size = in->dirstat.size();
7306 st->st_blocks = 1;
7307 } else {
7308 st->st_size = in->size;
7309 st->st_blocks = (in->size + 511) >> 9;
7310 }
11fdf7f2 7311 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7c673cae
FG
7312
7313 if (dirstat)
7314 *dirstat = in->dirstat;
7315 if (rstat)
7316 *rstat = in->rstat;
7317
7318 return in->caps_issued();
7319}
7320
7321void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7322{
11fdf7f2 7323 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7c673cae
FG
7324 << " mode 0" << oct << in->mode << dec
7325 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7326 memset(stx, 0, sizeof(struct ceph_statx));
7327
7328 /*
7329 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7330 * so that all bits are set.
7331 */
7332 if (!mask)
7333 mask = ~0;
7334
7335 /* These are always considered to be available */
7336 stx->stx_dev = in->snapid;
11fdf7f2 7337 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7c673cae
FG
7338
7339 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7340 stx->stx_mode = S_IFMT & in->mode;
7341 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7342 stx->stx_rdev = in->rdev;
7343 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7344
7345 if (mask & CEPH_CAP_AUTH_SHARED) {
7346 stx->stx_uid = in->uid;
7347 stx->stx_gid = in->gid;
7348 stx->stx_mode = in->mode;
7349 in->btime.to_timespec(&stx->stx_btime);
7350 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7351 }
7352
7353 if (mask & CEPH_CAP_LINK_SHARED) {
28e407b8
AA
7354 if (in->is_dir()) {
7355 switch (in->nlink) {
7356 case 0:
7357 stx->stx_nlink = 0; /* dir is unlinked */
7358 break;
7359 case 1:
7360 stx->stx_nlink = 1 /* parent dentry */
7361 + 1 /* <dir>/. */
7362 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7363 break;
7364 default:
7365 ceph_abort();
7366 }
7367 } else {
7368 stx->stx_nlink = in->nlink;
7369 }
7c673cae
FG
7370 stx->stx_mask |= CEPH_STATX_NLINK;
7371 }
7372
7373 if (mask & CEPH_CAP_FILE_SHARED) {
7374
7375 in->atime.to_timespec(&stx->stx_atime);
7376 in->mtime.to_timespec(&stx->stx_mtime);
7377
7378 if (in->is_dir()) {
7379 if (cct->_conf->client_dirsize_rbytes)
7380 stx->stx_size = in->rstat.rbytes;
7381 else
7382 stx->stx_size = in->dirstat.size();
7383 stx->stx_blocks = 1;
7384 } else {
7385 stx->stx_size = in->size;
7386 stx->stx_blocks = (in->size + 511) >> 9;
7387 }
7388 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7389 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7390 }
7391
7392 /* Change time and change_attr both require all shared caps to view */
7393 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7394 stx->stx_version = in->change_attr;
7395 if (in->ctime > in->mtime)
7396 in->ctime.to_timespec(&stx->stx_ctime);
7397 else
7398 in->mtime.to_timespec(&stx->stx_ctime);
7399 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7400 }
7401
7402}
7403
7404void Client::touch_dn(Dentry *dn)
7405{
7406 lru.lru_touch(dn);
7407}
7408
7409int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7410{
11fdf7f2
TL
7411 std::lock_guard lock(client_lock);
7412 tout(cct) << __func__ << std::endl;
7c673cae
FG
7413 tout(cct) << relpath << std::endl;
7414 tout(cct) << mode << std::endl;
181888fb
FG
7415
7416 if (unmounting)
7417 return -ENOTCONN;
7418
7c673cae
FG
7419 filepath path(relpath);
7420 InodeRef in;
7421 int r = path_walk(path, &in, perms);
7422 if (r < 0)
7423 return r;
7424 struct stat attr;
7425 attr.st_mode = mode;
7426 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7427}
7428
7429int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7430{
11fdf7f2
TL
7431 std::lock_guard lock(client_lock);
7432 tout(cct) << __func__ << std::endl;
7c673cae
FG
7433 tout(cct) << fd << std::endl;
7434 tout(cct) << mode << std::endl;
181888fb
FG
7435
7436 if (unmounting)
7437 return -ENOTCONN;
7438
7c673cae
FG
7439 Fh *f = get_filehandle(fd);
7440 if (!f)
7441 return -EBADF;
7442#if defined(__linux__) && defined(O_PATH)
7443 if (f->flags & O_PATH)
7444 return -EBADF;
7445#endif
7446 struct stat attr;
7447 attr.st_mode = mode;
7448 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7449}
7450
7451int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7452{
11fdf7f2
TL
7453 std::lock_guard lock(client_lock);
7454 tout(cct) << __func__ << std::endl;
7c673cae
FG
7455 tout(cct) << relpath << std::endl;
7456 tout(cct) << mode << std::endl;
181888fb
FG
7457
7458 if (unmounting)
7459 return -ENOTCONN;
7460
7c673cae
FG
7461 filepath path(relpath);
7462 InodeRef in;
7463 // don't follow symlinks
7464 int r = path_walk(path, &in, perms, false);
7465 if (r < 0)
7466 return r;
7467 struct stat attr;
7468 attr.st_mode = mode;
7469 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7470}
7471
7472int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7473 const UserPerm& perms)
7474{
11fdf7f2
TL
7475 std::lock_guard lock(client_lock);
7476 tout(cct) << __func__ << std::endl;
7c673cae
FG
7477 tout(cct) << relpath << std::endl;
7478 tout(cct) << new_uid << std::endl;
7479 tout(cct) << new_gid << std::endl;
181888fb
FG
7480
7481 if (unmounting)
7482 return -ENOTCONN;
7483
7c673cae
FG
7484 filepath path(relpath);
7485 InodeRef in;
7486 int r = path_walk(path, &in, perms);
7487 if (r < 0)
7488 return r;
7489 struct stat attr;
7490 attr.st_uid = new_uid;
7491 attr.st_gid = new_gid;
181888fb 7492 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7493}
7494
7495int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7496{
11fdf7f2
TL
7497 std::lock_guard lock(client_lock);
7498 tout(cct) << __func__ << std::endl;
7c673cae
FG
7499 tout(cct) << fd << std::endl;
7500 tout(cct) << new_uid << std::endl;
7501 tout(cct) << new_gid << std::endl;
181888fb
FG
7502
7503 if (unmounting)
7504 return -ENOTCONN;
7505
7c673cae
FG
7506 Fh *f = get_filehandle(fd);
7507 if (!f)
7508 return -EBADF;
7509#if defined(__linux__) && defined(O_PATH)
7510 if (f->flags & O_PATH)
7511 return -EBADF;
7512#endif
7513 struct stat attr;
7514 attr.st_uid = new_uid;
7515 attr.st_gid = new_gid;
7516 int mask = 0;
7517 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7518 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7519 return _setattr(f->inode, &attr, mask, perms);
7520}
7521
7522int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7523 const UserPerm& perms)
7524{
11fdf7f2
TL
7525 std::lock_guard lock(client_lock);
7526 tout(cct) << __func__ << std::endl;
7c673cae
FG
7527 tout(cct) << relpath << std::endl;
7528 tout(cct) << new_uid << std::endl;
7529 tout(cct) << new_gid << std::endl;
181888fb
FG
7530
7531 if (unmounting)
7532 return -ENOTCONN;
7533
7c673cae
FG
7534 filepath path(relpath);
7535 InodeRef in;
7536 // don't follow symlinks
7537 int r = path_walk(path, &in, perms, false);
7538 if (r < 0)
7539 return r;
7540 struct stat attr;
7541 attr.st_uid = new_uid;
7542 attr.st_gid = new_gid;
7543 int mask = 0;
7544 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7545 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7546 return _setattr(in, &attr, mask, perms);
7547}
7548
11fdf7f2
TL
7549static void attr_set_atime_and_mtime(struct stat *attr,
7550 const utime_t &atime,
7551 const utime_t &mtime)
7552{
7553 stat_set_atime_sec(attr, atime.tv.tv_sec);
7554 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7555 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7556 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7557}
7558
7559// for [l]utime() invoke the timeval variant as the timespec
7560// variant are not yet implemented. for futime[s](), invoke
7561// the timespec variant.
7c673cae
FG
7562int Client::utime(const char *relpath, struct utimbuf *buf,
7563 const UserPerm& perms)
7564{
11fdf7f2
TL
7565 struct timeval tv[2];
7566 tv[0].tv_sec = buf->actime;
7567 tv[0].tv_usec = 0;
7568 tv[1].tv_sec = buf->modtime;
7569 tv[1].tv_usec = 0;
7570
7571 return utimes(relpath, tv, perms);
7572}
7573
7574int Client::lutime(const char *relpath, struct utimbuf *buf,
7575 const UserPerm& perms)
7576{
7577 struct timeval tv[2];
7578 tv[0].tv_sec = buf->actime;
7579 tv[0].tv_usec = 0;
7580 tv[1].tv_sec = buf->modtime;
7581 tv[1].tv_usec = 0;
7582
7583 return lutimes(relpath, tv, perms);
7584}
7585
7586int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7587{
7588 struct timespec ts[2];
7589 ts[0].tv_sec = buf->actime;
7590 ts[0].tv_nsec = 0;
7591 ts[1].tv_sec = buf->modtime;
7592 ts[1].tv_nsec = 0;
7593
7594 return futimens(fd, ts, perms);
7595}
7596
7597int Client::utimes(const char *relpath, struct timeval times[2],
7598 const UserPerm& perms)
7599{
7600 std::lock_guard lock(client_lock);
7601 tout(cct) << __func__ << std::endl;
7c673cae 7602 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7603 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7604 << std::endl;
7605 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7606 << std::endl;
181888fb
FG
7607
7608 if (unmounting)
7609 return -ENOTCONN;
7610
7c673cae
FG
7611 filepath path(relpath);
7612 InodeRef in;
7613 int r = path_walk(path, &in, perms);
7614 if (r < 0)
7615 return r;
7616 struct stat attr;
11fdf7f2
TL
7617 utime_t atime(times[0]);
7618 utime_t mtime(times[1]);
7619
7620 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7621 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7622}
7623
11fdf7f2
TL
7624int Client::lutimes(const char *relpath, struct timeval times[2],
7625 const UserPerm& perms)
7c673cae 7626{
11fdf7f2
TL
7627 std::lock_guard lock(client_lock);
7628 tout(cct) << __func__ << std::endl;
7c673cae 7629 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7630 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7631 << std::endl;
7632 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7633 << std::endl;
181888fb
FG
7634
7635 if (unmounting)
7636 return -ENOTCONN;
7637
7c673cae
FG
7638 filepath path(relpath);
7639 InodeRef in;
7c673cae
FG
7640 int r = path_walk(path, &in, perms, false);
7641 if (r < 0)
7642 return r;
7643 struct stat attr;
11fdf7f2
TL
7644 utime_t atime(times[0]);
7645 utime_t mtime(times[1]);
7646
7647 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7648 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7649}
7650
11fdf7f2
TL
7651int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7652{
7653 struct timespec ts[2];
7654 ts[0].tv_sec = times[0].tv_sec;
7655 ts[0].tv_nsec = times[0].tv_usec * 1000;
7656 ts[1].tv_sec = times[1].tv_sec;
7657 ts[1].tv_nsec = times[1].tv_usec * 1000;
7658
7659 return futimens(fd, ts, perms);
7660}
7661
7662int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7663{
7664 std::lock_guard lock(client_lock);
7665 tout(cct) << __func__ << std::endl;
7666 tout(cct) << fd << std::endl;
7667 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7668 << std::endl;
7669 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7670 << std::endl;
7671
7672 if (unmounting)
7673 return -ENOTCONN;
7674
7675 Fh *f = get_filehandle(fd);
7676 if (!f)
7677 return -EBADF;
7678#if defined(__linux__) && defined(O_PATH)
7679 if (f->flags & O_PATH)
7680 return -EBADF;
7681#endif
7682 struct stat attr;
7683 utime_t atime(times[0]);
7684 utime_t mtime(times[1]);
7685
7686 attr_set_atime_and_mtime(&attr, atime, mtime);
7687 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7688}
7689
7c673cae
FG
7690int Client::flock(int fd, int operation, uint64_t owner)
7691{
11fdf7f2
TL
7692 std::lock_guard lock(client_lock);
7693 tout(cct) << __func__ << std::endl;
7c673cae
FG
7694 tout(cct) << fd << std::endl;
7695 tout(cct) << operation << std::endl;
7696 tout(cct) << owner << std::endl;
181888fb
FG
7697
7698 if (unmounting)
7699 return -ENOTCONN;
7700
7c673cae
FG
7701 Fh *f = get_filehandle(fd);
7702 if (!f)
7703 return -EBADF;
7704
7705 return _flock(f, operation, owner);
7706}
7707
7708int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7709{
11fdf7f2
TL
7710 std::lock_guard lock(client_lock);
7711 tout(cct) << __func__ << std::endl;
7c673cae 7712 tout(cct) << relpath << std::endl;
181888fb
FG
7713
7714 if (unmounting)
7715 return -ENOTCONN;
7716
7c673cae
FG
7717 filepath path(relpath);
7718 InodeRef in;
7719 int r = path_walk(path, &in, perms, true);
7720 if (r < 0)
7721 return r;
7722 if (cct->_conf->client_permissions) {
7723 int r = may_open(in.get(), O_RDONLY, perms);
7724 if (r < 0)
7725 return r;
7726 }
7727 r = _opendir(in.get(), dirpp, perms);
7728 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7729 if (r != -ENOTDIR)
7730 tout(cct) << (unsigned long)*dirpp << std::endl;
7731 return r;
7732}
7733
7734int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7735{
7736 if (!in->is_dir())
7737 return -ENOTDIR;
7738 *dirpp = new dir_result_t(in, perms);
7739 opened_dirs.insert(*dirpp);
11fdf7f2 7740 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7741 return 0;
7742}
7743
7744
7745int Client::closedir(dir_result_t *dir)
7746{
11fdf7f2
TL
7747 std::lock_guard lock(client_lock);
7748 tout(cct) << __func__ << std::endl;
7c673cae
FG
7749 tout(cct) << (unsigned long)dir << std::endl;
7750
11fdf7f2 7751 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7c673cae
FG
7752 _closedir(dir);
7753 return 0;
7754}
7755
7756void Client::_closedir(dir_result_t *dirp)
7757{
11fdf7f2 7758 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
7c673cae 7759 if (dirp->inode) {
11fdf7f2 7760 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7c673cae
FG
7761 dirp->inode.reset();
7762 }
7763 _readdir_drop_dirp_buffer(dirp);
7764 opened_dirs.erase(dirp);
7765 delete dirp;
7766}
7767
7768void Client::rewinddir(dir_result_t *dirp)
7769{
11fdf7f2
TL
7770 std::lock_guard lock(client_lock);
7771 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb
FG
7772
7773 if (unmounting)
7774 return;
7775
7c673cae
FG
7776 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7777 _readdir_drop_dirp_buffer(d);
7778 d->reset();
7779}
7780
7781loff_t Client::telldir(dir_result_t *dirp)
7782{
7783 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 7784 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
7785 return d->offset;
7786}
7787
7788void Client::seekdir(dir_result_t *dirp, loff_t offset)
7789{
11fdf7f2 7790 std::lock_guard lock(client_lock);
7c673cae 7791
11fdf7f2 7792 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7c673cae 7793
181888fb
FG
7794 if (unmounting)
7795 return;
7796
7c673cae
FG
7797 if (offset == dirp->offset)
7798 return;
7799
7800 if (offset > dirp->offset)
7801 dirp->release_count = 0; // bump if we do a forward seek
7802 else
7803 dirp->ordered_count = 0; // disable filling readdir cache
7804
7805 if (dirp->hash_order()) {
7806 if (dirp->offset > offset) {
7807 _readdir_drop_dirp_buffer(dirp);
7808 dirp->reset();
7809 }
7810 } else {
7811 if (offset == 0 ||
7812 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7813 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7814 _readdir_drop_dirp_buffer(dirp);
7815 dirp->reset();
7816 }
7817 }
7818
7819 dirp->offset = offset;
7820}
7821
7822
7823//struct dirent {
7824// ino_t d_ino; /* inode number */
7825// off_t d_off; /* offset to the next dirent */
7826// unsigned short d_reclen; /* length of this record */
7827// unsigned char d_type; /* type of file */
7828// char d_name[256]; /* filename */
7829//};
7830void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7831{
7832 strncpy(de->d_name, name, 255);
7833 de->d_name[255] = '\0';
7834#ifndef __CYGWIN__
7835 de->d_ino = ino;
11fdf7f2 7836#if !defined(__APPLE__) && !defined(__FreeBSD__)
7c673cae
FG
7837 de->d_off = next_off;
7838#endif
7839 de->d_reclen = 1;
7840 de->d_type = IFTODT(type);
11fdf7f2 7841 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7c673cae
FG
7842 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7843#endif
7844}
7845
7846void Client::_readdir_next_frag(dir_result_t *dirp)
7847{
7848 frag_t fg = dirp->buffer_frag;
7849
7850 if (fg.is_rightmost()) {
11fdf7f2 7851 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7c673cae
FG
7852 dirp->set_end();
7853 return;
7854 }
7855
7856 // advance
7857 fg = fg.next();
11fdf7f2 7858 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7c673cae
FG
7859
7860 if (dirp->hash_order()) {
7861 // keep last_name
7862 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7863 if (dirp->offset < new_offset) // don't decrease offset
7864 dirp->offset = new_offset;
7865 } else {
7866 dirp->last_name.clear();
7867 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7868 _readdir_rechoose_frag(dirp);
7869 }
7870}
7871
7872void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7873{
11fdf7f2 7874 ceph_assert(dirp->inode);
7c673cae
FG
7875
7876 if (dirp->hash_order())
7877 return;
7878
7879 frag_t cur = frag_t(dirp->offset_high());
7880 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7881 if (fg != cur) {
11fdf7f2 7882 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7c673cae
FG
7883 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7884 dirp->last_name.clear();
7885 dirp->next_offset = 2;
7886 }
7887}
7888
7889void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7890{
11fdf7f2 7891 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7c673cae
FG
7892 dirp->buffer.clear();
7893}
7894
7895int Client::_readdir_get_frag(dir_result_t *dirp)
7896{
11fdf7f2
TL
7897 ceph_assert(dirp);
7898 ceph_assert(dirp->inode);
7c673cae
FG
7899
7900 // get the current frag.
7901 frag_t fg;
7902 if (dirp->hash_order())
7903 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7904 else
7905 fg = frag_t(dirp->offset_high());
7906
11fdf7f2 7907 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7c673cae
FG
7908 << " offset " << hex << dirp->offset << dec << dendl;
7909
7910 int op = CEPH_MDS_OP_READDIR;
7911 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7912 op = CEPH_MDS_OP_LSSNAP;
7913
7914 InodeRef& diri = dirp->inode;
7915
7916 MetaRequest *req = new MetaRequest(op);
7917 filepath path;
7918 diri->make_nosnap_relative_path(path);
7919 req->set_filepath(path);
7920 req->set_inode(diri.get());
7921 req->head.args.readdir.frag = fg;
7922 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7923 if (dirp->last_name.length()) {
94b18763 7924 req->path2.set_path(dirp->last_name);
7c673cae
FG
7925 } else if (dirp->hash_order()) {
7926 req->head.args.readdir.offset_hash = dirp->offset_high();
7927 }
7928 req->dirp = dirp;
7929
7930 bufferlist dirbl;
7931 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7932
7933 if (res == -EAGAIN) {
11fdf7f2 7934 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7c673cae
FG
7935 _readdir_rechoose_frag(dirp);
7936 return _readdir_get_frag(dirp);
7937 }
7938
7939 if (res == 0) {
11fdf7f2 7940 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7c673cae
FG
7941 << " size " << dirp->buffer.size() << dendl;
7942 } else {
11fdf7f2 7943 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7c673cae
FG
7944 dirp->set_end();
7945 }
7946
7947 return res;
7948}
7949
7950struct dentry_off_lt {
7951 bool operator()(const Dentry* dn, int64_t off) const {
7952 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
7953 }
7954};
7955
7956int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7957 int caps, bool getref)
7958{
11fdf7f2
TL
7959 ceph_assert(client_lock.is_locked());
7960 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7c673cae
FG
7961 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7962 << dendl;
7963 Dir *dir = dirp->inode->dir;
7964
7965 if (!dir) {
7966 ldout(cct, 10) << " dir is empty" << dendl;
7967 dirp->set_end();
7968 return 0;
7969 }
7970
7971 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7972 dir->readdir_cache.end(),
7973 dirp->offset, dentry_off_lt());
7974
7975 string dn_name;
7976 while (true) {
7977 if (!dirp->inode->is_complete_and_ordered())
7978 return -EAGAIN;
7979 if (pd == dir->readdir_cache.end())
7980 break;
7981 Dentry *dn = *pd;
7982 if (dn->inode == NULL) {
7983 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7984 ++pd;
7985 continue;
7986 }
7987 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7988 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7989 ++pd;
7990 continue;
7991 }
7992
7993 int r = _getattr(dn->inode, caps, dirp->perms);
7994 if (r < 0)
7995 return r;
7996
7997 struct ceph_statx stx;
7998 struct dirent de;
7999 fill_statx(dn->inode, caps, &stx);
8000
8001 uint64_t next_off = dn->offset + 1;
8002 ++pd;
8003 if (pd == dir->readdir_cache.end())
8004 next_off = dir_result_t::END;
8005
8006 Inode *in = NULL;
8007 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8008 if (getref) {
8009 in = dn->inode.get();
8010 _ll_get(in);
8011 }
8012
8013 dn_name = dn->name; // fill in name while we have lock
8014
8015 client_lock.Unlock();
8016 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8017 client_lock.Lock();
8018 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8019 << " = " << r << dendl;
8020 if (r < 0) {
8021 return r;
8022 }
8023
8024 dirp->offset = next_off;
8025 if (dirp->at_end())
8026 dirp->next_offset = 2;
8027 else
8028 dirp->next_offset = dirp->offset_low();
8029 dirp->last_name = dn_name; // we successfully returned this one; update!
28e407b8 8030 dirp->release_count = 0; // last_name no longer match cache index
7c673cae
FG
8031 if (r > 0)
8032 return r;
8033 }
8034
11fdf7f2 8035 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7c673cae
FG
8036 dirp->set_end();
8037 return 0;
8038}
8039
8040int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8041 unsigned want, unsigned flags, bool getref)
8042{
8043 int caps = statx_to_mask(flags, want);
8044
11fdf7f2 8045 std::lock_guard lock(client_lock);
7c673cae 8046
181888fb
FG
8047 if (unmounting)
8048 return -ENOTCONN;
8049
7c673cae
FG
8050 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8051
11fdf7f2 8052 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
7c673cae
FG
8053 << dec << " at_end=" << dirp->at_end()
8054 << " hash_order=" << dirp->hash_order() << dendl;
8055
8056 struct dirent de;
8057 struct ceph_statx stx;
8058 memset(&de, 0, sizeof(de));
8059 memset(&stx, 0, sizeof(stx));
8060
8061 InodeRef& diri = dirp->inode;
8062
8063 if (dirp->at_end())
8064 return 0;
8065
8066 if (dirp->offset == 0) {
8067 ldout(cct, 15) << " including ." << dendl;
11fdf7f2 8068 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
7c673cae
FG
8069 uint64_t next_off = 1;
8070
8071 int r;
8072 r = _getattr(diri, caps, dirp->perms);
8073 if (r < 0)
8074 return r;
8075
8076 fill_statx(diri, caps, &stx);
8077 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8078
8079 Inode *inode = NULL;
8080 if (getref) {
8081 inode = diri.get();
8082 _ll_get(inode);
8083 }
8084
8085 client_lock.Unlock();
8086 r = cb(p, &de, &stx, next_off, inode);
8087 client_lock.Lock();
8088 if (r < 0)
8089 return r;
8090
8091 dirp->offset = next_off;
8092 if (r > 0)
8093 return r;
8094 }
8095 if (dirp->offset == 1) {
8096 ldout(cct, 15) << " including .." << dendl;
8097 uint64_t next_off = 2;
8098 InodeRef in;
11fdf7f2 8099 if (diri->dentries.empty())
7c673cae
FG
8100 in = diri;
8101 else
94b18763 8102 in = diri->get_first_parent()->dir->parent_inode;
7c673cae
FG
8103
8104 int r;
94b18763 8105 r = _getattr(in, caps, dirp->perms);
7c673cae
FG
8106 if (r < 0)
8107 return r;
8108
8109 fill_statx(in, caps, &stx);
8110 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8111
8112 Inode *inode = NULL;
8113 if (getref) {
8114 inode = in.get();
8115 _ll_get(inode);
8116 }
8117
8118 client_lock.Unlock();
8119 r = cb(p, &de, &stx, next_off, inode);
8120 client_lock.Lock();
8121 if (r < 0)
8122 return r;
8123
8124 dirp->offset = next_off;
8125 if (r > 0)
8126 return r;
8127 }
8128
8129 // can we read from our cache?
8130 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8131 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8132 << dirp->inode->is_complete_and_ordered()
8133 << " issued " << ccap_string(dirp->inode->caps_issued())
8134 << dendl;
8135 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8136 dirp->inode->is_complete_and_ordered() &&
94b18763 8137 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7c673cae
FG
8138 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8139 if (err != -EAGAIN)
8140 return err;
8141 }
8142
8143 while (1) {
8144 if (dirp->at_end())
8145 return 0;
8146
8147 bool check_caps = true;
8148 if (!dirp->is_cached()) {
8149 int r = _readdir_get_frag(dirp);
8150 if (r)
8151 return r;
8152 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8153 // different than the requested one. (our dirfragtree was outdated)
8154 check_caps = false;
8155 }
8156 frag_t fg = dirp->buffer_frag;
8157
8158 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8159 << " offset " << hex << dirp->offset << dendl;
8160
8161 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8162 dirp->offset, dir_result_t::dentry_off_lt());
8163 it != dirp->buffer.end();
8164 ++it) {
8165 dir_result_t::dentry &entry = *it;
8166
8167 uint64_t next_off = entry.offset + 1;
8168
8169 int r;
8170 if (check_caps) {
8171 r = _getattr(entry.inode, caps, dirp->perms);
8172 if (r < 0)
8173 return r;
8174 }
8175
8176 fill_statx(entry.inode, caps, &stx);
8177 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8178
8179 Inode *inode = NULL;
8180 if (getref) {
8181 inode = entry.inode.get();
8182 _ll_get(inode);
8183 }
8184
8185 client_lock.Unlock();
8186 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8187 client_lock.Lock();
8188
8189 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8190 << " = " << r << dendl;
8191 if (r < 0)
8192 return r;
8193
8194 dirp->offset = next_off;
8195 if (r > 0)
8196 return r;
8197 }
8198
8199 if (dirp->next_offset > 2) {
8200 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8201 _readdir_drop_dirp_buffer(dirp);
8202 continue; // more!
8203 }
8204
8205 if (!fg.is_rightmost()) {
8206 // next frag!
8207 _readdir_next_frag(dirp);
8208 continue;
8209 }
8210
8211 if (diri->shared_gen == dirp->start_shared_gen &&
8212 diri->dir_release_count == dirp->release_count) {
8213 if (diri->dir_ordered_count == dirp->ordered_count) {
8214 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8215 if (diri->dir) {
11fdf7f2 8216 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
7c673cae
FG
8217 diri->dir->readdir_cache.resize(dirp->cache_index);
8218 }
8219 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8220 } else {
8221 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8222 diri->flags |= I_COMPLETE;
8223 }
8224 }
8225
8226 dirp->set_end();
8227 return 0;
8228 }
8229 ceph_abort();
8230 return 0;
8231}
8232
8233
8234int Client::readdir_r(dir_result_t *d, struct dirent *de)
8235{
8236 return readdirplus_r(d, de, 0, 0, 0, NULL);
8237}
8238
8239/*
8240 * readdirplus_r
8241 *
8242 * returns
8243 * 1 if we got a dirent
8244 * 0 for end of directory
8245 * <0 on error
8246 */
8247
8248struct single_readdir {
8249 struct dirent *de;
8250 struct ceph_statx *stx;
8251 Inode *inode;
8252 bool full;
8253};
8254
8255static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8256 struct ceph_statx *stx, off_t off,
8257 Inode *in)
8258{
8259 single_readdir *c = static_cast<single_readdir *>(p);
8260
8261 if (c->full)
8262 return -1; // already filled this dirent
8263
8264 *c->de = *de;
8265 if (c->stx)
8266 *c->stx = *stx;
8267 c->inode = in;
8268 c->full = true;
8269 return 1;
8270}
8271
8272struct dirent *Client::readdir(dir_result_t *d)
8273{
8274 int ret;
8275 static struct dirent de;
8276 single_readdir sr;
8277 sr.de = &de;
8278 sr.stx = NULL;
8279 sr.inode = NULL;
8280 sr.full = false;
8281
8282 // our callback fills the dirent and sets sr.full=true on first
8283 // call, and returns -1 the second time around.
8284 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8285 if (ret < -1) {
8286 errno = -ret; // this sucks.
8287 return (dirent *) NULL;
8288 }
8289 if (sr.full) {
8290 return &de;
8291 }
8292 return (dirent *) NULL;
8293}
8294
8295int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8296 struct ceph_statx *stx, unsigned want,
8297 unsigned flags, Inode **out)
8298{
8299 single_readdir sr;
8300 sr.de = de;
8301 sr.stx = stx;
8302 sr.inode = NULL;
8303 sr.full = false;
8304
8305 // our callback fills the dirent and sets sr.full=true on first
8306 // call, and returns -1 the second time around.
8307 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8308 if (r < -1)
8309 return r;
8310 if (out)
8311 *out = sr.inode;
8312 if (sr.full)
8313 return 1;
8314 return 0;
8315}
8316
8317
8318/* getdents */
8319struct getdents_result {
8320 char *buf;
8321 int buflen;
8322 int pos;
8323 bool fullent;
8324};
8325
8326static int _readdir_getdent_cb(void *p, struct dirent *de,
8327 struct ceph_statx *stx, off_t off, Inode *in)
8328{
8329 struct getdents_result *c = static_cast<getdents_result *>(p);
8330
8331 int dlen;
8332 if (c->fullent)
8333 dlen = sizeof(*de);
8334 else
8335 dlen = strlen(de->d_name) + 1;
8336
8337 if (c->pos + dlen > c->buflen)
8338 return -1; // doesn't fit
8339
8340 if (c->fullent) {
8341 memcpy(c->buf + c->pos, de, sizeof(*de));
8342 } else {
8343 memcpy(c->buf + c->pos, de->d_name, dlen);
8344 }
8345 c->pos += dlen;
8346 return 0;
8347}
8348
8349int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8350{
8351 getdents_result gr;
8352 gr.buf = buf;
8353 gr.buflen = buflen;
8354 gr.fullent = fullent;
8355 gr.pos = 0;
8356
8357 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8358
8359 if (r < 0) { // some error
8360 if (r == -1) { // buffer ran out of space
8361 if (gr.pos) { // but we got some entries already!
8362 return gr.pos;
8363 } // or we need a larger buffer
8364 return -ERANGE;
8365 } else { // actual error, return it
8366 return r;
8367 }
8368 }
8369 return gr.pos;
8370}
8371
8372
8373/* getdir */
8374struct getdir_result {
8375 list<string> *contents;
8376 int num;
8377};
8378
8379static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8380{
8381 getdir_result *r = static_cast<getdir_result *>(p);
8382
8383 r->contents->push_back(de->d_name);
8384 r->num++;
8385 return 0;
8386}
8387
8388int Client::getdir(const char *relpath, list<string>& contents,
8389 const UserPerm& perms)
8390{
8391 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8392 {
11fdf7f2 8393 std::lock_guard lock(client_lock);
7c673cae
FG
8394 tout(cct) << "getdir" << std::endl;
8395 tout(cct) << relpath << std::endl;
8396 }
8397
8398 dir_result_t *d;
8399 int r = opendir(relpath, &d, perms);
8400 if (r < 0)
8401 return r;
8402
8403 getdir_result gr;
8404 gr.contents = &contents;
8405 gr.num = 0;
8406 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8407
8408 closedir(d);
8409
8410 if (r < 0)
8411 return r;
8412 return gr.num;
8413}
8414
8415
8416/****** file i/o **********/
8417int Client::open(const char *relpath, int flags, const UserPerm& perms,
8418 mode_t mode, int stripe_unit, int stripe_count,
8419 int object_size, const char *data_pool)
8420{
8421 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
11fdf7f2 8422 std::lock_guard lock(client_lock);
7c673cae
FG
8423 tout(cct) << "open" << std::endl;
8424 tout(cct) << relpath << std::endl;
8425 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8426
181888fb
FG
8427 if (unmounting)
8428 return -ENOTCONN;
8429
7c673cae
FG
8430 Fh *fh = NULL;
8431
8432#if defined(__linux__) && defined(O_PATH)
8433 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8434 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8435 * in kernel (fs/open.c). */
8436 if (flags & O_PATH)
8437 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8438#endif
8439
8440 filepath path(relpath);
8441 InodeRef in;
8442 bool created = false;
8443 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8444 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8445 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8446
8447 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8448 return -EEXIST;
8449
8450#if defined(__linux__) && defined(O_PATH)
8451 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8452#else
8453 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8454#endif
8455 return -ELOOP;
8456
8457 if (r == -ENOENT && (flags & O_CREAT)) {
8458 filepath dirpath = path;
8459 string dname = dirpath.last_dentry();
8460 dirpath.pop_dentry();
8461 InodeRef dir;
8462 r = path_walk(dirpath, &dir, perms, true,
8463 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8464 if (r < 0)
8465 goto out;
8466 if (cct->_conf->client_permissions) {
8467 r = may_create(dir.get(), perms);
8468 if (r < 0)
8469 goto out;
8470 }
8471 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8472 stripe_count, object_size, data_pool, &created, perms);
8473 }
8474 if (r < 0)
8475 goto out;
8476
8477 if (!created) {
8478 // posix says we can only check permissions of existing files
8479 if (cct->_conf->client_permissions) {
8480 r = may_open(in.get(), flags, perms);
8481 if (r < 0)
8482 goto out;
8483 }
8484 }
8485
8486 if (!fh)
8487 r = _open(in.get(), flags, mode, &fh, perms);
8488 if (r >= 0) {
8489 // allocate a integer file descriptor
11fdf7f2 8490 ceph_assert(fh);
7c673cae 8491 r = get_fd();
11fdf7f2 8492 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
8493 fd_map[r] = fh;
8494 }
8495
8496 out:
8497 tout(cct) << r << std::endl;
8498 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8499 return r;
8500}
8501
8502int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8503{
8504 /* Use default file striping parameters */
8505 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8506}
8507
8508int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8509 const UserPerm& perms)
8510{
11fdf7f2
TL
8511 std::lock_guard lock(client_lock);
8512 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8513
181888fb
FG
8514 if (unmounting)
8515 return -ENOTCONN;
8516
7c673cae
FG
8517 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8518 filepath path(ino);
8519 req->set_filepath(path);
8520
8521 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8522 char f[30];
8523 sprintf(f, "%u", h);
8524 filepath path2(dirino);
8525 path2.push_dentry(string(f));
8526 req->set_filepath2(path2);
8527
8528 int r = make_request(req, perms, NULL, NULL,
8529 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8530 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8531 return r;
8532}
8533
8534
8535/**
8536 * Load inode into local cache.
8537 *
8538 * If inode pointer is non-NULL, and take a reference on
8539 * the resulting Inode object in one operation, so that caller
8540 * can safely assume inode will still be there after return.
8541 */
1adf2230 8542int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8543{
11fdf7f2 8544 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
7c673cae 8545
181888fb
FG
8546 if (unmounting)
8547 return -ENOTCONN;
8548
7c673cae
FG
8549 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8550 filepath path(ino);
8551 req->set_filepath(path);
8552
8553 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8554 if (r == 0 && inode != NULL) {
8555 vinodeno_t vino(ino, CEPH_NOSNAP);
8556 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 8557 ceph_assert(p != inode_map.end());
7c673cae
FG
8558 *inode = p->second;
8559 _ll_get(*inode);
8560 }
11fdf7f2 8561 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8562 return r;
8563}
8564
1adf2230
AA
8565int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8566{
11fdf7f2 8567 std::lock_guard lock(client_lock);
1adf2230
AA
8568 return _lookup_ino(ino, perms, inode);
8569}
7c673cae
FG
8570
8571/**
8572 * Find the parent inode of `ino` and insert it into
8573 * our cache. Conditionally also set `parent` to a referenced
8574 * Inode* if caller provides non-NULL value.
8575 */
1adf2230 8576int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8577{
11fdf7f2 8578 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8579
181888fb
FG
8580 if (unmounting)
8581 return -ENOTCONN;
8582
11fdf7f2 8583 if (!ino->dentries.empty()) {
7c673cae
FG
8584 // if we exposed the parent here, we'd need to check permissions,
8585 // but right now we just rely on the MDS doing so in make_request
11fdf7f2 8586 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
7c673cae
FG
8587 return 0;
8588 }
8589
8590 if (ino->is_root()) {
8591 *parent = NULL;
1adf2230 8592 ldout(cct, 8) << "ino is root, no parent" << dendl;
7c673cae
FG
8593 return -EINVAL;
8594 }
8595
8596 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8597 filepath path(ino->ino);
8598 req->set_filepath(path);
8599
8600 InodeRef target;
8601 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8602 // Give caller a reference to the parent ino if they provided a pointer.
8603 if (parent != NULL) {
8604 if (r == 0) {
8605 *parent = target.get();
8606 _ll_get(*parent);
11fdf7f2 8607 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
8608 } else {
8609 *parent = NULL;
8610 }
8611 }
11fdf7f2 8612 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8613 return r;
8614}
8615
1adf2230
AA
8616int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8617{
11fdf7f2 8618 std::lock_guard lock(client_lock);
1adf2230
AA
8619 return _lookup_parent(ino, perms, parent);
8620}
7c673cae
FG
8621
8622/**
8623 * Populate the parent dentry for `ino`, provided it is
8624 * a child of `parent`.
8625 */
1adf2230 8626int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 8627{
11fdf7f2
TL
8628 ceph_assert(parent->is_dir());
8629 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8630
181888fb
FG
8631 if (unmounting)
8632 return -ENOTCONN;
8633
7c673cae
FG
8634 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8635 req->set_filepath2(filepath(parent->ino));
8636 req->set_filepath(filepath(ino->ino));
8637 req->set_inode(ino);
8638
8639 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 8640 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8641 return r;
8642}
8643
1adf2230
AA
8644int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8645{
11fdf7f2 8646 std::lock_guard lock(client_lock);
1adf2230
AA
8647 return _lookup_name(ino, parent, perms);
8648}
7c673cae 8649
11fdf7f2 8650Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 8651{
11fdf7f2
TL
8652 ceph_assert(in);
8653 Fh *f = new Fh(in, flags, cmode, perms);
7c673cae 8654
11fdf7f2 8655 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
8656
8657 if (in->snapid != CEPH_NOSNAP) {
8658 in->snap_cap_refs++;
8659 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8660 << ccap_string(in->caps_issued()) << dendl;
8661 }
8662
11fdf7f2 8663 const auto& conf = cct->_conf;
7c673cae
FG
8664 f->readahead.set_trigger_requests(1);
8665 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8666 uint64_t max_readahead = Readahead::NO_LIMIT;
8667 if (conf->client_readahead_max_bytes) {
11fdf7f2 8668 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
8669 }
8670 if (conf->client_readahead_max_periods) {
11fdf7f2 8671 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
8672 }
8673 f->readahead.set_max_readahead_size(max_readahead);
8674 vector<uint64_t> alignments;
8675 alignments.push_back(in->layout.get_period());
8676 alignments.push_back(in->layout.stripe_unit);
8677 f->readahead.set_alignments(alignments);
8678
8679 return f;
8680}
8681
8682int Client::_release_fh(Fh *f)
8683{
8684 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8685 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8686 Inode *in = f->inode.get();
11fdf7f2 8687 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
7c673cae 8688
b32b8144
FG
8689 in->unset_deleg(f);
8690
7c673cae
FG
8691 if (in->snapid == CEPH_NOSNAP) {
8692 if (in->put_open_ref(f->mode)) {
8693 _flush(in, new C_Client_FlushComplete(this, in));
8694 check_caps(in, 0);
8695 }
8696 } else {
11fdf7f2 8697 ceph_assert(in->snap_cap_refs > 0);
7c673cae
FG
8698 in->snap_cap_refs--;
8699 }
8700
8701 _release_filelocks(f);
8702
8703 // Finally, read any async err (i.e. from flushes)
8704 int err = f->take_async_err();
8705 if (err != 0) {
11fdf7f2 8706 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
7c673cae
FG
8707 << cpp_strerror(err) << dendl;
8708 } else {
11fdf7f2 8709 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
7c673cae
FG
8710 }
8711
8712 _put_fh(f);
8713
8714 return err;
8715}
8716
8717void Client::_put_fh(Fh *f)
8718{
8719 int left = f->put();
8720 if (!left) {
8721 delete f;
8722 }
8723}
8724
8725int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8726 const UserPerm& perms)
8727{
8728 if (in->snapid != CEPH_NOSNAP &&
8729 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8730 return -EROFS;
8731 }
8732
8733 // use normalized flags to generate cmode
11fdf7f2
TL
8734 int cflags = ceph_flags_sys2wire(flags);
8735 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8736 cflags |= CEPH_O_LAZY;
8737
8738 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
8739 int want = ceph_caps_for_mode(cmode);
8740 int result = 0;
8741
8742 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
8743
b32b8144 8744 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
8745 // update wanted?
8746 check_caps(in, CHECK_CAPS_NODELAY);
8747 } else {
b32b8144 8748
7c673cae
FG
8749 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8750 filepath path;
8751 in->make_nosnap_relative_path(path);
8752 req->set_filepath(path);
11fdf7f2 8753 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
8754 req->head.args.open.mode = mode;
8755 req->head.args.open.pool = -1;
8756 if (cct->_conf->client_debug_getattr_caps)
8757 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8758 else
8759 req->head.args.open.mask = 0;
8760 req->head.args.open.old_size = in->size; // for O_TRUNC
8761 req->set_inode(in);
8762 result = make_request(req, perms);
b32b8144
FG
8763
8764 /*
8765 * NFS expects that delegations will be broken on a conflicting open,
8766 * not just when there is actual conflicting access to the file. SMB leases
8767 * and oplocks also have similar semantics.
8768 *
8769 * Ensure that clients that have delegations enabled will wait on minimal
8770 * caps during open, just to ensure that other clients holding delegations
8771 * return theirs first.
8772 */
8773 if (deleg_timeout && result == 0) {
8774 int need = 0, have;
8775
8776 if (cmode & CEPH_FILE_MODE_WR)
8777 need |= CEPH_CAP_FILE_WR;
8778 if (cmode & CEPH_FILE_MODE_RD)
8779 need |= CEPH_CAP_FILE_RD;
8780
8781 result = get_caps(in, need, want, &have, -1);
8782 if (result < 0) {
1adf2230 8783 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
8784 " . Denying open: " <<
8785 cpp_strerror(result) << dendl;
8786 in->put_open_ref(cmode);
8787 } else {
8788 put_cap_ref(in, need);
8789 }
8790 }
7c673cae
FG
8791 }
8792
8793 // success?
8794 if (result >= 0) {
8795 if (fhp)
8796 *fhp = _create_fh(in, flags, cmode, perms);
8797 } else {
8798 in->put_open_ref(cmode);
8799 }
8800
8801 trim_cache();
8802
8803 return result;
8804}
8805
8806int Client::_renew_caps(Inode *in)
8807{
8808 int wanted = in->caps_file_wanted();
8809 if (in->is_any_caps() &&
8810 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8811 check_caps(in, CHECK_CAPS_NODELAY);
8812 return 0;
8813 }
8814
8815 int flags = 0;
8816 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8817 flags = O_RDWR;
8818 else if (wanted & CEPH_CAP_FILE_RD)
8819 flags = O_RDONLY;
8820 else if (wanted & CEPH_CAP_FILE_WR)
8821 flags = O_WRONLY;
8822
8823 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8824 filepath path;
8825 in->make_nosnap_relative_path(path);
8826 req->set_filepath(path);
8827 req->head.args.open.flags = flags;
8828 req->head.args.open.pool = -1;
8829 if (cct->_conf->client_debug_getattr_caps)
8830 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8831 else
8832 req->head.args.open.mask = 0;
8833 req->set_inode(in);
8834
8835 // duplicate in case Cap goes away; not sure if that race is a concern?
8836 const UserPerm *pperm = in->get_best_perms();
8837 UserPerm perms;
8838 if (pperm != NULL)
8839 perms = *pperm;
8840 int ret = make_request(req, perms);
8841 return ret;
8842}
8843
8844int Client::close(int fd)
8845{
8846 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
11fdf7f2 8847 std::lock_guard lock(client_lock);
7c673cae
FG
8848 tout(cct) << "close" << std::endl;
8849 tout(cct) << fd << std::endl;
8850
181888fb
FG
8851 if (unmounting)
8852 return -ENOTCONN;
8853
7c673cae
FG
8854 Fh *fh = get_filehandle(fd);
8855 if (!fh)
8856 return -EBADF;
8857 int err = _release_fh(fh);
8858 fd_map.erase(fd);
8859 put_fd(fd);
8860 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8861 return err;
8862}
8863
8864
8865// ------------
8866// read, write
8867
8868loff_t Client::lseek(int fd, loff_t offset, int whence)
8869{
11fdf7f2 8870 std::lock_guard lock(client_lock);
7c673cae
FG
8871 tout(cct) << "lseek" << std::endl;
8872 tout(cct) << fd << std::endl;
8873 tout(cct) << offset << std::endl;
8874 tout(cct) << whence << std::endl;
8875
181888fb
FG
8876 if (unmounting)
8877 return -ENOTCONN;
8878
7c673cae
FG
8879 Fh *f = get_filehandle(fd);
8880 if (!f)
8881 return -EBADF;
8882#if defined(__linux__) && defined(O_PATH)
8883 if (f->flags & O_PATH)
8884 return -EBADF;
8885#endif
8886 return _lseek(f, offset, whence);
8887}
8888
8889loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8890{
8891 Inode *in = f->inode.get();
8892 int r;
11fdf7f2 8893 loff_t pos = -1;
7c673cae
FG
8894
8895 switch (whence) {
8896 case SEEK_SET:
11fdf7f2 8897 pos = offset;
7c673cae
FG
8898 break;
8899
8900 case SEEK_CUR:
11fdf7f2 8901 pos += offset;
7c673cae
FG
8902 break;
8903
8904 case SEEK_END:
8905 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8906 if (r < 0)
8907 return r;
11fdf7f2 8908 pos = in->size + offset;
7c673cae
FG
8909 break;
8910
8911 default:
8912 ceph_abort();
8913 }
8914
11fdf7f2
TL
8915 if (pos < 0) {
8916 return -EINVAL;
8917 } else {
8918 f->pos = pos;
8919 }
8920
1adf2230 8921 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8922 return f->pos;
8923}
8924
8925
8926void Client::lock_fh_pos(Fh *f)
8927{
11fdf7f2 8928 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8929
8930 if (f->pos_locked || !f->pos_waiters.empty()) {
8931 Cond cond;
8932 f->pos_waiters.push_back(&cond);
11fdf7f2 8933 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
7c673cae
FG
8934 while (f->pos_locked || f->pos_waiters.front() != &cond)
8935 cond.Wait(client_lock);
11fdf7f2
TL
8936 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
8937 ceph_assert(f->pos_waiters.front() == &cond);
7c673cae
FG
8938 f->pos_waiters.pop_front();
8939 }
8940
8941 f->pos_locked = true;
8942}
8943
8944void Client::unlock_fh_pos(Fh *f)
8945{
11fdf7f2 8946 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8947 f->pos_locked = false;
8948}
8949
// Migrate an inode's inline data out to its first RADOS object so that
// normal object I/O can proceed.  Completion (or failure) is reported
// asynchronously through 'onfinish'.  Returns 0 immediately when there
// is no inline data to move (completing 'onfinish' synchronously).
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // Object name of the file's first block: "<ino in hex>.00000000".
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // First op: make sure the object exists (non-exclusive create).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Second op: guarded write.  cmpxattr(GT) makes the write a no-op if
  // the object already carries data from an equal-or-newer inline
  // version (e.g. a concurrent uninline), then stamp the version we
  // migrated.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8994
8995//
8996
8997// blocking osd interface
8998
// read(2)-like entry point: read up to 'size' bytes at 'offset' from
// descriptor 'fd' into 'buf'.  Returns bytes read, or a negative errno
// (-ENOTCONN while unmounting, -EBADF for a bad or O_PATH fd).
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors cannot be used for I/O.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // Flatten the gathered bufferlist into the caller's buffer.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9028
9029int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9030{
9031 if (iovcnt < 0)
9032 return -EINVAL;
9033 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9034}
9035
// Core read path.  Resolves a negative 'offset' to the handle's current
// position (taking the fh pos lock), serves inline data directly when
// present, and otherwise routes through the object cacher
// (_read_async) or the synchronous OSD path (_read_sync), retrying
// after a size re-check on a suspected-EOF short read.  Returns bytes
// read or a negative errno.  client_lock must be held on entry; it is
// dropped temporarily while waiting for uninline completion.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // Negative offset means "read at the current file position".
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know whether the file has
  // inline data; fetch that state from the MDS first.
  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without the cache cap; push the data out
      // to RADOS and fall through to the normal read paths below.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read straight from the in-memory inline data,
      // zero-filling any gap between the inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // Cached path.  O_RSYNC forces dirty data in range to disk first.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read that may be EOF: drop caps, re-fetch the size from
      // the MDS, and retry if the file is actually longer.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait (outside client_lock) for the uninline migration; on success
    // or benign cancellation, drop the local inline copy.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9175
// Readahead completion context: pins the file handle (f->get()) and
// counts one outstanding readahead until the context is destroyed.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9181
// Balance the constructor: drop the pending-readahead count and release
// the pinned file handle.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9186
// Runs when the background readahead I/O completes; releases the cap
// references taken when the readahead was issued in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9191
// Read through the object cacher (possibly blocking on a cache miss)
// and then kick off an asynchronous readahead for the predicted next
// extent.  The read is trimmed to the known file size.  Returns bytes
// read (0 at/past EOF) or a negative error from the cacher.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: hold a cache cap ref and wait for the fill, dropping
    // client_lock while blocked.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: we only want the data pulled into the cache.
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	// These cap refs are released by C_Readahead::finish().
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9246
9247int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9248 bool *checkeof)
9249{
9250 Inode *in = f->inode.get();
9251 uint64_t pos = off;
9252 int left = len;
9253 int read = 0;
9254
11fdf7f2 9255 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
9256
9257 Mutex flock("Client::_read_sync flock");
9258 Cond cond;
9259 while (left > 0) {
11fdf7f2 9260 C_SaferCond onfinish("Client::_read_sync flock");
7c673cae
FG
9261 bufferlist tbl;
9262
9263 int wanted = left;
9264 filer->read_trunc(in->ino, &in->layout, in->snapid,
9265 pos, left, &tbl, 0,
9266 in->truncate_size, in->truncate_seq,
11fdf7f2 9267 &onfinish);
7c673cae 9268 client_lock.Unlock();
11fdf7f2 9269 int r = onfinish.wait();
7c673cae
FG
9270 client_lock.Lock();
9271
9272 // if we get ENOENT from OSD, assume 0 bytes returned
9273 if (r == -ENOENT)
9274 r = 0;
9275 if (r < 0)
9276 return r;
9277 if (tbl.length()) {
9278 r = tbl.length();
9279
9280 read += r;
9281 pos += r;
9282 left -= r;
9283 bl->claim_append(tbl);
9284 }
9285 // short read?
9286 if (r >= 0 && r < wanted) {
9287 if (pos < in->size) {
9288 // zero up to known EOF
9289 int64_t some = in->size - pos;
9290 if (some > left)
9291 some = left;
11fdf7f2
TL
9292 auto z = buffer::ptr_node::create(some);
9293 z->zero();
9294 bl->push_back(std::move(z));
7c673cae
FG
9295 read += some;
9296 pos += some;
9297 left -= some;
9298 if (left == 0)
9299 return read;
9300 }
9301
9302 *checkeof = true;
9303 return read;
9304 }
9305 }
9306 return read;
9307}
9308
9309
9310/*
9311 * we keep count of uncommitted sync writes on the inode, so that
9312 * fsync can DDRT.
9313 */
// Completion bookkeeping for a synchronous write: decrement the
// outstanding unsafe-write count, drop the buffer cap ref taken when
// the write was issued, and wake an unmount that may be waiting for the
// last unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9327
// write(2)-like entry point: write 'size' bytes from 'buf' to fd at
// 'offset'.  Returns bytes written or a negative errno (-ENOTCONN while
// unmounting, -EBADF for a bad or O_PATH fd).
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9352
9353int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9354{
9355 if (iovcnt < 0)
9356 return -EINVAL;
9357 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9358}
9359
11fdf7f2
TL
// Core of preadv/pwritev: with client_lock already held, perform a
// vectored read or write covering the combined iovec length at
// 'offset'.  When 'clamp_to_int' is set the total is capped at INT_MAX
// so the result fits the narrower return types of some public entry
// points.  Returns bytes transferred or a negative errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset, bool write,
				       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write gathers directly from the iovecs (buf == NULL).
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the read bufferlist back out into the caller's iovecs.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9410
11fdf7f2
TL
// Locked entry point shared by preadv() and pwritev(): validates the
// fd, then defers to _preadv_pwritev_locked() with INT_MAX clamping
// enabled (public API returns a 32-bit count).
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
9425
// Core write path.  Data comes either from 'buf' or, when buf is NULL,
// gathered from 'iov'/'iovcnt'.  A negative 'offset' means "write at
// the handle's current position" (honoring O_APPEND).  Handles quota
// and max-file-size limits, inline data (updating it in place or
// migrating it out), the buffered object-cacher path, and the
// synchronous OSD path.  Returns bytes written or a negative errno.
// client_lock must be held; it is dropped around blocking waits.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  // New fd position to install on success (0 means "don't move pos").
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means the inline state is unknown; fetch it.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses buffering even if we hold buffer caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
	endoff > CEPH_INLINE_MAX_SIZE ||
	!(have & CEPH_CAP_FILE_BUFFER)) {
      // Result would no longer fit inline (or we lack the buffer cap):
      // migrate the inline data out, then write via the normal paths.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Splice the new bytes into the in-memory inline data:
      // keep the tail past endoff, drop/extend the overwritten range,
      // then append the new data (plus any preserved tail in 'bl').
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // Install the advanced fd position computed for the offset<0 case.
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait (outside client_lock) for the uninline migration; on success
    // or benign cancellation, drop the local inline copy.
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9656
9657int Client::_flush(Fh *f)
9658{
9659 Inode *in = f->inode.get();
9660 int err = f->take_async_err();
9661 if (err != 0) {
9662 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9663 << cpp_strerror(err) << dendl;
9664 } else {
9665 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9666 }
9667
9668 return err;
9669}
9670
// Truncate the file at 'relpath' to 'length' bytes via a size-only
// setattrx.  Only stx_size is initialized; the CEPH_SETATTR_SIZE mask
// ensures no other (uninitialized) statx field is consulted.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9677
// ftruncate(2): resize the file behind an open descriptor to 'length'
// bytes.  Rejects O_PATH handles; returns -ENOTCONN while unmounting.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  // Only st_size is read by _setattr because of the SIZE mask.
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9699
// fsync(2)-like entry point: flush data (and metadata unless
// 'syncdataonly') for fd, folding in any asynchronous write error that
// was recorded on the handle in the background.
int Client::fsync(int fd, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9735
// Flush an inode's dirty data -- via the object cacher when enabled,
// otherwise by waiting for in-flight buffered writes to drain -- and,
// unless 'syncdataonly', also flush dirty caps and wait for unsafe MDS
// requests to become safe.  Blocks until durable; returns 0 or the
// first writeback error.  client_lock must be held; it is dropped while
// waiting on the cacher.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Kick a synchronous cap flush and remember the tid to wait for.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Ask the MDS to flush its log, then wait for the newest unsafe
    // request on this inode to become safe (implies the older ones).
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9802
// Convenience overload: fsync the inode behind an open file handle.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9808
// fstat(2): refresh the inode attributes selected by 'mask' (issuing a
// getattr to the MDS) and fill the caller's struct stat.
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9828
// statx-style fstat: only round-trips to the MDS when the caps we
// already hold do not cover the requested statx mask.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // Skip the getattr when our issued caps already satisfy the mask.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9858
9859// not written yet, but i want to link!
9860
// Change the client's working directory to 'relpath' and report the
// resulting absolute cwd through 'new_cwd'.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9883
// Rebuild the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root, issuing an MDS
// LOOKUPNAME for any parent link we do not have cached.  Leaves 'dir'
// untouched if the cwd or an ancestor has been unlinked.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over from cwd: the lookup may have populated any number
      // of the missing parent links.
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9923
b5b8bbf5
FG
9924void Client::getcwd(string& dir, const UserPerm& perms)
9925{
11fdf7f2 9926 std::lock_guard l(client_lock);
181888fb
FG
9927 if (!unmounting)
9928 _getcwd(dir, perms);
b5b8bbf5
FG
9929}
9930
7c673cae
FG
9931int Client::statfs(const char *path, struct statvfs *stbuf,
9932 const UserPerm& perms)
9933{
11fdf7f2
TL
9934 std::lock_guard l(client_lock);
9935 tout(cct) << __func__ << std::endl;
91327a77 9936 unsigned long int total_files_on_fs;
7c673cae 9937
181888fb
FG
9938 if (unmounting)
9939 return -ENOTCONN;
9940
7c673cae
FG
9941 ceph_statfs stats;
9942 C_SaferCond cond;
d2e6a577
FG
9943
9944 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9945 if (data_pools.size() == 1) {
9946 objecter->get_fs_stats(stats, data_pools[0], &cond);
9947 } else {
9948 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9949 }
7c673cae
FG
9950
9951 client_lock.Unlock();
9952 int rval = cond.wait();
91327a77
AA
9953 assert(root);
9954 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9955 client_lock.Lock();
9956
9957 if (rval < 0) {
9958 ldout(cct, 1) << "underlying call to statfs returned error: "
9959 << cpp_strerror(rval)
9960 << dendl;
9961 return rval;
9962 }
9963
9964 memset(stbuf, 0, sizeof(*stbuf));
9965
9966 /*
9967 * we're going to set a block size of 4MB so we can represent larger
9968 * FSes without overflowing. Additionally convert the space
9969 * measurements from KB to bytes while making them in terms of
9970 * blocks. We use 4MB only because it is big enough, and because it
9971 * actually *is* the (ceph) default block size.
9972 */
9973 const int CEPH_BLOCK_SHIFT = 22;
9974 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9975 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9976 stbuf->f_files = total_files_on_fs;
9977 stbuf->f_ffree = 0;
7c673cae
FG
9978 stbuf->f_favail = -1;
9979 stbuf->f_fsid = -1; // ??
9980 stbuf->f_flag = 0; // ??
9981 stbuf->f_namemax = NAME_MAX;
9982
9983 // Usually quota_root will == root_ancestor, but if the mount root has no
9984 // quota but we can see a parent of it that does have a quota, we'll
9985 // respect that one instead.
11fdf7f2 9986 ceph_assert(root != nullptr);
7c673cae
FG
9987 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9988
9989 // get_quota_root should always give us something
9990 // because client quotas are always enabled
11fdf7f2 9991 ceph_assert(quota_root != nullptr);
7c673cae
FG
9992
9993 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9994
9995 // Skip the getattr if any sessions are stale, as we don't want to
9996 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9997 // is unhealthy.
9998 if (!_any_stale_sessions()) {
9999 int r = _getattr(quota_root, 0, perms, true);
10000 if (r != 0) {
10001 // Ignore return value: error getting latest inode metadata is not a good
10002 // reason to break "df".
10003 lderr(cct) << "Error in getattr on quota root 0x"
10004 << std::hex << quota_root->ino << std::dec
10005 << " statfs result may be outdated" << dendl;
10006 }
10007 }
10008
10009 // Special case: if there is a size quota set on the Inode acting
10010 // as the root for this client mount, then report the quota status
10011 // as the filesystem statistics.
10012 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10013 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10014 // It is possible for a quota to be exceeded: arithmetic here must
10015 // handle case where used > total.
10016 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10017
10018 stbuf->f_blocks = total;
10019 stbuf->f_bfree = free;
10020 stbuf->f_bavail = free;
10021 } else {
d2e6a577 10022 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10023 // multiple pools may be used without one filesystem namespace via
10024 // layouts, this is the most correct thing we can do.
10025 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10026 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10027 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10028 }
10029
10030 return rval;
10031}
10032
10033int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10034 struct flock *fl, uint64_t owner, bool removing)
10035{
11fdf7f2 10036 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10037 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10038 << " type " << fl->l_type << " owner " << owner
10039 << " " << fl->l_start << "~" << fl->l_len << dendl;
10040
10041 int lock_cmd;
10042 if (F_RDLCK == fl->l_type)
10043 lock_cmd = CEPH_LOCK_SHARED;
10044 else if (F_WRLCK == fl->l_type)
10045 lock_cmd = CEPH_LOCK_EXCL;
10046 else if (F_UNLCK == fl->l_type)
10047 lock_cmd = CEPH_LOCK_UNLOCK;
10048 else
10049 return -EIO;
10050
10051 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10052 sleep = 0;
10053
10054 /*
10055 * Set the most significant bit, so that MDS knows the 'owner'
10056 * is sufficient to identify the owner of lock. (old code uses
10057 * both 'owner' and 'pid')
10058 */
10059 owner |= (1ULL << 63);
10060
10061 MetaRequest *req = new MetaRequest(op);
10062 filepath path;
10063 in->make_nosnap_relative_path(path);
10064 req->set_filepath(path);
10065 req->set_inode(in);
10066
10067 req->head.args.filelock_change.rule = lock_type;
10068 req->head.args.filelock_change.type = lock_cmd;
10069 req->head.args.filelock_change.owner = owner;
10070 req->head.args.filelock_change.pid = fl->l_pid;
10071 req->head.args.filelock_change.start = fl->l_start;
10072 req->head.args.filelock_change.length = fl->l_len;
10073 req->head.args.filelock_change.wait = sleep;
10074
10075 int ret;
10076 bufferlist bl;
10077
10078 if (sleep && switch_interrupt_cb) {
10079 // enable interrupt
10080 switch_interrupt_cb(callback_handle, req->get());
10081 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10082 // disable interrupt
10083 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10084 if (ret == 0 && req->aborted()) {
10085 // effect of this lock request has been revoked by the 'lock intr' request
10086 ret = req->get_abort_code();
10087 }
7c673cae
FG
10088 put_request(req);
10089 } else {
10090 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10091 }
10092
10093 if (ret == 0) {
10094 if (op == CEPH_MDS_OP_GETFILELOCK) {
10095 ceph_filelock filelock;
11fdf7f2
TL
10096 auto p = bl.cbegin();
10097 decode(filelock, p);
7c673cae
FG
10098
10099 if (CEPH_LOCK_SHARED == filelock.type)
10100 fl->l_type = F_RDLCK;
10101 else if (CEPH_LOCK_EXCL == filelock.type)
10102 fl->l_type = F_WRLCK;
10103 else
10104 fl->l_type = F_UNLCK;
10105
10106 fl->l_whence = SEEK_SET;
10107 fl->l_start = filelock.start;
10108 fl->l_len = filelock.length;
10109 fl->l_pid = filelock.pid;
10110 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10111 ceph_lock_state_t *lock_state;
10112 if (lock_type == CEPH_LOCK_FCNTL) {
10113 if (!in->fcntl_locks)
11fdf7f2
TL
10114 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10115 lock_state = in->fcntl_locks.get();
7c673cae
FG
10116 } else if (lock_type == CEPH_LOCK_FLOCK) {
10117 if (!in->flock_locks)
11fdf7f2
TL
10118 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10119 lock_state = in->flock_locks.get();
7c673cae
FG
10120 } else {
10121 ceph_abort();
10122 return -EINVAL;
10123 }
10124 _update_lock_state(fl, owner, lock_state);
10125
10126 if (!removing) {
10127 if (lock_type == CEPH_LOCK_FCNTL) {
10128 if (!fh->fcntl_locks)
11fdf7f2
TL
10129 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10130 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10131 } else {
10132 if (!fh->flock_locks)
11fdf7f2
TL
10133 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10134 lock_state = fh->flock_locks.get();
7c673cae
FG
10135 }
10136 _update_lock_state(fl, owner, lock_state);
10137 }
10138 } else
10139 ceph_abort();
10140 }
10141 return ret;
10142}
10143
10144int Client::_interrupt_filelock(MetaRequest *req)
10145{
31f18b77
FG
10146 // Set abort code, but do not kick. The abort code prevents the request
10147 // from being re-sent.
10148 req->abort(-EINTR);
10149 if (req->mds < 0)
10150 return 0; // haven't sent the request
10151
7c673cae
FG
10152 Inode *in = req->inode();
10153
10154 int lock_type;
10155 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10156 lock_type = CEPH_LOCK_FLOCK_INTR;
10157 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10158 lock_type = CEPH_LOCK_FCNTL_INTR;
10159 else {
10160 ceph_abort();
10161 return -EINVAL;
10162 }
10163
10164 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10165 filepath path;
10166 in->make_nosnap_relative_path(path);
10167 intr_req->set_filepath(path);
10168 intr_req->set_inode(in);
10169 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10170 intr_req->head.args.filelock_change.rule = lock_type;
10171 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10172
10173 UserPerm perms(req->get_uid(), req->get_gid());
10174 return make_request(intr_req, perms, NULL, NULL, -1);
10175}
10176
10177void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10178{
10179 if (!in->fcntl_locks && !in->flock_locks)
10180 return;
10181
10182 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10183 encode(nr_fcntl_locks, bl);
7c673cae 10184 if (nr_fcntl_locks) {
11fdf7f2 10185 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10186 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10187 p != lock_state->held_locks.end();
10188 ++p)
11fdf7f2 10189 encode(p->second, bl);
7c673cae
FG
10190 }
10191
10192 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10193 encode(nr_flock_locks, bl);
7c673cae 10194 if (nr_flock_locks) {
11fdf7f2 10195 auto &lock_state = in->flock_locks;
7c673cae
FG
10196 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10197 p != lock_state->held_locks.end();
10198 ++p)
11fdf7f2 10199 encode(p->second, bl);
7c673cae
FG
10200 }
10201
11fdf7f2 10202 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10203 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10204}
10205
10206void Client::_release_filelocks(Fh *fh)
10207{
10208 if (!fh->fcntl_locks && !fh->flock_locks)
10209 return;
10210
10211 Inode *in = fh->inode.get();
11fdf7f2 10212 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10213
10214 list<pair<int, ceph_filelock> > to_release;
10215
10216 if (fh->fcntl_locks) {
11fdf7f2 10217 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10218 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10219 p != lock_state->held_locks.end();
10220 ++p)
10221 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10222 lock_state.reset();
7c673cae
FG
10223 }
10224 if (fh->flock_locks) {
11fdf7f2 10225 auto &lock_state = fh->flock_locks;
7c673cae
FG
10226 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10227 p != lock_state->held_locks.end();
10228 ++p)
10229 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10230 lock_state.reset();
7c673cae
FG
10231 }
10232
10233 if (to_release.empty())
10234 return;
10235
11fdf7f2
TL
10236 // mds has already released filelocks if session was closed.
10237 if (in->caps.empty())
10238 return;
10239
7c673cae
FG
10240 struct flock fl;
10241 memset(&fl, 0, sizeof(fl));
10242 fl.l_whence = SEEK_SET;
10243 fl.l_type = F_UNLCK;
10244
10245 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10246 p != to_release.end();
10247 ++p) {
10248 fl.l_start = p->second.start;
10249 fl.l_len = p->second.length;
10250 fl.l_pid = p->second.pid;
10251 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10252 p->second.owner, true);
10253 }
10254}
10255
10256void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10257 ceph_lock_state_t *lock_state)
10258{
10259 int lock_cmd;
10260 if (F_RDLCK == fl->l_type)
10261 lock_cmd = CEPH_LOCK_SHARED;
10262 else if (F_WRLCK == fl->l_type)
10263 lock_cmd = CEPH_LOCK_EXCL;
10264 else
10265 lock_cmd = CEPH_LOCK_UNLOCK;;
10266
10267 ceph_filelock filelock;
10268 filelock.start = fl->l_start;
10269 filelock.length = fl->l_len;
10270 filelock.client = 0;
10271 // see comment in _do_filelock()
10272 filelock.owner = owner | (1ULL << 63);
10273 filelock.pid = fl->l_pid;
10274 filelock.type = lock_cmd;
10275
10276 if (filelock.type == CEPH_LOCK_UNLOCK) {
10277 list<ceph_filelock> activated_locks;
10278 lock_state->remove_lock(filelock, activated_locks);
10279 } else {
10280 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10281 ceph_assert(r);
7c673cae
FG
10282 }
10283}
10284
10285int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10286{
10287 Inode *in = fh->inode.get();
10288 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10289 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10290 return ret;
10291}
10292
10293int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10294{
10295 Inode *in = fh->inode.get();
10296 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10297 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10298 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10299 return ret;
10300}
10301
10302int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10303{
10304 Inode *in = fh->inode.get();
10305 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10306
10307 int sleep = !(cmd & LOCK_NB);
10308 cmd &= ~LOCK_NB;
10309
10310 int type;
10311 switch (cmd) {
10312 case LOCK_SH:
10313 type = F_RDLCK;
10314 break;
10315 case LOCK_EX:
10316 type = F_WRLCK;
10317 break;
10318 case LOCK_UN:
10319 type = F_UNLCK;
10320 break;
10321 default:
10322 return -EINVAL;
10323 }
10324
10325 struct flock fl;
10326 memset(&fl, 0, sizeof(fl));
10327 fl.l_type = type;
10328 fl.l_whence = SEEK_SET;
10329
10330 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10331 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10332 return ret;
10333}
10334
10335int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10336{
10337 /* Since the only thing this does is wrap a call to statfs, and
10338 statfs takes a lock, it doesn't seem we have a need to split it
10339 out. */
10340 return statfs(0, stbuf, perms);
10341}
10342
10343void Client::ll_register_callbacks(struct client_callback_args *args)
10344{
10345 if (!args)
10346 return;
11fdf7f2
TL
10347 std::lock_guard l(client_lock);
10348 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
10349 << " invalidate_ino_cb " << args->ino_cb
10350 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
10351 << " switch_interrupt_cb " << args->switch_intr_cb
10352 << " remount_cb " << args->remount_cb
10353 << dendl;
10354 callback_handle = args->handle;
10355 if (args->ino_cb) {
10356 ino_invalidate_cb = args->ino_cb;
10357 async_ino_invalidator.start();
10358 }
10359 if (args->dentry_cb) {
10360 dentry_invalidate_cb = args->dentry_cb;
10361 async_dentry_invalidator.start();
10362 }
10363 if (args->switch_intr_cb) {
10364 switch_interrupt_cb = args->switch_intr_cb;
10365 interrupt_finisher.start();
10366 }
10367 if (args->remount_cb) {
10368 remount_cb = args->remount_cb;
10369 remount_finisher.start();
10370 }
7c673cae
FG
10371 umask_cb = args->umask_cb;
10372}
10373
10374int Client::test_dentry_handling(bool can_invalidate)
10375{
10376 int r = 0;
10377
10378 can_invalidate_dentries = can_invalidate;
10379
10380 if (can_invalidate_dentries) {
11fdf7f2 10381 ceph_assert(dentry_invalidate_cb);
7c673cae 10382 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10383 r = 0;
11fdf7f2
TL
10384 } else {
10385 ceph_assert(remount_cb);
7c673cae 10386 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10387 r = _do_remount(false);
b32b8144 10388 }
11fdf7f2 10389
7c673cae
FG
10390 return r;
10391}
10392
10393int Client::_sync_fs()
10394{
11fdf7f2 10395 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10396
10397 // flush file data
11fdf7f2
TL
10398 std::unique_ptr<C_SaferCond> cond = nullptr;
10399 if (cct->_conf->client_oc) {
10400 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10401 objectcacher->flush_all(cond.get());
10402 }
7c673cae
FG
10403
10404 // flush caps
10405 flush_caps_sync();
10406 ceph_tid_t flush_tid = last_flush_tid;
10407
10408 // wait for unsafe mds requests
10409 wait_unsafe_requests();
10410
10411 wait_sync_caps(flush_tid);
10412
11fdf7f2 10413 if (nullptr != cond) {
7c673cae 10414 client_lock.Unlock();
11fdf7f2
TL
10415 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10416 cond->wait();
10417 ldout(cct, 15) << __func__ << " flush finished" << dendl;
7c673cae
FG
10418 client_lock.Lock();
10419 }
10420
10421 return 0;
10422}
10423
10424int Client::sync_fs()
10425{
11fdf7f2 10426 std::lock_guard l(client_lock);
181888fb
FG
10427
10428 if (unmounting)
10429 return -ENOTCONN;
10430
7c673cae
FG
10431 return _sync_fs();
10432}
10433
10434int64_t Client::drop_caches()
10435{
11fdf7f2 10436 std::lock_guard l(client_lock);
7c673cae
FG
10437 return objectcacher->release_all();
10438}
10439
11fdf7f2
TL
10440int Client::_lazyio(Fh *fh, int enable)
10441{
10442 Inode *in = fh->inode.get();
10443 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10444
10445 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10446 return 0;
10447
10448 int orig_mode = fh->mode;
10449 if (enable) {
10450 fh->mode |= CEPH_FILE_MODE_LAZY;
10451 in->get_open_ref(fh->mode);
10452 in->put_open_ref(orig_mode);
10453 check_caps(in, CHECK_CAPS_NODELAY);
10454 } else {
10455 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10456 in->get_open_ref(fh->mode);
10457 in->put_open_ref(orig_mode);
10458 check_caps(in, 0);
10459 }
10460
10461 return 0;
10462}
10463
10464int Client::lazyio(int fd, int enable)
10465{
10466 std::lock_guard l(client_lock);
10467 Fh *f = get_filehandle(fd);
10468 if (!f)
10469 return -EBADF;
10470
10471 return _lazyio(f, enable);
10472}
10473
10474int Client::ll_lazyio(Fh *fh, int enable)
10475{
10476 std::lock_guard lock(client_lock);
10477 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10478 tout(cct) << __func__ << std::endl;
10479
10480 return _lazyio(fh, enable);
10481}
7c673cae
FG
10482
10483int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10484{
11fdf7f2 10485 std::lock_guard l(client_lock);
7c673cae
FG
10486 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10487 << ", " << offset << ", " << count << ")" << dendl;
10488
10489 Fh *f = get_filehandle(fd);
10490 if (!f)
10491 return -EBADF;
10492
10493 // for now
10494 _fsync(f, true);
10495
10496 return 0;
10497}
10498
10499int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10500{
11fdf7f2 10501 std::lock_guard l(client_lock);
7c673cae
FG
10502 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10503 << ", " << offset << ", " << count << ")" << dendl;
10504
10505 Fh *f = get_filehandle(fd);
10506 if (!f)
10507 return -EBADF;
10508 Inode *in = f->inode.get();
10509
10510 _fsync(f, true);
10511 if (_release(in))
10512 check_caps(in, 0);
10513 return 0;
10514}
10515
10516
10517// =============================
10518// snaps
10519
10520int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10521{
11fdf7f2 10522 std::lock_guard l(client_lock);
181888fb
FG
10523
10524 if (unmounting)
10525 return -ENOTCONN;
10526
7c673cae
FG
10527 filepath path(relpath);
10528 InodeRef in;
10529 int r = path_walk(path, &in, perm);
10530 if (r < 0)
10531 return r;
10532 if (cct->_conf->client_permissions) {
10533 r = may_create(in.get(), perm);
10534 if (r < 0)
10535 return r;
10536 }
10537 Inode *snapdir = open_snapdir(in.get());
10538 return _mkdir(snapdir, name, 0, perm);
10539}
181888fb 10540
7c673cae
FG
10541int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10542{
11fdf7f2 10543 std::lock_guard l(client_lock);
181888fb
FG
10544
10545 if (unmounting)
10546 return -ENOTCONN;
10547
7c673cae
FG
10548 filepath path(relpath);
10549 InodeRef in;
10550 int r = path_walk(path, &in, perms);
10551 if (r < 0)
10552 return r;
10553 if (cct->_conf->client_permissions) {
10554 r = may_delete(in.get(), NULL, perms);
10555 if (r < 0)
10556 return r;
10557 }
10558 Inode *snapdir = open_snapdir(in.get());
10559 return _rmdir(snapdir, name, perms);
10560}
10561
10562// =============================
10563// expose caps
10564
10565int Client::get_caps_issued(int fd) {
10566
11fdf7f2 10567 std::lock_guard lock(client_lock);
7c673cae 10568
181888fb
FG
10569 if (unmounting)
10570 return -ENOTCONN;
10571
7c673cae
FG
10572 Fh *f = get_filehandle(fd);
10573 if (!f)
10574 return -EBADF;
10575
10576 return f->inode->caps_issued();
10577}
10578
10579int Client::get_caps_issued(const char *path, const UserPerm& perms)
10580{
11fdf7f2 10581 std::lock_guard lock(client_lock);
181888fb
FG
10582
10583 if (unmounting)
10584 return -ENOTCONN;
10585
7c673cae
FG
10586 filepath p(path);
10587 InodeRef in;
10588 int r = path_walk(p, &in, perms, true);
10589 if (r < 0)
10590 return r;
10591 return in->caps_issued();
10592}
10593
10594// =========================================
10595// low level
10596
10597Inode *Client::open_snapdir(Inode *diri)
10598{
10599 Inode *in;
10600 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10601 if (!inode_map.count(vino)) {
10602 in = new Inode(this, vino, &diri->layout);
10603
10604 in->ino = diri->ino;
10605 in->snapid = CEPH_SNAPDIR;
10606 in->mode = diri->mode;
10607 in->uid = diri->uid;
10608 in->gid = diri->gid;
10609 in->mtime = diri->mtime;
10610 in->ctime = diri->ctime;
10611 in->btime = diri->btime;
10612 in->size = diri->size;
10613 in->change_attr = diri->change_attr;
10614
10615 in->dirfragtree.clear();
10616 in->snapdir_parent = diri;
10617 diri->flags |= I_SNAPDIR_OPEN;
10618 inode_map[vino] = in;
10619 if (use_faked_inos())
10620 _assign_faked_ino(in);
10621 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10622 } else {
10623 in = inode_map[vino];
10624 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10625 }
10626 return in;
10627}
10628
10629int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10630 Inode **out, const UserPerm& perms)
10631{
11fdf7f2 10632 std::lock_guard lock(client_lock);
31f18b77 10633 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
10634 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10635 tout(cct) << __func__ << std::endl;
7c673cae
FG
10636 tout(cct) << name << std::endl;
10637
181888fb
FG
10638 if (unmounting)
10639 return -ENOTCONN;
10640
7c673cae 10641 int r = 0;
11fdf7f2
TL
10642 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10643 "fuse_default_permissions");
10644 if (!fuse_default_permissions) {
10645 if (strcmp(name, ".") && strcmp(name, "..")) {
10646 r = may_lookup(parent, perms);
10647 if (r < 0)
10648 return r;
10649 }
7c673cae
FG
10650 }
10651
10652 string dname(name);
10653 InodeRef in;
10654
10655 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10656 if (r < 0) {
10657 attr->st_ino = 0;
10658 goto out;
10659 }
10660
11fdf7f2 10661 ceph_assert(in);
7c673cae
FG
10662 fill_stat(in, attr);
10663 _ll_get(in.get());
10664
10665 out:
11fdf7f2 10666 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10667 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10668 tout(cct) << attr->st_ino << std::endl;
10669 *out = in.get();
10670 return r;
10671}
10672
1adf2230
AA
10673int Client::ll_lookup_inode(
10674 struct inodeno_t ino,
10675 const UserPerm& perms,
10676 Inode **inode)
10677{
11fdf7f2 10678 std::lock_guard lock(client_lock);
1adf2230
AA
10679 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10680
10681 // Num1: get inode and *inode
10682 int r = _lookup_ino(ino, perms, inode);
10683 if (r) {
10684 return r;
10685 }
11fdf7f2
TL
10686 ceph_assert(inode != NULL);
10687 ceph_assert(*inode != NULL);
1adf2230
AA
10688
10689 // Num2: Request the parent inode, so that we can look up the name
10690 Inode *parent;
10691 r = _lookup_parent(*inode, perms, &parent);
10692 if (r && r != -EINVAL) {
10693 // Unexpected error
10694 _ll_forget(*inode, 1);
10695 return r;
10696 } else if (r == -EINVAL) {
10697 // EINVAL indicates node without parents (root), drop out now
10698 // and don't try to look up the non-existent dentry.
10699 return 0;
10700 }
10701 // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
10702 // is already in cache
11fdf7f2 10703 ceph_assert(parent != NULL);
1adf2230
AA
10704
10705 // Num3: Finally, get the name (dentry) of the requested inode
10706 r = _lookup_name(*inode, parent, perms);
10707 if (r) {
10708 // Unexpected error
10709 _ll_forget(parent, 1);
10710 _ll_forget(*inode, 1);
10711 return r;
10712 }
10713
10714 _ll_forget(parent, 1);
10715 return 0;
10716}
10717
7c673cae
FG
10718int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10719 struct ceph_statx *stx, unsigned want, unsigned flags,
10720 const UserPerm& perms)
10721{
11fdf7f2 10722 std::lock_guard lock(client_lock);
31f18b77 10723 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 10724 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
10725 tout(cct) << "ll_lookupx" << std::endl;
10726 tout(cct) << name << std::endl;
10727
181888fb
FG
10728 if (unmounting)
10729 return -ENOTCONN;
10730
7c673cae 10731 int r = 0;
11fdf7f2
TL
10732 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10733 "fuse_default_permissions");
10734 if (!fuse_default_permissions) {
7c673cae
FG
10735 r = may_lookup(parent, perms);
10736 if (r < 0)
10737 return r;
10738 }
10739
10740 string dname(name);
10741 InodeRef in;
10742
10743 unsigned mask = statx_to_mask(flags, want);
10744 r = _lookup(parent, dname, mask, &in, perms);
10745 if (r < 0) {
10746 stx->stx_ino = 0;
10747 stx->stx_mask = 0;
10748 } else {
11fdf7f2 10749 ceph_assert(in);
7c673cae
FG
10750 fill_statx(in, mask, stx);
10751 _ll_get(in.get());
10752 }
10753
11fdf7f2 10754 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10755 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10756 tout(cct) << stx->stx_ino << std::endl;
10757 *out = in.get();
10758 return r;
10759}
10760
10761int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10762 unsigned int want, unsigned int flags, const UserPerm& perms)
10763{
11fdf7f2 10764 std::lock_guard lock(client_lock);
181888fb
FG
10765
10766 if (unmounting)
10767 return -ENOTCONN;
10768
7c673cae
FG
10769 filepath fp(name, 0);
10770 InodeRef in;
10771 int rc;
10772 unsigned mask = statx_to_mask(flags, want);
10773
11fdf7f2
TL
10774 ldout(cct, 3) << __func__ << " " << name << dendl;
10775 tout(cct) << __func__ << std::endl;
7c673cae
FG
10776 tout(cct) << name << std::endl;
10777
10778 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10779 if (rc < 0) {
10780 /* zero out mask, just in case... */
10781 stx->stx_mask = 0;
10782 stx->stx_ino = 0;
10783 *out = NULL;
10784 return rc;
10785 } else {
11fdf7f2 10786 ceph_assert(in);
7c673cae
FG
10787 fill_statx(in, mask, stx);
10788 _ll_get(in.get());
10789 *out = in.get();
10790 return 0;
10791 }
10792}
10793
10794void Client::_ll_get(Inode *in)
10795{
10796 if (in->ll_ref == 0) {
10797 in->get();
11fdf7f2
TL
10798 if (in->is_dir() && !in->dentries.empty()) {
10799 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10800 in->get_first_parent()->get(); // pin dentry
10801 }
11fdf7f2
TL
10802 if (in->snapid != CEPH_NOSNAP)
10803 ll_snap_ref[in->snapid]++;
7c673cae
FG
10804 }
10805 in->ll_get();
11fdf7f2 10806 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
10807}
10808
10809int Client::_ll_put(Inode *in, int num)
10810{
10811 in->ll_put(num);
11fdf7f2 10812 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 10813 if (in->ll_ref == 0) {
11fdf7f2
TL
10814 if (in->is_dir() && !in->dentries.empty()) {
10815 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10816 in->get_first_parent()->put(); // unpin dentry
10817 }
11fdf7f2
TL
10818 if (in->snapid != CEPH_NOSNAP) {
10819 auto p = ll_snap_ref.find(in->snapid);
10820 ceph_assert(p != ll_snap_ref.end());
10821 ceph_assert(p->second > 0);
10822 if (--p->second == 0)
10823 ll_snap_ref.erase(p);
10824 }
7c673cae
FG
10825 put_inode(in);
10826 return 0;
10827 } else {
10828 return in->ll_ref;
10829 }
10830}
10831
10832void Client::_ll_drop_pins()
10833{
11fdf7f2 10834 ldout(cct, 10) << __func__ << dendl;
1adf2230 10835 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
10836 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10837 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10838 it != inode_map.end();
10839 it = next) {
10840 Inode *in = it->second;
10841 next = it;
10842 ++next;
1adf2230
AA
10843 if (in->ll_ref){
10844 to_be_put.insert(in);
7c673cae 10845 _ll_put(in, in->ll_ref);
1adf2230 10846 }
7c673cae
FG
10847 }
10848}
10849
1adf2230 10850bool Client::_ll_forget(Inode *in, int count)
7c673cae 10851{
11fdf7f2 10852 inodeno_t ino = in->ino;
7c673cae 10853
11fdf7f2
TL
10854 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10855 tout(cct) << __func__ << std::endl;
7c673cae
FG
10856 tout(cct) << ino.val << std::endl;
10857 tout(cct) << count << std::endl;
10858
181888fb
FG
10859 // Ignore forget if we're no longer mounted
10860 if (unmounting)
10861 return true;
10862
7c673cae
FG
10863 if (ino == 1) return true; // ignore forget on root.
10864
10865 bool last = false;
10866 if (in->ll_ref < count) {
10867 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10868 << ", which only has ll_ref=" << in->ll_ref << dendl;
10869 _ll_put(in, in->ll_ref);
10870 last = true;
10871 } else {
10872 if (_ll_put(in, count) == 0)
10873 last = true;
10874 }
10875
10876 return last;
10877}
10878
1adf2230
AA
10879bool Client::ll_forget(Inode *in, int count)
10880{
11fdf7f2 10881 std::lock_guard lock(client_lock);
1adf2230
AA
10882 return _ll_forget(in, count);
10883}
10884
7c673cae
FG
10885bool Client::ll_put(Inode *in)
10886{
10887 /* ll_forget already takes the lock */
10888 return ll_forget(in, 1);
10889}
10890
11fdf7f2
TL
10891int Client::ll_get_snap_ref(snapid_t snap)
10892{
10893 std::lock_guard lock(client_lock);
10894 auto p = ll_snap_ref.find(snap);
10895 if (p != ll_snap_ref.end())
10896 return p->second;
10897 return 0;
10898}
10899
7c673cae
FG
10900snapid_t Client::ll_get_snapid(Inode *in)
10901{
11fdf7f2 10902 std::lock_guard lock(client_lock);
7c673cae
FG
10903 return in->snapid;
10904}
10905
10906Inode *Client::ll_get_inode(ino_t ino)
10907{
11fdf7f2 10908 std::lock_guard lock(client_lock);
181888fb
FG
10909
10910 if (unmounting)
10911 return NULL;
10912
7c673cae
FG
10913 vinodeno_t vino = _map_faked_ino(ino);
10914 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10915 if (p == inode_map.end())
10916 return NULL;
10917 Inode *in = p->second;
10918 _ll_get(in);
10919 return in;
10920}
10921
10922Inode *Client::ll_get_inode(vinodeno_t vino)
10923{
11fdf7f2 10924 std::lock_guard lock(client_lock);
181888fb
FG
10925
10926 if (unmounting)
10927 return NULL;
10928
7c673cae
FG
10929 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10930 if (p == inode_map.end())
10931 return NULL;
10932 Inode *in = p->second;
10933 _ll_get(in);
10934 return in;
10935}
10936
10937int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10938{
10939 vinodeno_t vino = _get_vino(in);
10940
11fdf7f2
TL
10941 ldout(cct, 8) << __func__ << " " << vino << dendl;
10942 tout(cct) << __func__ << std::endl;
7c673cae
FG
10943 tout(cct) << vino.ino.val << std::endl;
10944
10945 if (vino.snapid < CEPH_NOSNAP)
10946 return 0;
10947 else
10948 return _getattr(in, caps, perms);
10949}
10950
10951int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10952{
11fdf7f2 10953 std::lock_guard lock(client_lock);
7c673cae 10954
181888fb
FG
10955 if (unmounting)
10956 return -ENOTCONN;
10957
7c673cae
FG
10958 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10959
10960 if (res == 0)
10961 fill_stat(in, attr);
11fdf7f2 10962 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10963 return res;
10964}
10965
10966int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10967 unsigned int flags, const UserPerm& perms)
10968{
11fdf7f2 10969 std::lock_guard lock(client_lock);
7c673cae 10970
181888fb
FG
10971 if (unmounting)
10972 return -ENOTCONN;
10973
7c673cae
FG
10974 int res = 0;
10975 unsigned mask = statx_to_mask(flags, want);
10976
94b18763 10977 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10978 res = _ll_getattr(in, mask, perms);
10979
10980 if (res == 0)
10981 fill_statx(in, mask, stx);
11fdf7f2 10982 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10983 return res;
10984}
10985
10986int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10987 const UserPerm& perms, InodeRef *inp)
10988{
10989 vinodeno_t vino = _get_vino(in);
10990
11fdf7f2 10991 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 10992 << dendl;
11fdf7f2 10993 tout(cct) << __func__ << std::endl;
7c673cae
FG
10994 tout(cct) << vino.ino.val << std::endl;
10995 tout(cct) << stx->stx_mode << std::endl;
10996 tout(cct) << stx->stx_uid << std::endl;
10997 tout(cct) << stx->stx_gid << std::endl;
10998 tout(cct) << stx->stx_size << std::endl;
10999 tout(cct) << stx->stx_mtime << std::endl;
11000 tout(cct) << stx->stx_atime << std::endl;
11001 tout(cct) << stx->stx_btime << std::endl;
11002 tout(cct) << mask << std::endl;
11003
11fdf7f2
TL
11004 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11005 "fuse_default_permissions");
11006 if (!fuse_default_permissions) {
7c673cae
FG
11007 int res = may_setattr(in, stx, mask, perms);
11008 if (res < 0)
11009 return res;
11010 }
11011
11012 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11013
11014 return __setattrx(in, stx, mask, perms, inp);
11015}
11016
11017int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11018 const UserPerm& perms)
11019{
11fdf7f2 11020 std::lock_guard lock(client_lock);
181888fb
FG
11021
11022 if (unmounting)
11023 return -ENOTCONN;
11024
7c673cae
FG
11025 InodeRef target(in);
11026 int res = _ll_setattrx(in, stx, mask, perms, &target);
11027 if (res == 0) {
11fdf7f2 11028 ceph_assert(in == target.get());
7c673cae
FG
11029 fill_statx(in, in->caps_issued(), stx);
11030 }
11031
11fdf7f2 11032 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11033 return res;
11034}
11035
11036int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11037 const UserPerm& perms)
11038{
11039 struct ceph_statx stx;
11040 stat_to_statx(attr, &stx);
11041
11fdf7f2 11042 std::lock_guard lock(client_lock);
181888fb
FG
11043
11044 if (unmounting)
11045 return -ENOTCONN;
11046
7c673cae
FG
11047 InodeRef target(in);
11048 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11049 if (res == 0) {
11fdf7f2 11050 ceph_assert(in == target.get());
7c673cae
FG
11051 fill_stat(in, attr);
11052 }
11053
11fdf7f2 11054 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11055 return res;
11056}
11057
11058
11059// ----------
11060// xattrs
11061
11062int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11063 const UserPerm& perms)
11064{
11fdf7f2 11065 std::lock_guard lock(client_lock);
181888fb
FG
11066
11067 if (unmounting)
11068 return -ENOTCONN;
11069
7c673cae
FG
11070 InodeRef in;
11071 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11072 if (r < 0)
11073 return r;
11074 return _getxattr(in, name, value, size, perms);
11075}
11076
11077int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11078 const UserPerm& perms)
11079{
11fdf7f2 11080 std::lock_guard lock(client_lock);
181888fb
FG
11081
11082 if (unmounting)
11083 return -ENOTCONN;
11084
7c673cae
FG
11085 InodeRef in;
11086 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11087 if (r < 0)
11088 return r;
11089 return _getxattr(in, name, value, size, perms);
11090}
11091
11092int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11093 const UserPerm& perms)
11094{
11fdf7f2 11095 std::lock_guard lock(client_lock);
181888fb
FG
11096
11097 if (unmounting)
11098 return -ENOTCONN;
11099
7c673cae
FG
11100 Fh *f = get_filehandle(fd);
11101 if (!f)
11102 return -EBADF;
11103 return _getxattr(f->inode, name, value, size, perms);
11104}
11105
11106int Client::listxattr(const char *path, char *list, size_t size,
11107 const UserPerm& perms)
11108{
11fdf7f2 11109 std::lock_guard lock(client_lock);
181888fb
FG
11110
11111 if (unmounting)
11112 return -ENOTCONN;
11113
7c673cae
FG
11114 InodeRef in;
11115 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11116 if (r < 0)
11117 return r;
11118 return Client::_listxattr(in.get(), list, size, perms);
11119}
11120
11121int Client::llistxattr(const char *path, char *list, size_t size,
11122 const UserPerm& perms)
11123{
11fdf7f2 11124 std::lock_guard lock(client_lock);
181888fb
FG
11125
11126 if (unmounting)
11127 return -ENOTCONN;
11128
7c673cae
FG
11129 InodeRef in;
11130 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11131 if (r < 0)
11132 return r;
11133 return Client::_listxattr(in.get(), list, size, perms);
11134}
11135
11136int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11137{
11fdf7f2 11138 std::lock_guard lock(client_lock);
181888fb
FG
11139
11140 if (unmounting)
11141 return -ENOTCONN;
11142
7c673cae
FG
11143 Fh *f = get_filehandle(fd);
11144 if (!f)
11145 return -EBADF;
11146 return Client::_listxattr(f->inode.get(), list, size, perms);
11147}
11148
11149int Client::removexattr(const char *path, const char *name,
11150 const UserPerm& perms)
11151{
11fdf7f2 11152 std::lock_guard lock(client_lock);
181888fb
FG
11153
11154 if (unmounting)
11155 return -ENOTCONN;
11156
7c673cae
FG
11157 InodeRef in;
11158 int r = Client::path_walk(path, &in, perms, true);
11159 if (r < 0)
11160 return r;
11161 return _removexattr(in, name, perms);
11162}
11163
11164int Client::lremovexattr(const char *path, const char *name,
11165 const UserPerm& perms)
11166{
11fdf7f2 11167 std::lock_guard lock(client_lock);
181888fb
FG
11168
11169 if (unmounting)
11170 return -ENOTCONN;
11171
7c673cae
FG
11172 InodeRef in;
11173 int r = Client::path_walk(path, &in, perms, false);
11174 if (r < 0)
11175 return r;
11176 return _removexattr(in, name, perms);
11177}
11178
11179int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11180{
11fdf7f2 11181 std::lock_guard lock(client_lock);
181888fb
FG
11182
11183 if (unmounting)
11184 return -ENOTCONN;
11185
7c673cae
FG
11186 Fh *f = get_filehandle(fd);
11187 if (!f)
11188 return -EBADF;
11189 return _removexattr(f->inode, name, perms);
11190}
11191
11192int Client::setxattr(const char *path, const char *name, const void *value,
11193 size_t size, int flags, const UserPerm& perms)
11194{
11195 _setxattr_maybe_wait_for_osdmap(name, value, size);
11196
11fdf7f2 11197 std::lock_guard lock(client_lock);
181888fb
FG
11198
11199 if (unmounting)
11200 return -ENOTCONN;
11201
7c673cae
FG
11202 InodeRef in;
11203 int r = Client::path_walk(path, &in, perms, true);
11204 if (r < 0)
11205 return r;
11206 return _setxattr(in, name, value, size, flags, perms);
11207}
11208
11209int Client::lsetxattr(const char *path, const char *name, const void *value,
11210 size_t size, int flags, const UserPerm& perms)
11211{
11212 _setxattr_maybe_wait_for_osdmap(name, value, size);
11213
11fdf7f2 11214 std::lock_guard lock(client_lock);
181888fb
FG
11215
11216 if (unmounting)
11217 return -ENOTCONN;
11218
7c673cae
FG
11219 InodeRef in;
11220 int r = Client::path_walk(path, &in, perms, false);
11221 if (r < 0)
11222 return r;
11223 return _setxattr(in, name, value, size, flags, perms);
11224}
11225
11226int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11227 int flags, const UserPerm& perms)
11228{
11229 _setxattr_maybe_wait_for_osdmap(name, value, size);
11230
11fdf7f2 11231 std::lock_guard lock(client_lock);
181888fb
FG
11232
11233 if (unmounting)
11234 return -ENOTCONN;
11235
7c673cae
FG
11236 Fh *f = get_filehandle(fd);
11237 if (!f)
11238 return -EBADF;
11239 return _setxattr(f->inode, name, value, size, flags, perms);
11240}
11241
11242int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11243 const UserPerm& perms)
11244{
11245 int r;
11246
11247 const VXattr *vxattr = _match_vxattr(in, name);
11248 if (vxattr) {
11249 r = -ENODATA;
11250
11251 // Do a force getattr to get the latest quota before returning
11252 // a value to userspace.
28e407b8
AA
11253 int flags = 0;
11254 if (vxattr->flags & VXATTR_RSTAT) {
11255 flags |= CEPH_STAT_RSTAT;
11256 }
11257 r = _getattr(in, flags, perms, true);
7c673cae
FG
11258 if (r != 0) {
11259 // Error from getattr!
11260 return r;
11261 }
11262
11263 // call pointer-to-member function
11264 char buf[256];
11265 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11266 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11267 } else {
11268 r = -ENODATA;
11269 }
11270
11271 if (size != 0) {
11272 if (r > (int)size) {
11273 r = -ERANGE;
11274 } else if (r > 0) {
11275 memcpy(value, buf, r);
11276 }
11277 }
11278 goto out;
11279 }
11280
11281 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11282 r = -EOPNOTSUPP;
11283 goto out;
11284 }
11285
11286 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11287 if (r == 0) {
11288 string n(name);
11289 r = -ENODATA;
11290 if (in->xattrs.count(n)) {
11291 r = in->xattrs[n].length();
11292 if (r > 0 && size != 0) {
11293 if (size >= (unsigned)r)
11294 memcpy(value, in->xattrs[n].c_str(), r);
11295 else
11296 r = -ERANGE;
11297 }
11298 }
11299 }
11300 out:
1adf2230 11301 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11302 return r;
11303}
11304
11305int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11306 const UserPerm& perms)
11307{
11308 if (cct->_conf->client_permissions) {
11309 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11310 if (r < 0)
11311 return r;
11312 }
11313 return _getxattr(in.get(), name, value, size, perms);
11314}
11315
11316int Client::ll_getxattr(Inode *in, const char *name, void *value,
11317 size_t size, const UserPerm& perms)
11318{
11fdf7f2 11319 std::lock_guard lock(client_lock);
7c673cae 11320
181888fb
FG
11321 if (unmounting)
11322 return -ENOTCONN;
11323
7c673cae
FG
11324 vinodeno_t vino = _get_vino(in);
11325
11fdf7f2
TL
11326 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11327 tout(cct) << __func__ << std::endl;
7c673cae
FG
11328 tout(cct) << vino.ino.val << std::endl;
11329 tout(cct) << name << std::endl;
11330
11fdf7f2
TL
11331 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11332 "fuse_default_permissions");
11333 if (!fuse_default_permissions) {
7c673cae
FG
11334 int r = xattr_permission(in, name, MAY_READ, perms);
11335 if (r < 0)
11336 return r;
11337 }
11338
11339 return _getxattr(in, name, value, size, perms);
11340}
11341
11342int Client::_listxattr(Inode *in, char *name, size_t size,
11343 const UserPerm& perms)
11344{
11345 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11346 if (r == 0) {
11347 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11348 p != in->xattrs.end();
11349 ++p)
11350 r += p->first.length() + 1;
11351
11352 const VXattr *vxattrs = _get_vxattrs(in);
11353 r += _vxattrs_name_size(vxattrs);
11354
11355 if (size != 0) {
11356 if (size >= (unsigned)r) {
11357 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11358 p != in->xattrs.end();
11359 ++p) {
11360 memcpy(name, p->first.c_str(), p->first.length());
11361 name += p->first.length();
11362 *name = '\0';
11363 name++;
11364 }
11365 if (vxattrs) {
11366 for (int i = 0; !vxattrs[i].name.empty(); i++) {
11367 const VXattr& vxattr = vxattrs[i];
11368 if (vxattr.hidden)
11369 continue;
11370 // call pointer-to-member function
11371 if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
11372 continue;
11373 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
11374 name += vxattr.name.length();
11375 *name = '\0';
11376 name++;
11377 }
11378 }
11379 } else
11380 r = -ERANGE;
11381 }
11382 }
11fdf7f2 11383 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
11384 return r;
11385}
11386
11387int Client::ll_listxattr(Inode *in, char *names, size_t size,
11388 const UserPerm& perms)
11389{
11fdf7f2 11390 std::lock_guard lock(client_lock);
7c673cae 11391
181888fb
FG
11392 if (unmounting)
11393 return -ENOTCONN;
11394
7c673cae
FG
11395 vinodeno_t vino = _get_vino(in);
11396
11fdf7f2
TL
11397 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11398 tout(cct) << __func__ << std::endl;
7c673cae
FG
11399 tout(cct) << vino.ino.val << std::endl;
11400 tout(cct) << size << std::endl;
11401
11402 return _listxattr(in, names, size, perms);
11403}
11404
11405int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11406 size_t size, int flags, const UserPerm& perms)
11407{
11408
11409 int xattr_flags = 0;
11410 if (!value)
11411 xattr_flags |= CEPH_XATTR_REMOVE;
11412 if (flags & XATTR_CREATE)
11413 xattr_flags |= CEPH_XATTR_CREATE;
11414 if (flags & XATTR_REPLACE)
11415 xattr_flags |= CEPH_XATTR_REPLACE;
11416
11417 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11418 filepath path;
11419 in->make_nosnap_relative_path(path);
11420 req->set_filepath(path);
11421 req->set_string2(name);
11422 req->set_inode(in);
11423 req->head.args.setxattr.flags = xattr_flags;
11424
11425 bufferlist bl;
11fdf7f2 11426 assert (value || size == 0);
7c673cae
FG
11427 bl.append((const char*)value, size);
11428 req->set_data(bl);
11429
11430 int res = make_request(req, perms);
11431
11432 trim_cache();
11fdf7f2 11433 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11434 res << dendl;
11435 return res;
11436}
11437
11438int Client::_setxattr(Inode *in, const char *name, const void *value,
11439 size_t size, int flags, const UserPerm& perms)
11440{
11441 if (in->snapid != CEPH_NOSNAP) {
11442 return -EROFS;
11443 }
11444
11445 bool posix_acl_xattr = false;
11446 if (acl_type == POSIX_ACL)
11447 posix_acl_xattr = !strncmp(name, "system.", 7);
11448
11449 if (strncmp(name, "user.", 5) &&
11450 strncmp(name, "security.", 9) &&
11451 strncmp(name, "trusted.", 8) &&
11452 strncmp(name, "ceph.", 5) &&
11453 !posix_acl_xattr)
11454 return -EOPNOTSUPP;
11455
11fdf7f2
TL
11456 bool check_realm = false;
11457
7c673cae
FG
11458 if (posix_acl_xattr) {
11459 if (!strcmp(name, ACL_EA_ACCESS)) {
11460 mode_t new_mode = in->mode;
11461 if (value) {
11462 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11463 if (ret < 0)
11464 return ret;
11465 if (ret == 0) {
11466 value = NULL;
11467 size = 0;
11468 }
11469 if (new_mode != in->mode) {
11470 struct ceph_statx stx;
11471 stx.stx_mode = new_mode;
11472 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11473 if (ret < 0)
11474 return ret;
11475 }
11476 }
11477 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11478 if (value) {
11479 if (!S_ISDIR(in->mode))
11480 return -EACCES;
11481 int ret = posix_acl_check(value, size);
11482 if (ret < 0)
11483 return -EINVAL;
11484 if (ret == 0) {
11485 value = NULL;
11486 size = 0;
11487 }
11488 }
11489 } else {
11490 return -EOPNOTSUPP;
11491 }
11492 } else {
11493 const VXattr *vxattr = _match_vxattr(in, name);
11fdf7f2
TL
11494 if (vxattr) {
11495 if (vxattr->readonly)
11496 return -EOPNOTSUPP;
11497 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11498 check_realm = true;
11499 }
7c673cae
FG
11500 }
11501
11fdf7f2
TL
11502 int ret = _do_setxattr(in, name, value, size, flags, perms);
11503 if (ret >= 0 && check_realm) {
11504 // check if snaprealm was created for quota inode
11505 if (in->quota.is_enable() &&
11506 !(in->snaprealm && in->snaprealm->ino == in->ino))
11507 ret = -EOPNOTSUPP;
11508 }
11509
11510 return ret;
7c673cae
FG
11511}
11512
11513int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11514 size_t size, int flags, const UserPerm& perms)
11515{
11516 if (cct->_conf->client_permissions) {
11517 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11518 if (r < 0)
11519 return r;
11520 }
11521 return _setxattr(in.get(), name, value, size, flags, perms);
11522}
11523
11524int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11525{
11526 string tmp;
11527 if (name == "layout") {
11528 string::iterator begin = value.begin();
11529 string::iterator end = value.end();
11530 keys_and_values<string::iterator> p; // create instance of parser
11531 std::map<string, string> m; // map to receive results
11532 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11533 return -EINVAL;
11534 }
11535 if (begin != end)
11536 return -EINVAL;
11537 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11538 if (q->first == "pool") {
11539 tmp = q->second;
11540 break;
11541 }
11542 }
11543 } else if (name == "layout.pool") {
11544 tmp = value;
11545 }
11546
11547 if (tmp.length()) {
11548 int64_t pool;
11549 try {
11550 pool = boost::lexical_cast<unsigned>(tmp);
11551 if (!osdmap->have_pg_pool(pool))
11552 return -ENOENT;
11553 } catch (boost::bad_lexical_cast const&) {
11554 pool = osdmap->lookup_pg_pool_name(tmp);
11555 if (pool < 0) {
11556 return -ENOENT;
11557 }
11558 }
11559 }
11560
11561 return 0;
11562}
11563
11564void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11565{
11566 // For setting pool of layout, MetaRequest need osdmap epoch.
11567 // There is a race which create a new data pool but client and mds both don't have.
11568 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11569 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11570 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11571 string rest(strstr(name, "layout"));
11572 string v((const char*)value, size);
11573 int r = objecter->with_osdmap([&](const OSDMap& o) {
11574 return _setxattr_check_data_pool(rest, v, &o);
11575 });
11576
11577 if (r == -ENOENT) {
11578 C_SaferCond ctx;
11579 objecter->wait_for_latest_osdmap(&ctx);
11580 ctx.wait();
11581 }
11582 }
11583}
11584
11585int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11586 size_t size, int flags, const UserPerm& perms)
11587{
11588 _setxattr_maybe_wait_for_osdmap(name, value, size);
11589
11fdf7f2 11590 std::lock_guard lock(client_lock);
7c673cae 11591
181888fb
FG
11592 if (unmounting)
11593 return -ENOTCONN;
11594
7c673cae
FG
11595 vinodeno_t vino = _get_vino(in);
11596
11fdf7f2
TL
11597 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11598 tout(cct) << __func__ << std::endl;
7c673cae
FG
11599 tout(cct) << vino.ino.val << std::endl;
11600 tout(cct) << name << std::endl;
11601
11fdf7f2
TL
11602 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11603 "fuse_default_permissions");
11604 if (!fuse_default_permissions) {
7c673cae
FG
11605 int r = xattr_permission(in, name, MAY_WRITE, perms);
11606 if (r < 0)
11607 return r;
11608 }
11609 return _setxattr(in, name, value, size, flags, perms);
11610}
11611
11612int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11613{
11614 if (in->snapid != CEPH_NOSNAP) {
11615 return -EROFS;
11616 }
11617
11618 // same xattrs supported by kernel client
11619 if (strncmp(name, "user.", 5) &&
11620 strncmp(name, "system.", 7) &&
11621 strncmp(name, "security.", 9) &&
11622 strncmp(name, "trusted.", 8) &&
11623 strncmp(name, "ceph.", 5))
11624 return -EOPNOTSUPP;
11625
11626 const VXattr *vxattr = _match_vxattr(in, name);
11627 if (vxattr && vxattr->readonly)
11628 return -EOPNOTSUPP;
11629
11630 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11631 filepath path;
11632 in->make_nosnap_relative_path(path);
11633 req->set_filepath(path);
11634 req->set_filepath2(name);
11635 req->set_inode(in);
11636
11637 int res = make_request(req, perms);
11638
11639 trim_cache();
1adf2230 11640 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11641 return res;
11642}
11643
11644int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11645{
11646 if (cct->_conf->client_permissions) {
11647 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11648 if (r < 0)
11649 return r;
11650 }
11651 return _removexattr(in.get(), name, perms);
11652}
11653
11654int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11655{
11fdf7f2 11656 std::lock_guard lock(client_lock);
7c673cae 11657
181888fb
FG
11658 if (unmounting)
11659 return -ENOTCONN;
11660
7c673cae
FG
11661 vinodeno_t vino = _get_vino(in);
11662
11663 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11664 tout(cct) << "ll_removexattr" << std::endl;
11665 tout(cct) << vino.ino.val << std::endl;
11666 tout(cct) << name << std::endl;
11667
11fdf7f2
TL
11668 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11669 "fuse_default_permissions");
11670 if (!fuse_default_permissions) {
7c673cae
FG
11671 int r = xattr_permission(in, name, MAY_WRITE, perms);
11672 if (r < 0)
11673 return r;
11674 }
11675
11676 return _removexattr(in, name, perms);
11677}
11678
11679bool Client::_vxattrcb_quota_exists(Inode *in)
11680{
11fdf7f2
TL
11681 return in->quota.is_enable() &&
11682 in->snaprealm && in->snaprealm->ino == in->ino;
7c673cae
FG
11683}
11684size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11685{
11686 return snprintf(val, size,
11687 "max_bytes=%lld max_files=%lld",
11688 (long long int)in->quota.max_bytes,
11689 (long long int)in->quota.max_files);
11690}
11691size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11692{
11693 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11694}
11695size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11696{
11697 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11698}
11699
11700bool Client::_vxattrcb_layout_exists(Inode *in)
11701{
11702 return in->layout != file_layout_t();
11703}
11704size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11705{
11706 int r = snprintf(val, size,
11fdf7f2 11707 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11708 (unsigned long long)in->layout.stripe_unit,
11709 (unsigned long long)in->layout.stripe_count,
11710 (unsigned long long)in->layout.object_size);
11711 objecter->with_osdmap([&](const OSDMap& o) {
11712 if (o.have_pg_pool(in->layout.pool_id))
11713 r += snprintf(val + r, size - r, "%s",
11714 o.get_pool_name(in->layout.pool_id).c_str());
11715 else
11716 r += snprintf(val + r, size - r, "%" PRIu64,
11717 (uint64_t)in->layout.pool_id);
11718 });
11719 if (in->layout.pool_ns.length())
11720 r += snprintf(val + r, size - r, " pool_namespace=%s",
11721 in->layout.pool_ns.c_str());
11722 return r;
11723}
11724size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11725{
11fdf7f2 11726 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
7c673cae
FG
11727}
11728size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11729{
11fdf7f2 11730 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
7c673cae
FG
11731}
11732size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11733{
11fdf7f2 11734 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
7c673cae
FG
11735}
11736size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11737{
11738 size_t r;
11739 objecter->with_osdmap([&](const OSDMap& o) {
11740 if (o.have_pg_pool(in->layout.pool_id))
11741 r = snprintf(val, size, "%s", o.get_pool_name(
11742 in->layout.pool_id).c_str());
11743 else
11744 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11745 });
11746 return r;
11747}
11748size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11749{
11750 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11751}
11752size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11753{
11fdf7f2 11754 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
7c673cae
FG
11755}
11756size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11757{
11fdf7f2 11758 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
7c673cae
FG
11759}
11760size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11761{
11fdf7f2 11762 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
7c673cae
FG
11763}
11764size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11765{
11fdf7f2 11766 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
7c673cae
FG
11767}
11768size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11769{
11fdf7f2 11770 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
7c673cae
FG
11771}
11772size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11773{
11fdf7f2 11774 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
7c673cae
FG
11775}
11776size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11777{
11fdf7f2 11778 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
7c673cae
FG
11779}
11780size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11781{
11782 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11783 (long)in->rstat.rctime.nsec());
11784}
11fdf7f2
TL
11785bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11786{
11787 return in->dir_pin != -ENODATA;
11788}
11789size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11790{
11791 return snprintf(val, size, "%ld", (long)in->dir_pin);
11792}
7c673cae
FG
11793
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

/* Read-only, visible vxattr with no existence callback and no flags. */
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
/* Same as XATTR_NAME_CEPH but with explicit VXATTR_* flags. */
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
/* Writable, hidden layout sub-field, gated on a non-default layout. */
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
/* Writable, hidden quota sub-field, gated on quota existence. */
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
11833
11834const Client::VXattr Client::_dir_vxattrs[] = {
11835 {
11836 name: "ceph.dir.layout",
11837 getxattr_cb: &Client::_vxattrcb_layout,
11838 readonly: false,
11839 hidden: true,
11840 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11841 flags: 0,
7c673cae
FG
11842 },
11843 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11844 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11845 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11846 XATTR_LAYOUT_FIELD(dir, layout, pool),
11847 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11848 XATTR_NAME_CEPH(dir, entries),
11849 XATTR_NAME_CEPH(dir, files),
11850 XATTR_NAME_CEPH(dir, subdirs),
28e407b8
AA
11851 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11852 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11853 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11854 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11855 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
11856 {
11857 name: "ceph.quota",
11858 getxattr_cb: &Client::_vxattrcb_quota,
11859 readonly: false,
11860 hidden: true,
11861 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 11862 flags: 0,
7c673cae
FG
11863 },
11864 XATTR_QUOTA_FIELD(quota, max_bytes),
11865 XATTR_QUOTA_FIELD(quota, max_files),
11fdf7f2
TL
11866 {
11867 name: "ceph.dir.pin",
11868 getxattr_cb: &Client::_vxattrcb_dir_pin,
11869 readonly: false,
11870 hidden: true,
11871 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11872 flags: 0,
11873 },
7c673cae
FG
11874 { name: "" } /* Required table terminator */
11875};
11876
11877const Client::VXattr Client::_file_vxattrs[] = {
11878 {
11879 name: "ceph.file.layout",
11880 getxattr_cb: &Client::_vxattrcb_layout,
11881 readonly: false,
11882 hidden: true,
11883 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11884 flags: 0,
7c673cae
FG
11885 },
11886 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11887 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11888 XATTR_LAYOUT_FIELD(file, layout, object_size),
11889 XATTR_LAYOUT_FIELD(file, layout, pool),
11890 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11891 { name: "" } /* Required table terminator */
11892};
11893
11894const Client::VXattr *Client::_get_vxattrs(Inode *in)
11895{
11896 if (in->is_dir())
11897 return _dir_vxattrs;
11898 else if (in->is_file())
11899 return _file_vxattrs;
11900 return NULL;
11901}
11902
11903const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11904{
11905 if (strncmp(name, "ceph.", 5) == 0) {
11906 const VXattr *vxattr = _get_vxattrs(in);
11907 if (vxattr) {
11908 while (!vxattr->name.empty()) {
11909 if (vxattr->name == name)
11910 return vxattr;
11911 vxattr++;
11912 }
11913 }
11914 }
11915 return NULL;
11916}
11917
11918size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11919{
11920 size_t len = 0;
11921 while (!vxattr->name.empty()) {
11922 if (!vxattr->hidden)
11923 len += vxattr->name.length() + 1;
11924 vxattr++;
11925 }
11926 return len;
11927}
11928
11929int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11930{
11fdf7f2 11931 std::lock_guard lock(client_lock);
7c673cae 11932
181888fb
FG
11933 if (unmounting)
11934 return -ENOTCONN;
11935
7c673cae
FG
11936 vinodeno_t vino = _get_vino(in);
11937
11938 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11939 tout(cct) << "ll_readlink" << std::endl;
11940 tout(cct) << vino.ino.val << std::endl;
11941
11fdf7f2
TL
11942 for (auto dn : in->dentries) {
11943 touch_dn(dn);
7c673cae
FG
11944 }
11945
11946 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11947 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11948 return r;
11949}
11950
11951int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11952 const UserPerm& perms, InodeRef *inp)
11953{
1adf2230 11954 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11955 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11956 << ", gid " << perms.gid() << ")" << dendl;
11957
11958 if (strlen(name) > NAME_MAX)
11959 return -ENAMETOOLONG;
11960
11961 if (dir->snapid != CEPH_NOSNAP) {
11962 return -EROFS;
11963 }
11964 if (is_quota_files_exceeded(dir, perms)) {
11965 return -EDQUOT;
11966 }
11967
11968 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11969
11970 filepath path;
11971 dir->make_nosnap_relative_path(path);
11972 path.push_dentry(name);
11973 req->set_filepath(path);
11974 req->set_inode(dir);
11975 req->head.args.mknod.rdev = rdev;
11976 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11977 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11978
11979 bufferlist xattrs_bl;
11980 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11981 if (res < 0)
11982 goto fail;
11983 req->head.args.mknod.mode = mode;
11984 if (xattrs_bl.length() > 0)
11985 req->set_data(xattrs_bl);
11986
11987 Dentry *de;
11988 res = get_or_create(dir, name, &de);
11989 if (res < 0)
11990 goto fail;
11991 req->set_dentry(de);
11992
11993 res = make_request(req, perms, inp);
11994
11995 trim_cache();
11996
1adf2230 11997 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
11998 return res;
11999
12000 fail:
12001 put_request(req);
12002 return res;
12003}
12004
12005int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12006 dev_t rdev, struct stat *attr, Inode **out,
12007 const UserPerm& perms)
12008{
11fdf7f2 12009 std::lock_guard lock(client_lock);
7c673cae 12010
181888fb
FG
12011 if (unmounting)
12012 return -ENOTCONN;
12013
7c673cae
FG
12014 vinodeno_t vparent = _get_vino(parent);
12015
12016 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12017 tout(cct) << "ll_mknod" << std::endl;
12018 tout(cct) << vparent.ino.val << std::endl;
12019 tout(cct) << name << std::endl;
12020 tout(cct) << mode << std::endl;
12021 tout(cct) << rdev << std::endl;
12022
11fdf7f2
TL
12023 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12024 "fuse_default_permissions");
12025 if (!fuse_default_permissions) {
7c673cae
FG
12026 int r = may_create(parent, perms);
12027 if (r < 0)
12028 return r;
12029 }
12030
12031 InodeRef in;
12032 int r = _mknod(parent, name, mode, rdev, perms, &in);
12033 if (r == 0) {
12034 fill_stat(in, attr);
12035 _ll_get(in.get());
12036 }
12037 tout(cct) << attr->st_ino << std::endl;
12038 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12039 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12040 *out = in.get();
12041 return r;
12042}
12043
12044int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12045 dev_t rdev, Inode **out,
12046 struct ceph_statx *stx, unsigned want, unsigned flags,
12047 const UserPerm& perms)
12048{
12049 unsigned caps = statx_to_mask(flags, want);
11fdf7f2 12050 std::lock_guard lock(client_lock);
7c673cae 12051
181888fb
FG
12052 if (unmounting)
12053 return -ENOTCONN;
12054
7c673cae
FG
12055 vinodeno_t vparent = _get_vino(parent);
12056
12057 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12058 tout(cct) << "ll_mknodx" << std::endl;
12059 tout(cct) << vparent.ino.val << std::endl;
12060 tout(cct) << name << std::endl;
12061 tout(cct) << mode << std::endl;
12062 tout(cct) << rdev << std::endl;
12063
11fdf7f2
TL
12064 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12065 "fuse_default_permissions");
12066 if (!fuse_default_permissions) {
7c673cae
FG
12067 int r = may_create(parent, perms);
12068 if (r < 0)
12069 return r;
12070 }
12071
12072 InodeRef in;
12073 int r = _mknod(parent, name, mode, rdev, perms, &in);
12074 if (r == 0) {
12075 fill_statx(in, caps, stx);
12076 _ll_get(in.get());
12077 }
12078 tout(cct) << stx->stx_ino << std::endl;
12079 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12080 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12081 *out = in.get();
12082 return r;
12083}
12084
12085int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12086 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12087 int object_size, const char *data_pool, bool *created,
12088 const UserPerm& perms)
12089{
1adf2230 12090 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
12091 mode << dec << ")" << dendl;
12092
12093 if (strlen(name) > NAME_MAX)
12094 return -ENAMETOOLONG;
12095 if (dir->snapid != CEPH_NOSNAP) {
12096 return -EROFS;
12097 }
12098 if (is_quota_files_exceeded(dir, perms)) {
12099 return -EDQUOT;
12100 }
12101
12102 // use normalized flags to generate cmode
11fdf7f2
TL
12103 int cflags = ceph_flags_sys2wire(flags);
12104 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12105 cflags |= CEPH_O_LAZY;
12106
12107 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
12108
12109 int64_t pool_id = -1;
12110 if (data_pool && *data_pool) {
12111 pool_id = objecter->with_osdmap(
12112 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12113 if (pool_id < 0)
12114 return -EINVAL;
12115 if (pool_id > 0xffffffffll)
12116 return -ERANGE; // bummer!
12117 }
12118
12119 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12120
12121 filepath path;
12122 dir->make_nosnap_relative_path(path);
12123 path.push_dentry(name);
12124 req->set_filepath(path);
12125 req->set_inode(dir);
11fdf7f2 12126 req->head.args.open.flags = cflags | CEPH_O_CREAT;
7c673cae
FG
12127
12128 req->head.args.open.stripe_unit = stripe_unit;
12129 req->head.args.open.stripe_count = stripe_count;
12130 req->head.args.open.object_size = object_size;
12131 if (cct->_conf->client_debug_getattr_caps)
12132 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12133 else
12134 req->head.args.open.mask = 0;
12135 req->head.args.open.pool = pool_id;
12136 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12137 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12138
12139 mode |= S_IFREG;
12140 bufferlist xattrs_bl;
12141 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12142 if (res < 0)
12143 goto fail;
12144 req->head.args.open.mode = mode;
12145 if (xattrs_bl.length() > 0)
12146 req->set_data(xattrs_bl);
12147
12148 Dentry *de;
12149 res = get_or_create(dir, name, &de);
12150 if (res < 0)
12151 goto fail;
12152 req->set_dentry(de);
12153
12154 res = make_request(req, perms, inp, created);
12155 if (res < 0) {
12156 goto reply_error;
12157 }
12158
12159 /* If the caller passed a value in fhp, do the open */
12160 if(fhp) {
12161 (*inp)->get_open_ref(cmode);
12162 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12163 }
12164
12165 reply_error:
12166 trim_cache();
12167
1adf2230 12168 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
12169 << " layout " << stripe_unit
12170 << ' ' << stripe_count
12171 << ' ' << object_size
12172 <<") = " << res << dendl;
12173 return res;
12174
12175 fail:
12176 put_request(req);
12177 return res;
12178}
12179
12180
12181int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12182 InodeRef *inp)
12183{
1adf2230 12184 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
12185 << mode << dec << ", uid " << perm.uid()
12186 << ", gid " << perm.gid() << ")" << dendl;
12187
12188 if (strlen(name) > NAME_MAX)
12189 return -ENAMETOOLONG;
12190
12191 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12192 return -EROFS;
12193 }
12194 if (is_quota_files_exceeded(dir, perm)) {
12195 return -EDQUOT;
12196 }
12197 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12198 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12199
12200 filepath path;
12201 dir->make_nosnap_relative_path(path);
12202 path.push_dentry(name);
12203 req->set_filepath(path);
12204 req->set_inode(dir);
12205 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12206 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12207
12208 mode |= S_IFDIR;
12209 bufferlist xattrs_bl;
12210 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12211 if (res < 0)
12212 goto fail;
12213 req->head.args.mkdir.mode = mode;
12214 if (xattrs_bl.length() > 0)
12215 req->set_data(xattrs_bl);
12216
12217 Dentry *de;
12218 res = get_or_create(dir, name, &de);
12219 if (res < 0)
12220 goto fail;
12221 req->set_dentry(de);
12222
12223 ldout(cct, 10) << "_mkdir: making request" << dendl;
12224 res = make_request(req, perm, inp);
12225 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12226
12227 trim_cache();
12228
1adf2230 12229 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
12230 return res;
12231
12232 fail:
12233 put_request(req);
12234 return res;
12235}
12236
12237int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12238 struct stat *attr, Inode **out, const UserPerm& perm)
12239{
11fdf7f2 12240 std::lock_guard lock(client_lock);
7c673cae 12241
181888fb
FG
12242 if (unmounting)
12243 return -ENOTCONN;
12244
7c673cae
FG
12245 vinodeno_t vparent = _get_vino(parent);
12246
12247 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12248 tout(cct) << "ll_mkdir" << std::endl;
12249 tout(cct) << vparent.ino.val << std::endl;
12250 tout(cct) << name << std::endl;
12251 tout(cct) << mode << std::endl;
12252
11fdf7f2
TL
12253 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12254 "fuse_default_permissions");
12255 if (!fuse_default_permissions) {
7c673cae
FG
12256 int r = may_create(parent, perm);
12257 if (r < 0)
12258 return r;
12259 }
12260
12261 InodeRef in;
12262 int r = _mkdir(parent, name, mode, perm, &in);
12263 if (r == 0) {
12264 fill_stat(in, attr);
12265 _ll_get(in.get());
12266 }
12267 tout(cct) << attr->st_ino << std::endl;
12268 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12269 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12270 *out = in.get();
12271 return r;
12272}
12273
12274int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12275 struct ceph_statx *stx, unsigned want, unsigned flags,
12276 const UserPerm& perms)
12277{
11fdf7f2 12278 std::lock_guard lock(client_lock);
7c673cae 12279
181888fb
FG
12280 if (unmounting)
12281 return -ENOTCONN;
12282
7c673cae
FG
12283 vinodeno_t vparent = _get_vino(parent);
12284
12285 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12286 tout(cct) << "ll_mkdirx" << std::endl;
12287 tout(cct) << vparent.ino.val << std::endl;
12288 tout(cct) << name << std::endl;
12289 tout(cct) << mode << std::endl;
12290
11fdf7f2
TL
12291 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12292 "fuse_default_permissions");
12293 if (!fuse_default_permissions) {
7c673cae
FG
12294 int r = may_create(parent, perms);
12295 if (r < 0)
12296 return r;
12297 }
12298
12299 InodeRef in;
12300 int r = _mkdir(parent, name, mode, perms, &in);
12301 if (r == 0) {
12302 fill_statx(in, statx_to_mask(flags, want), stx);
12303 _ll_get(in.get());
12304 } else {
12305 stx->stx_ino = 0;
12306 stx->stx_mask = 0;
12307 }
12308 tout(cct) << stx->stx_ino << std::endl;
12309 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12310 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12311 *out = in.get();
12312 return r;
12313}
12314
12315int Client::_symlink(Inode *dir, const char *name, const char *target,
12316 const UserPerm& perms, InodeRef *inp)
12317{
1adf2230 12318 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
12319 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12320 << dendl;
12321
12322 if (strlen(name) > NAME_MAX)
12323 return -ENAMETOOLONG;
12324
12325 if (dir->snapid != CEPH_NOSNAP) {
12326 return -EROFS;
12327 }
12328 if (is_quota_files_exceeded(dir, perms)) {
12329 return -EDQUOT;
12330 }
12331
12332 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12333
12334 filepath path;
12335 dir->make_nosnap_relative_path(path);
12336 path.push_dentry(name);
12337 req->set_filepath(path);
12338 req->set_inode(dir);
12339 req->set_string2(target);
12340 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12341 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12342
12343 Dentry *de;
12344 int res = get_or_create(dir, name, &de);
12345 if (res < 0)
12346 goto fail;
12347 req->set_dentry(de);
12348
12349 res = make_request(req, perms, inp);
12350
12351 trim_cache();
1adf2230 12352 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
12353 res << dendl;
12354 return res;
12355
12356 fail:
12357 put_request(req);
12358 return res;
12359}
12360
12361int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12362 struct stat *attr, Inode **out, const UserPerm& perms)
12363{
11fdf7f2 12364 std::lock_guard lock(client_lock);
7c673cae 12365
181888fb
FG
12366 if (unmounting)
12367 return -ENOTCONN;
12368
7c673cae
FG
12369 vinodeno_t vparent = _get_vino(parent);
12370
12371 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12372 << dendl;
12373 tout(cct) << "ll_symlink" << std::endl;
12374 tout(cct) << vparent.ino.val << std::endl;
12375 tout(cct) << name << std::endl;
12376 tout(cct) << value << std::endl;
12377
11fdf7f2
TL
12378 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12379 "fuse_default_permissions");
12380 if (!fuse_default_permissions) {
7c673cae
FG
12381 int r = may_create(parent, perms);
12382 if (r < 0)
12383 return r;
12384 }
12385
12386 InodeRef in;
12387 int r = _symlink(parent, name, value, perms, &in);
12388 if (r == 0) {
12389 fill_stat(in, attr);
12390 _ll_get(in.get());
12391 }
12392 tout(cct) << attr->st_ino << std::endl;
12393 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12394 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12395 *out = in.get();
12396 return r;
12397}
12398
12399int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12400 Inode **out, struct ceph_statx *stx, unsigned want,
12401 unsigned flags, const UserPerm& perms)
12402{
11fdf7f2 12403 std::lock_guard lock(client_lock);
7c673cae 12404
181888fb
FG
12405 if (unmounting)
12406 return -ENOTCONN;
12407
7c673cae
FG
12408 vinodeno_t vparent = _get_vino(parent);
12409
12410 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12411 << dendl;
12412 tout(cct) << "ll_symlinkx" << std::endl;
12413 tout(cct) << vparent.ino.val << std::endl;
12414 tout(cct) << name << std::endl;
12415 tout(cct) << value << std::endl;
12416
11fdf7f2
TL
12417 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12418 "fuse_default_permissions");
12419 if (!fuse_default_permissions) {
7c673cae
FG
12420 int r = may_create(parent, perms);
12421 if (r < 0)
12422 return r;
12423 }
12424
12425 InodeRef in;
12426 int r = _symlink(parent, name, value, perms, &in);
12427 if (r == 0) {
12428 fill_statx(in, statx_to_mask(flags, want), stx);
12429 _ll_get(in.get());
12430 }
12431 tout(cct) << stx->stx_ino << std::endl;
12432 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12433 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12434 *out = in.get();
12435 return r;
12436}
12437
12438int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12439{
1adf2230 12440 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12441 << " uid " << perm.uid() << " gid " << perm.gid()
12442 << ")" << dendl;
12443
12444 if (dir->snapid != CEPH_NOSNAP) {
12445 return -EROFS;
12446 }
12447
12448 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12449
12450 filepath path;
12451 dir->make_nosnap_relative_path(path);
12452 path.push_dentry(name);
12453 req->set_filepath(path);
12454
12455 InodeRef otherin;
b32b8144 12456 Inode *in;
7c673cae 12457 Dentry *de;
b32b8144 12458
7c673cae
FG
12459 int res = get_or_create(dir, name, &de);
12460 if (res < 0)
12461 goto fail;
12462 req->set_dentry(de);
12463 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12464 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12465
12466 res = _lookup(dir, name, 0, &otherin, perm);
12467 if (res < 0)
12468 goto fail;
b32b8144
FG
12469
12470 in = otherin.get();
12471 req->set_other_inode(in);
12472 in->break_all_delegs();
7c673cae
FG
12473 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12474
12475 req->set_inode(dir);
12476
12477 res = make_request(req, perm);
12478
12479 trim_cache();
1adf2230 12480 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12481 return res;
12482
12483 fail:
12484 put_request(req);
12485 return res;
12486}
12487
12488int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12489{
11fdf7f2 12490 std::lock_guard lock(client_lock);
7c673cae 12491
181888fb
FG
12492 if (unmounting)
12493 return -ENOTCONN;
12494
7c673cae
FG
12495 vinodeno_t vino = _get_vino(in);
12496
12497 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12498 tout(cct) << "ll_unlink" << std::endl;
12499 tout(cct) << vino.ino.val << std::endl;
12500 tout(cct) << name << std::endl;
12501
11fdf7f2
TL
12502 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12503 "fuse_default_permissions");
12504 if (!fuse_default_permissions) {
7c673cae
FG
12505 int r = may_delete(in, name, perm);
12506 if (r < 0)
12507 return r;
12508 }
12509 return _unlink(in, name, perm);
12510}
12511
12512int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12513{
1adf2230 12514 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12515 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12516
12517 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12518 return -EROFS;
12519 }
b32b8144
FG
12520
12521 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12522 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12523 filepath path;
12524 dir->make_nosnap_relative_path(path);
12525 path.push_dentry(name);
12526 req->set_filepath(path);
11fdf7f2 12527 req->set_inode(dir);
7c673cae
FG
12528
12529 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12530 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12531 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12532
12533 InodeRef in;
12534
12535 Dentry *de;
12536 int res = get_or_create(dir, name, &de);
12537 if (res < 0)
12538 goto fail;
b32b8144
FG
12539 if (op == CEPH_MDS_OP_RMDIR)
12540 req->set_dentry(de);
12541 else
12542 de->get();
12543
7c673cae
FG
12544 res = _lookup(dir, name, 0, &in, perms);
12545 if (res < 0)
12546 goto fail;
11fdf7f2
TL
12547
12548 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 12549 unlink(de, true, true);
b32b8144 12550 de->put();
7c673cae 12551 }
11fdf7f2 12552 req->set_other_inode(in.get());
7c673cae
FG
12553
12554 res = make_request(req, perms);
12555
12556 trim_cache();
1adf2230 12557 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12558 return res;
12559
12560 fail:
12561 put_request(req);
12562 return res;
12563}
12564
12565int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12566{
11fdf7f2 12567 std::lock_guard lock(client_lock);
7c673cae 12568
181888fb
FG
12569 if (unmounting)
12570 return -ENOTCONN;
12571
7c673cae
FG
12572 vinodeno_t vino = _get_vino(in);
12573
12574 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12575 tout(cct) << "ll_rmdir" << std::endl;
12576 tout(cct) << vino.ino.val << std::endl;
12577 tout(cct) << name << std::endl;
12578
11fdf7f2
TL
12579 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12580 "fuse_default_permissions");
12581 if (!fuse_default_permissions) {
7c673cae
FG
12582 int r = may_delete(in, name, perms);
12583 if (r < 0)
12584 return r;
12585 }
12586
12587 return _rmdir(in, name, perms);
12588}
12589
12590int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12591{
1adf2230 12592 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12593 << todir->ino << " " << toname
12594 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12595 << dendl;
12596
12597 if (fromdir->snapid != todir->snapid)
12598 return -EXDEV;
12599
12600 int op = CEPH_MDS_OP_RENAME;
12601 if (fromdir->snapid != CEPH_NOSNAP) {
12602 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12603 op = CEPH_MDS_OP_RENAMESNAP;
12604 else
12605 return -EROFS;
12606 }
12607 if (fromdir != todir) {
12608 Inode *fromdir_root =
12609 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12610 Inode *todir_root =
12611 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12612 if (fromdir_root != todir_root) {
12613 return -EXDEV;
12614 }
12615 }
12616
12617 InodeRef target;
12618 MetaRequest *req = new MetaRequest(op);
12619
12620 filepath from;
12621 fromdir->make_nosnap_relative_path(from);
12622 from.push_dentry(fromname);
12623 filepath to;
12624 todir->make_nosnap_relative_path(to);
12625 to.push_dentry(toname);
12626 req->set_filepath(to);
12627 req->set_filepath2(from);
12628
12629 Dentry *oldde;
12630 int res = get_or_create(fromdir, fromname, &oldde);
12631 if (res < 0)
12632 goto fail;
12633 Dentry *de;
12634 res = get_or_create(todir, toname, &de);
12635 if (res < 0)
12636 goto fail;
12637
12638 if (op == CEPH_MDS_OP_RENAME) {
12639 req->set_old_dentry(oldde);
12640 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12641 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12642
12643 req->set_dentry(de);
12644 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12645 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12646
12647 InodeRef oldin, otherin;
12648 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12649 if (res < 0)
12650 goto fail;
b32b8144
FG
12651
12652 Inode *oldinode = oldin.get();
12653 oldinode->break_all_delegs();
12654 req->set_old_inode(oldinode);
7c673cae
FG
12655 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12656
12657 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12658 switch (res) {
12659 case 0:
12660 {
12661 Inode *in = otherin.get();
12662 req->set_other_inode(in);
12663 in->break_all_delegs();
12664 }
7c673cae 12665 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12666 break;
12667 case -ENOENT:
12668 break;
12669 default:
12670 goto fail;
7c673cae
FG
12671 }
12672
12673 req->set_inode(todir);
12674 } else {
12675 // renamesnap reply contains no tracedn, so we need to invalidate
12676 // dentry manually
12677 unlink(oldde, true, true);
12678 unlink(de, true, true);
11fdf7f2
TL
12679
12680 req->set_inode(todir);
7c673cae
FG
12681 }
12682
12683 res = make_request(req, perm, &target);
12684 ldout(cct, 10) << "rename result is " << res << dendl;
12685
12686 // renamed item from our cache
12687
12688 trim_cache();
1adf2230 12689 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12690 return res;
12691
12692 fail:
12693 put_request(req);
12694 return res;
12695}
12696
12697int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12698 const char *newname, const UserPerm& perm)
12699{
11fdf7f2 12700 std::lock_guard lock(client_lock);
7c673cae 12701
181888fb
FG
12702 if (unmounting)
12703 return -ENOTCONN;
12704
7c673cae
FG
12705 vinodeno_t vparent = _get_vino(parent);
12706 vinodeno_t vnewparent = _get_vino(newparent);
12707
12708 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12709 << vnewparent << " " << newname << dendl;
12710 tout(cct) << "ll_rename" << std::endl;
12711 tout(cct) << vparent.ino.val << std::endl;
12712 tout(cct) << name << std::endl;
12713 tout(cct) << vnewparent.ino.val << std::endl;
12714 tout(cct) << newname << std::endl;
12715
11fdf7f2
TL
12716 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12717 "fuse_default_permissions");
12718 if (!fuse_default_permissions) {
7c673cae
FG
12719 int r = may_delete(parent, name, perm);
12720 if (r < 0)
12721 return r;
12722 r = may_delete(newparent, newname, perm);
12723 if (r < 0 && r != -ENOENT)
12724 return r;
12725 }
12726
12727 return _rename(parent, name, newparent, newname, perm);
12728}
12729
12730int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12731{
1adf2230 12732 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12733 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12734
12735 if (strlen(newname) > NAME_MAX)
12736 return -ENAMETOOLONG;
12737
12738 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12739 return -EROFS;
12740 }
12741 if (is_quota_files_exceeded(dir, perm)) {
12742 return -EDQUOT;
12743 }
12744
b32b8144 12745 in->break_all_delegs();
7c673cae
FG
12746 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12747
12748 filepath path(newname, dir->ino);
12749 req->set_filepath(path);
12750 filepath existing(in->ino);
12751 req->set_filepath2(existing);
12752
12753 req->set_inode(dir);
12754 req->inode_drop = CEPH_CAP_FILE_SHARED;
12755 req->inode_unless = CEPH_CAP_FILE_EXCL;
12756
12757 Dentry *de;
12758 int res = get_or_create(dir, newname, &de);
12759 if (res < 0)
12760 goto fail;
12761 req->set_dentry(de);
12762
12763 res = make_request(req, perm, inp);
12764 ldout(cct, 10) << "link result is " << res << dendl;
12765
12766 trim_cache();
1adf2230 12767 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12768 return res;
12769
12770 fail:
12771 put_request(req);
12772 return res;
12773}
12774
12775int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12776 const UserPerm& perm)
12777{
11fdf7f2 12778 std::lock_guard lock(client_lock);
7c673cae 12779
181888fb
FG
12780 if (unmounting)
12781 return -ENOTCONN;
12782
7c673cae
FG
12783 vinodeno_t vino = _get_vino(in);
12784 vinodeno_t vnewparent = _get_vino(newparent);
12785
31f18b77 12786 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12787 newname << dendl;
12788 tout(cct) << "ll_link" << std::endl;
12789 tout(cct) << vino.ino.val << std::endl;
12790 tout(cct) << vnewparent << std::endl;
12791 tout(cct) << newname << std::endl;
12792
7c673cae
FG
12793 InodeRef target;
12794
11fdf7f2
TL
12795 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12796 "fuse_default_permissions");
12797 if (!fuse_default_permissions) {
7c673cae
FG
12798 if (S_ISDIR(in->mode))
12799 return -EPERM;
12800
11fdf7f2 12801 int r = may_hardlink(in, perm);
7c673cae
FG
12802 if (r < 0)
12803 return r;
12804
12805 r = may_create(newparent, perm);
12806 if (r < 0)
12807 return r;
12808 }
12809
12810 return _link(in, newparent, newname, perm, &target);
12811}
12812
12813int Client::ll_num_osds(void)
12814{
11fdf7f2 12815 std::lock_guard lock(client_lock);
7c673cae
FG
12816 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12817}
12818
12819int Client::ll_osdaddr(int osd, uint32_t *addr)
12820{
11fdf7f2 12821 std::lock_guard lock(client_lock);
181888fb 12822
7c673cae
FG
12823 entity_addr_t g;
12824 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12825 if (!o.exists(osd))
12826 return false;
11fdf7f2 12827 g = o.get_addrs(osd).front();
7c673cae
FG
12828 return true;
12829 });
12830 if (!exists)
12831 return -1;
12832 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12833 *addr = ntohl(nb_addr);
12834 return 0;
12835}
181888fb 12836
7c673cae
FG
12837uint32_t Client::ll_stripe_unit(Inode *in)
12838{
11fdf7f2 12839 std::lock_guard lock(client_lock);
7c673cae
FG
12840 return in->layout.stripe_unit;
12841}
12842
12843uint64_t Client::ll_snap_seq(Inode *in)
12844{
11fdf7f2 12845 std::lock_guard lock(client_lock);
7c673cae
FG
12846 return in->snaprealm->seq;
12847}
12848
12849int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12850{
11fdf7f2 12851 std::lock_guard lock(client_lock);
7c673cae
FG
12852 *layout = in->layout;
12853 return 0;
12854}
12855
12856int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12857{
12858 return ll_file_layout(fh->inode.get(), layout);
12859}
12860
12861/* Currently we cannot take advantage of redundancy in reads, since we
12862 would have to go through all possible placement groups (a
12863 potentially quite large number determined by a hash), and use CRUSH
12864 to calculate the appropriate set of OSDs for each placement group,
12865 then index into that. An array with one entry per OSD is much more
12866 tractable and works for demonstration purposes. */
12867
12868int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12869 file_layout_t* layout)
12870{
11fdf7f2 12871 std::lock_guard lock(client_lock);
181888fb 12872
28e407b8 12873 inodeno_t ino = in->ino;
7c673cae
FG
12874 uint32_t object_size = layout->object_size;
12875 uint32_t su = layout->stripe_unit;
12876 uint32_t stripe_count = layout->stripe_count;
12877 uint64_t stripes_per_object = object_size / su;
11fdf7f2 12878 uint64_t stripeno = 0, stripepos = 0;
7c673cae 12879
11fdf7f2
TL
12880 if(stripe_count) {
12881 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12882 stripepos = blockno % stripe_count; // which object in the object set (X)
12883 }
7c673cae
FG
12884 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12885 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12886
12887 object_t oid = file_object_t(ino, objectno);
12888 return objecter->with_osdmap([&](const OSDMap& o) {
12889 ceph_object_layout olayout =
12890 o.file_to_object_layout(oid, *layout);
12891 pg_t pg = (pg_t)olayout.ol_pgid;
12892 vector<int> osds;
12893 int primary;
12894 o.pg_to_acting_osds(pg, &osds, &primary);
12895 return primary;
12896 });
12897}
12898
12899/* Return the offset of the block, internal to the object */
12900
12901uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12902{
11fdf7f2 12903 std::lock_guard lock(client_lock);
7c673cae
FG
12904 file_layout_t *layout=&(in->layout);
12905 uint32_t object_size = layout->object_size;
12906 uint32_t su = layout->stripe_unit;
12907 uint64_t stripes_per_object = object_size / su;
12908
12909 return (blockno % stripes_per_object) * su;
12910}
12911
12912int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12913 const UserPerm& perms)
12914{
11fdf7f2 12915 std::lock_guard lock(client_lock);
7c673cae 12916
181888fb
FG
12917 if (unmounting)
12918 return -ENOTCONN;
12919
7c673cae
FG
12920 vinodeno_t vino = _get_vino(in);
12921
12922 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12923 tout(cct) << "ll_opendir" << std::endl;
12924 tout(cct) << vino.ino.val << std::endl;
12925
11fdf7f2
TL
12926 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12927 "fuse_default_permissions");
12928 if (!fuse_default_permissions) {
7c673cae
FG
12929 int r = may_open(in, flags, perms);
12930 if (r < 0)
12931 return r;
12932 }
12933
12934 int r = _opendir(in, dirpp, perms);
12935 tout(cct) << (unsigned long)*dirpp << std::endl;
12936
12937 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12938 << dendl;
12939 return r;
12940}
12941
12942int Client::ll_releasedir(dir_result_t *dirp)
12943{
11fdf7f2 12944 std::lock_guard lock(client_lock);
7c673cae
FG
12945 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12946 tout(cct) << "ll_releasedir" << std::endl;
12947 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12948
12949 if (unmounting)
12950 return -ENOTCONN;
12951
7c673cae
FG
12952 _closedir(dirp);
12953 return 0;
12954}
12955
12956int Client::ll_fsyncdir(dir_result_t *dirp)
12957{
11fdf7f2 12958 std::lock_guard lock(client_lock);
7c673cae
FG
12959 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12960 tout(cct) << "ll_fsyncdir" << std::endl;
12961 tout(cct) << (unsigned long)dirp << std::endl;
12962
181888fb
FG
12963 if (unmounting)
12964 return -ENOTCONN;
12965
7c673cae
FG
12966 return _fsync(dirp->inode.get(), false);
12967}
12968
12969int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12970{
11fdf7f2 12971 ceph_assert(!(flags & O_CREAT));
7c673cae 12972
11fdf7f2 12973 std::lock_guard lock(client_lock);
7c673cae 12974
181888fb
FG
12975 if (unmounting)
12976 return -ENOTCONN;
12977
7c673cae
FG
12978 vinodeno_t vino = _get_vino(in);
12979
12980 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12981 tout(cct) << "ll_open" << std::endl;
12982 tout(cct) << vino.ino.val << std::endl;
12983 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12984
12985 int r;
11fdf7f2
TL
12986 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12987 "fuse_default_permissions");
12988 if (!fuse_default_permissions) {
7c673cae
FG
12989 r = may_open(in, flags, perms);
12990 if (r < 0)
12991 goto out;
12992 }
12993
12994 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12995
12996 out:
12997 Fh *fhptr = fhp ? *fhp : NULL;
12998 if (fhptr) {
12999 ll_unclosed_fh_set.insert(fhptr);
13000 }
13001 tout(cct) << (unsigned long)fhptr << std::endl;
13002 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13003 " = " << r << " (" << fhptr << ")" << dendl;
13004 return r;
13005}
13006
13007int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13008 int flags, InodeRef *in, int caps, Fh **fhp,
13009 const UserPerm& perms)
13010{
13011 *fhp = NULL;
13012
13013 vinodeno_t vparent = _get_vino(parent);
13014
1adf2230 13015 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
13016 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13017 << ", gid " << perms.gid() << dendl;
13018 tout(cct) << "ll_create" << std::endl;
13019 tout(cct) << vparent.ino.val << std::endl;
13020 tout(cct) << name << std::endl;
13021 tout(cct) << mode << std::endl;
13022 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13023
13024 bool created = false;
13025 int r = _lookup(parent, name, caps, in, perms);
13026
13027 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13028 return -EEXIST;
13029
13030 if (r == -ENOENT && (flags & O_CREAT)) {
11fdf7f2
TL
13031 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13032 "fuse_default_permissions");
13033 if (!fuse_default_permissions) {
7c673cae
FG
13034 r = may_create(parent, perms);
13035 if (r < 0)
13036 goto out;
13037 }
13038 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13039 perms);
13040 if (r < 0)
13041 goto out;
13042 }
13043
13044 if (r < 0)
13045 goto out;
13046
11fdf7f2 13047 ceph_assert(*in);
7c673cae
FG
13048
13049 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13050 if (!created) {
11fdf7f2
TL
13051 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13052 "fuse_default_permissions");
13053 if (!fuse_default_permissions) {
7c673cae
FG
13054 r = may_open(in->get(), flags, perms);
13055 if (r < 0) {
13056 if (*fhp) {
13057 int release_r = _release_fh(*fhp);
11fdf7f2 13058 ceph_assert(release_r == 0); // during create, no async data ops should have happened
7c673cae
FG
13059 }
13060 goto out;
13061 }
13062 }
13063 if (*fhp == NULL) {
13064 r = _open(in->get(), flags, mode, fhp, perms);
13065 if (r < 0)
13066 goto out;
13067 }
13068 }
13069
13070out:
13071 if (*fhp) {
13072 ll_unclosed_fh_set.insert(*fhp);
13073 }
13074
13075 ino_t ino = 0;
13076 if (r >= 0) {
13077 Inode *inode = in->get();
13078 if (use_faked_inos())
13079 ino = inode->faked_ino;
13080 else
13081 ino = inode->ino;
13082 }
13083
13084 tout(cct) << (unsigned long)*fhp << std::endl;
13085 tout(cct) << ino << std::endl;
1adf2230 13086 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
13087 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13088 *fhp << " " << hex << ino << dec << ")" << dendl;
13089
13090 return r;
13091}
13092
13093int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13094 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13095 const UserPerm& perms)
13096{
11fdf7f2 13097 std::lock_guard lock(client_lock);
7c673cae
FG
13098 InodeRef in;
13099
181888fb
FG
13100 if (unmounting)
13101 return -ENOTCONN;
13102
7c673cae
FG
13103 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13104 fhp, perms);
13105 if (r >= 0) {
11fdf7f2 13106 ceph_assert(in);
7c673cae
FG
13107
13108 // passing an Inode in outp requires an additional ref
13109 if (outp) {
13110 _ll_get(in.get());
13111 *outp = in.get();
13112 }
13113 fill_stat(in, attr);
13114 } else {
13115 attr->st_ino = 0;
13116 }
13117
13118 return r;
13119}
13120
13121int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13122 int oflags, Inode **outp, Fh **fhp,
13123 struct ceph_statx *stx, unsigned want, unsigned lflags,
13124 const UserPerm& perms)
13125{
13126 unsigned caps = statx_to_mask(lflags, want);
11fdf7f2 13127 std::lock_guard lock(client_lock);
7c673cae
FG
13128 InodeRef in;
13129
181888fb
FG
13130 if (unmounting)
13131 return -ENOTCONN;
7c673cae
FG
13132
13133 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13134 if (r >= 0) {
11fdf7f2 13135 ceph_assert(in);
7c673cae
FG
13136
13137 // passing an Inode in outp requires an additional ref
13138 if (outp) {
13139 _ll_get(in.get());
13140 *outp = in.get();
13141 }
13142 fill_statx(in, caps, stx);
13143 } else {
13144 stx->stx_ino = 0;
13145 stx->stx_mask = 0;
13146 }
13147
13148 return r;
13149}
13150
13151loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13152{
11fdf7f2 13153 std::lock_guard lock(client_lock);
7c673cae
FG
13154 tout(cct) << "ll_lseek" << std::endl;
13155 tout(cct) << offset << std::endl;
13156 tout(cct) << whence << std::endl;
13157
181888fb
FG
13158 if (unmounting)
13159 return -ENOTCONN;
13160
7c673cae
FG
13161 return _lseek(fh, offset, whence);
13162}
13163
13164int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13165{
11fdf7f2 13166 std::lock_guard lock(client_lock);
7c673cae
FG
13167 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13168 tout(cct) << "ll_read" << std::endl;
13169 tout(cct) << (unsigned long)fh << std::endl;
13170 tout(cct) << off << std::endl;
13171 tout(cct) << len << std::endl;
13172
181888fb
FG
13173 if (unmounting)
13174 return -ENOTCONN;
13175
11fdf7f2
TL
13176 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13177 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13178 return _read(fh, off, len, bl);
13179}
13180
13181int Client::ll_read_block(Inode *in, uint64_t blockid,
13182 char *buf,
13183 uint64_t offset,
13184 uint64_t length,
13185 file_layout_t* layout)
13186{
11fdf7f2 13187 std::lock_guard lock(client_lock);
181888fb
FG
13188
13189 if (unmounting)
13190 return -ENOTCONN;
13191
b32b8144 13192 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13193 object_t oid = file_object_t(vino.ino, blockid);
13194 C_SaferCond onfinish;
13195 bufferlist bl;
13196
13197 objecter->read(oid,
13198 object_locator_t(layout->pool_id),
13199 offset,
13200 length,
13201 vino.snapid,
13202 &bl,
13203 CEPH_OSD_FLAG_READ,
13204 &onfinish);
13205
13206 client_lock.Unlock();
13207 int r = onfinish.wait();
13208 client_lock.Lock();
13209
13210 if (r >= 0) {
13211 bl.copy(0, bl.length(), buf);
13212 r = bl.length();
13213 }
13214
13215 return r;
13216}
13217
13218/* It appears that the OSD doesn't return success unless the entire
13219 buffer was written, return the write length on success. */
13220
13221int Client::ll_write_block(Inode *in, uint64_t blockid,
13222 char* buf, uint64_t offset,
13223 uint64_t length, file_layout_t* layout,
13224 uint64_t snapseq, uint32_t sync)
13225{
7c673cae 13226 vinodeno_t vino = ll_get_vino(in);
7c673cae 13227 int r = 0;
11fdf7f2
TL
13228 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13229
7c673cae
FG
13230 if (length == 0) {
13231 return -EINVAL;
13232 }
13233 if (true || sync) {
13234 /* if write is stable, the epilogue is waiting on
13235 * flock */
11fdf7f2 13236 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
13237 }
13238 object_t oid = file_object_t(vino.ino, blockid);
13239 SnapContext fakesnap;
11fdf7f2
TL
13240 ceph::bufferlist bl;
13241 if (length > 0) {
13242 bl.push_back(buffer::copy(buf, length));
13243 }
7c673cae
FG
13244
13245 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13246 << dendl;
13247
13248 fakesnap.seq = snapseq;
13249
13250 /* lock just in time */
13251 client_lock.Lock();
181888fb
FG
13252 if (unmounting) {
13253 client_lock.Unlock();
181888fb
FG
13254 return -ENOTCONN;
13255 }
7c673cae
FG
13256
13257 objecter->write(oid,
13258 object_locator_t(layout->pool_id),
13259 offset,
13260 length,
13261 fakesnap,
13262 bl,
13263 ceph::real_clock::now(),
13264 0,
11fdf7f2 13265 onsafe.get());
7c673cae
FG
13266
13267 client_lock.Unlock();
11fdf7f2
TL
13268 if (nullptr != onsafe) {
13269 r = onsafe->wait();
7c673cae
FG
13270 }
13271
13272 if (r < 0) {
13273 return r;
13274 } else {
13275 return length;
13276 }
13277}
13278
13279int Client::ll_commit_blocks(Inode *in,
13280 uint64_t offset,
13281 uint64_t length)
13282{
11fdf7f2 13283 std::lock_guard lock(client_lock);
7c673cae
FG
13284 /*
13285 BarrierContext *bctx;
b32b8144 13286 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13287 uint64_t ino = vino.ino;
13288
13289 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13290 << offset << " to " << length << dendl;
13291
13292 if (length == 0) {
13293 return -EINVAL;
13294 }
13295
13296 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13297 if (p != barriers.end()) {
13298 barrier_interval civ(offset, offset + length);
13299 p->second->commit_barrier(civ);
13300 }
13301 */
13302 return 0;
13303}
13304
13305int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13306{
11fdf7f2 13307 std::lock_guard lock(client_lock);
7c673cae
FG
13308 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13309 "~" << len << dendl;
13310 tout(cct) << "ll_write" << std::endl;
13311 tout(cct) << (unsigned long)fh << std::endl;
13312 tout(cct) << off << std::endl;
13313 tout(cct) << len << std::endl;
13314
181888fb
FG
13315 if (unmounting)
13316 return -ENOTCONN;
13317
11fdf7f2
TL
13318 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13319 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13320 int r = _write(fh, off, len, data, NULL, 0);
13321 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13322 << dendl;
13323 return r;
13324}
13325
11fdf7f2
TL
13326int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13327{
13328 std::lock_guard lock(client_lock);
13329 if (unmounting)
13330 return -ENOTCONN;
13331 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13332}
13333
13334int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13335{
13336 std::lock_guard lock(client_lock);
13337 if (unmounting)
13338 return -ENOTCONN;
13339 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13340}
13341
7c673cae
FG
13342int Client::ll_flush(Fh *fh)
13343{
11fdf7f2 13344 std::lock_guard lock(client_lock);
7c673cae
FG
13345 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13346 tout(cct) << "ll_flush" << std::endl;
13347 tout(cct) << (unsigned long)fh << std::endl;
13348
181888fb
FG
13349 if (unmounting)
13350 return -ENOTCONN;
13351
7c673cae
FG
13352 return _flush(fh);
13353}
13354
13355int Client::ll_fsync(Fh *fh, bool syncdataonly)
13356{
11fdf7f2 13357 std::lock_guard lock(client_lock);
7c673cae
FG
13358 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13359 tout(cct) << "ll_fsync" << std::endl;
13360 tout(cct) << (unsigned long)fh << std::endl;
13361
181888fb
FG
13362 if (unmounting)
13363 return -ENOTCONN;
13364
7c673cae
FG
13365 int r = _fsync(fh, syncdataonly);
13366 if (r) {
13367 // If we're returning an error, clear it from the FH
13368 fh->take_async_err();
13369 }
13370 return r;
13371}
13372
28e407b8
AA
13373int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13374{
11fdf7f2 13375 std::lock_guard lock(client_lock);
28e407b8
AA
13376 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13377 tout(cct) << "ll_sync_inode" << std::endl;
13378 tout(cct) << (unsigned long)in << std::endl;
13379
13380 if (unmounting)
13381 return -ENOTCONN;
13382
13383 return _fsync(in, syncdataonly);
13384}
13385
7c673cae
FG
13386#ifdef FALLOC_FL_PUNCH_HOLE
13387
13388int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13389{
13390 if (offset < 0 || length <= 0)
13391 return -EINVAL;
13392
13393 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13394 return -EOPNOTSUPP;
13395
13396 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13397 return -EOPNOTSUPP;
13398
13399 Inode *in = fh->inode.get();
13400
13401 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13402 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13403 return -ENOSPC;
13404 }
13405
13406 if (in->snapid != CEPH_NOSNAP)
13407 return -EROFS;
13408
13409 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13410 return -EBADF;
13411
13412 uint64_t size = offset + length;
13413 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13414 size > in->size &&
11fdf7f2 13415 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
7c673cae
FG
13416 return -EDQUOT;
13417 }
13418
13419 int have;
13420 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13421 if (r < 0)
13422 return r;
13423
11fdf7f2 13424 std::unique_ptr<C_SaferCond> onuninline = nullptr;
7c673cae
FG
13425 if (mode & FALLOC_FL_PUNCH_HOLE) {
13426 if (in->inline_version < CEPH_INLINE_NONE &&
13427 (have & CEPH_CAP_FILE_BUFFER)) {
13428 bufferlist bl;
13429 int len = in->inline_data.length();
13430 if (offset < len) {
13431 if (offset > 0)
13432 in->inline_data.copy(0, offset, bl);
13433 int size = length;
13434 if (offset + size > len)
13435 size = len - offset;
13436 if (size > 0)
13437 bl.append_zero(size);
13438 if (offset + size < len)
13439 in->inline_data.copy(offset + size, len - offset - size, bl);
13440 in->inline_data = bl;
13441 in->inline_version++;
13442 }
91327a77 13443 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13444 in->change_attr++;
28e407b8 13445 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13446 } else {
13447 if (in->inline_version < CEPH_INLINE_NONE) {
11fdf7f2
TL
13448 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13449 uninline_data(in, onuninline.get());
7c673cae
FG
13450 }
13451
11fdf7f2 13452 C_SaferCond onfinish("Client::_punch_hole flock");
7c673cae
FG
13453
13454 unsafe_sync_write++;
13455 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13456
13457 _invalidate_inode_cache(in, offset, length);
13458 filer->zero(in->ino, &in->layout,
13459 in->snaprealm->get_snap_context(),
13460 offset, length,
13461 ceph::real_clock::now(),
11fdf7f2 13462 0, true, &onfinish);
91327a77 13463 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13464 in->change_attr++;
28e407b8 13465 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13466
13467 client_lock.Unlock();
11fdf7f2 13468 onfinish.wait();
7c673cae
FG
13469 client_lock.Lock();
13470 _sync_write_commit(in);
13471 }
13472 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13473 uint64_t size = offset + length;
13474 if (size > in->size) {
13475 in->size = size;
91327a77 13476 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13477 in->change_attr++;
28e407b8 13478 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 13479
11fdf7f2 13480 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
7c673cae 13481 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
13482 } else if (is_max_size_approaching(in)) {
13483 check_caps(in, 0);
7c673cae
FG
13484 }
13485 }
13486 }
13487
11fdf7f2 13488 if (nullptr != onuninline) {
7c673cae 13489 client_lock.Unlock();
11fdf7f2 13490 int ret = onuninline->wait();
7c673cae
FG
13491 client_lock.Lock();
13492
11fdf7f2 13493 if (ret >= 0 || ret == -ECANCELED) {
7c673cae
FG
13494 in->inline_data.clear();
13495 in->inline_version = CEPH_INLINE_NONE;
28e407b8 13496 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13497 check_caps(in, 0);
13498 } else
11fdf7f2 13499 r = ret;
7c673cae
FG
13500 }
13501
13502 put_cap_ref(in, CEPH_CAP_FILE_WR);
13503 return r;
13504}
13505#else
13506
13507int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13508{
13509 return -EOPNOTSUPP;
13510}
13511
13512#endif
13513
13514
11fdf7f2 13515int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 13516{
11fdf7f2
TL
13517 std::lock_guard lock(client_lock);
13518 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13519 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
7c673cae
FG
13520 tout(cct) << (unsigned long)fh << std::endl;
13521
181888fb
FG
13522 if (unmounting)
13523 return -ENOTCONN;
13524
7c673cae
FG
13525 return _fallocate(fh, mode, offset, length);
13526}
13527
13528int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13529{
11fdf7f2
TL
13530 std::lock_guard lock(client_lock);
13531 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13532
181888fb
FG
13533 if (unmounting)
13534 return -ENOTCONN;
13535
7c673cae
FG
13536 Fh *fh = get_filehandle(fd);
13537 if (!fh)
13538 return -EBADF;
13539#if defined(__linux__) && defined(O_PATH)
13540 if (fh->flags & O_PATH)
13541 return -EBADF;
13542#endif
13543 return _fallocate(fh, mode, offset, length);
13544}
13545
13546int Client::ll_release(Fh *fh)
13547{
11fdf7f2 13548 std::lock_guard lock(client_lock);
91327a77
AA
13549
13550 if (unmounting)
13551 return -ENOTCONN;
13552
11fdf7f2 13553 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 13554 dendl;
11fdf7f2 13555 tout(cct) << __func__ << " (fh)" << std::endl;
7c673cae
FG
13556 tout(cct) << (unsigned long)fh << std::endl;
13557
13558 if (ll_unclosed_fh_set.count(fh))
13559 ll_unclosed_fh_set.erase(fh);
13560 return _release_fh(fh);
13561}
13562
13563int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13564{
11fdf7f2 13565 std::lock_guard lock(client_lock);
7c673cae
FG
13566
13567 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13568 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13569
181888fb
FG
13570 if (unmounting)
13571 return -ENOTCONN;
13572
7c673cae
FG
13573 return _getlk(fh, fl, owner);
13574}
13575
13576int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13577{
11fdf7f2 13578 std::lock_guard lock(client_lock);
7c673cae 13579
11fdf7f2
TL
13580 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13581 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13582
181888fb
FG
13583 if (unmounting)
13584 return -ENOTCONN;
13585
7c673cae
FG
13586 return _setlk(fh, fl, owner, sleep);
13587}
13588
13589int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13590{
11fdf7f2 13591 std::lock_guard lock(client_lock);
7c673cae 13592
11fdf7f2
TL
13593 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13594 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13595
181888fb
FG
13596 if (unmounting)
13597 return -ENOTCONN;
13598
7c673cae
FG
13599 return _flock(fh, cmd, owner);
13600}
13601
b32b8144
FG
13602int Client::set_deleg_timeout(uint32_t timeout)
13603{
11fdf7f2 13604 std::lock_guard lock(client_lock);
b32b8144
FG
13605
13606 /*
13607 * The whole point is to prevent blacklisting so we must time out the
13608 * delegation before the session autoclose timeout kicks in.
13609 */
13610 if (timeout >= mdsmap->get_session_autoclose())
13611 return -EINVAL;
13612
13613 deleg_timeout = timeout;
13614 return 0;
13615}
13616
13617int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13618{
13619 int ret = -EINVAL;
13620
11fdf7f2 13621 std::lock_guard lock(client_lock);
b32b8144
FG
13622
13623 if (!mounted)
13624 return -ENOTCONN;
13625
13626 Inode *inode = fh->inode.get();
13627
13628 switch(cmd) {
13629 case CEPH_DELEGATION_NONE:
13630 inode->unset_deleg(fh);
13631 ret = 0;
13632 break;
13633 default:
13634 try {
13635 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 13636 } catch (std::bad_alloc&) {
b32b8144
FG
13637 ret = -ENOMEM;
13638 }
13639 break;
13640 }
13641 return ret;
13642}
13643
7c673cae
FG
13644class C_Client_RequestInterrupt : public Context {
13645private:
13646 Client *client;
13647 MetaRequest *req;
13648public:
13649 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13650 req->get();
13651 }
13652 void finish(int r) override {
11fdf7f2
TL
13653 std::lock_guard l(client->client_lock);
13654 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
13655 client->_interrupt_filelock(req);
13656 client->put_request(req);
13657 }
13658};
13659
13660void Client::ll_interrupt(void *d)
13661{
13662 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
13663 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13664 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
13665 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13666}
13667
13668// =========================================
13669// layout
13670
13671// expose file layouts
13672
13673int Client::describe_layout(const char *relpath, file_layout_t *lp,
13674 const UserPerm& perms)
13675{
11fdf7f2 13676 std::lock_guard lock(client_lock);
7c673cae 13677
181888fb
FG
13678 if (unmounting)
13679 return -ENOTCONN;
13680
7c673cae
FG
13681 filepath path(relpath);
13682 InodeRef in;
13683 int r = path_walk(path, &in, perms);
13684 if (r < 0)
13685 return r;
13686
13687 *lp = in->layout;
13688
11fdf7f2 13689 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
13690 return 0;
13691}
13692
13693int Client::fdescribe_layout(int fd, file_layout_t *lp)
13694{
11fdf7f2 13695 std::lock_guard lock(client_lock);
7c673cae 13696
181888fb
FG
13697 if (unmounting)
13698 return -ENOTCONN;
13699
7c673cae
FG
13700 Fh *f = get_filehandle(fd);
13701 if (!f)
13702 return -EBADF;
13703 Inode *in = f->inode.get();
13704
13705 *lp = in->layout;
13706
11fdf7f2 13707 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
13708 return 0;
13709}
13710
d2e6a577
FG
13711int64_t Client::get_default_pool_id()
13712{
11fdf7f2 13713 std::lock_guard lock(client_lock);
181888fb
FG
13714
13715 if (unmounting)
13716 return -ENOTCONN;
13717
d2e6a577
FG
13718 /* first data pool is the default */
13719 return mdsmap->get_first_data_pool();
13720}
7c673cae
FG
13721
13722// expose osdmap
13723
13724int64_t Client::get_pool_id(const char *pool_name)
13725{
11fdf7f2 13726 std::lock_guard lock(client_lock);
181888fb
FG
13727
13728 if (unmounting)
13729 return -ENOTCONN;
13730
7c673cae
FG
13731 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13732 pool_name);
13733}
13734
13735string Client::get_pool_name(int64_t pool)
13736{
11fdf7f2 13737 std::lock_guard lock(client_lock);
181888fb
FG
13738
13739 if (unmounting)
13740 return string();
13741
7c673cae
FG
13742 return objecter->with_osdmap([pool](const OSDMap& o) {
13743 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13744 });
13745}
13746
13747int Client::get_pool_replication(int64_t pool)
13748{
11fdf7f2 13749 std::lock_guard lock(client_lock);
181888fb
FG
13750
13751 if (unmounting)
13752 return -ENOTCONN;
13753
7c673cae
FG
13754 return objecter->with_osdmap([pool](const OSDMap& o) {
13755 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13756 });
13757}
13758
13759int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13760{
11fdf7f2 13761 std::lock_guard lock(client_lock);
7c673cae 13762
181888fb
FG
13763 if (unmounting)
13764 return -ENOTCONN;
13765
7c673cae
FG
13766 Fh *f = get_filehandle(fd);
13767 if (!f)
13768 return -EBADF;
13769 Inode *in = f->inode.get();
13770
13771 vector<ObjectExtent> extents;
13772 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 13773 ceph_assert(extents.size() == 1);
7c673cae
FG
13774
13775 objecter->with_osdmap([&](const OSDMap& o) {
13776 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13777 o.pg_to_acting_osds(pg, osds);
13778 });
13779
13780 if (osds.empty())
13781 return -EINVAL;
13782
13783 /*
13784 * Return the remainder of the extent (stripe unit)
13785 *
13786 * If length = 1 is passed to Striper::file_to_extents we get a single
13787 * extent back, but its length is one so we still need to compute the length
13788 * to the end of the stripe unit.
13789 *
13790 * If length = su then we may get 1 or 2 objects back in the extents vector
13791 * which would have to be examined. Even then, the offsets are local to the
13792 * object, so matching up to the file offset is extra work.
13793 *
13794 * It seems simpler to stick with length = 1 and manually compute the
13795 * remainder.
13796 */
13797 if (len) {
13798 uint64_t su = in->layout.stripe_unit;
13799 *len = su - (off % su);
13800 }
13801
13802 return 0;
13803}
13804
13805int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13806{
11fdf7f2 13807 std::lock_guard lock(client_lock);
181888fb
FG
13808
13809 if (unmounting)
13810 return -ENOTCONN;
13811
7c673cae
FG
13812 if (id < 0)
13813 return -EINVAL;
13814 return objecter->with_osdmap([&](const OSDMap& o) {
13815 return o.crush->get_full_location_ordered(id, path);
13816 });
13817}
13818
13819int Client::get_file_stripe_address(int fd, loff_t offset,
13820 vector<entity_addr_t>& address)
13821{
11fdf7f2 13822 std::lock_guard lock(client_lock);
7c673cae 13823
181888fb
FG
13824 if (unmounting)
13825 return -ENOTCONN;
13826
7c673cae
FG
13827 Fh *f = get_filehandle(fd);
13828 if (!f)
13829 return -EBADF;
13830 Inode *in = f->inode.get();
13831
13832 // which object?
13833 vector<ObjectExtent> extents;
13834 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13835 in->truncate_size, extents);
11fdf7f2 13836 ceph_assert(extents.size() == 1);
7c673cae
FG
13837
13838 // now we have the object and its 'layout'
13839 return objecter->with_osdmap([&](const OSDMap& o) {
13840 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13841 vector<int> osds;
13842 o.pg_to_acting_osds(pg, osds);
13843 if (osds.empty())
13844 return -EINVAL;
13845 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 13846 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
13847 address.push_back(addr);
13848 }
13849 return 0;
13850 });
13851}
13852
13853int Client::get_osd_addr(int osd, entity_addr_t& addr)
13854{
11fdf7f2 13855 std::lock_guard lock(client_lock);
181888fb
FG
13856
13857 if (unmounting)
13858 return -ENOTCONN;
13859
7c673cae
FG
13860 return objecter->with_osdmap([&](const OSDMap& o) {
13861 if (!o.exists(osd))
13862 return -ENOENT;
13863
11fdf7f2 13864 addr = o.get_addrs(osd).front();
7c673cae
FG
13865 return 0;
13866 });
13867}
13868
13869int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13870 loff_t length, loff_t offset)
13871{
11fdf7f2 13872 std::lock_guard lock(client_lock);
7c673cae 13873
181888fb
FG
13874 if (unmounting)
13875 return -ENOTCONN;
13876
7c673cae
FG
13877 Fh *f = get_filehandle(fd);
13878 if (!f)
13879 return -EBADF;
13880 Inode *in = f->inode.get();
13881
13882 // map to a list of extents
13883 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13884
11fdf7f2 13885 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
13886 return 0;
13887}
13888
13889
b32b8144 13890/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13891int Client::get_local_osd()
13892{
11fdf7f2 13893 std::lock_guard lock(client_lock);
181888fb
FG
13894
13895 if (unmounting)
13896 return -ENOTCONN;
13897
7c673cae
FG
13898 objecter->with_osdmap([this](const OSDMap& o) {
13899 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 13900 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
13901 local_osd_epoch = o.get_epoch();
13902 }
13903 });
13904 return local_osd;
13905}
13906
13907
13908
13909
13910
13911
13912// ===============================
13913
13914void Client::ms_handle_connect(Connection *con)
13915{
11fdf7f2 13916 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13917}
13918
13919bool Client::ms_handle_reset(Connection *con)
13920{
11fdf7f2 13921 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13922 return false;
13923}
13924
13925void Client::ms_handle_remote_reset(Connection *con)
13926{
11fdf7f2
TL
13927 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
13928 std::lock_guard l(client_lock);
7c673cae
FG
13929 switch (con->get_peer_type()) {
13930 case CEPH_ENTITY_TYPE_MDS:
13931 {
13932 // kludge to figure out which mds this is; fixme with a Connection* state
13933 mds_rank_t mds = MDS_RANK_NONE;
13934 MetaSession *s = NULL;
11fdf7f2
TL
13935 for (auto &p : mds_sessions) {
13936 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
13937 mds = p.first;
13938 s = &p.second;
7c673cae
FG
13939 }
13940 }
13941 if (mds >= 0) {
d2e6a577 13942 assert (s != NULL);
7c673cae
FG
13943 switch (s->state) {
13944 case MetaSession::STATE_CLOSING:
13945 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13946 _closed_mds_session(s);
13947 break;
13948
13949 case MetaSession::STATE_OPENING:
13950 {
13951 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13952 list<Context*> waiters;
13953 waiters.swap(s->waiting_for_open);
13954 _closed_mds_session(s);
13955 MetaSession *news = _get_or_open_mds_session(mds);
13956 news->waiting_for_open.swap(waiters);
13957 }
13958 break;
13959
13960 case MetaSession::STATE_OPEN:
13961 {
28e407b8 13962 objecter->maybe_request_map(); /* to check if we are blacklisted */
11fdf7f2 13963 const auto& conf = cct->_conf;
7c673cae
FG
13964 if (conf->client_reconnect_stale) {
13965 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13966 _closed_mds_session(s);
13967 } else {
13968 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13969 s->state = MetaSession::STATE_STALE;
13970 }
13971 }
13972 break;
13973
13974 case MetaSession::STATE_NEW:
13975 case MetaSession::STATE_CLOSED:
13976 default:
13977 break;
13978 }
13979 }
13980 }
13981 break;
13982 }
13983}
13984
13985bool Client::ms_handle_refused(Connection *con)
13986{
11fdf7f2 13987 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13988 return false;
13989}
13990
11fdf7f2 13991bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
13992{
13993 if (dest_type == CEPH_ENTITY_TYPE_MON)
13994 return true;
13995 *authorizer = monclient->build_authorizer(dest_type);
13996 return true;
13997}
13998
13999Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14000{
11fdf7f2
TL
14001 Inode *quota_in = root_ancestor;
14002 SnapRealm *realm = in->snaprealm;
14003 while (realm) {
14004 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14005 if (realm->ino != in->ino) {
14006 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14007 if (p == inode_map.end())
14008 break;
7c673cae 14009
11fdf7f2
TL
14010 if (p->second->quota.is_enable()) {
14011 quota_in = p->second;
14012 break;
7c673cae 14013 }
7c673cae 14014 }
11fdf7f2 14015 realm = realm->pparent;
7c673cae 14016 }
11fdf7f2
TL
14017 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14018 return quota_in;
7c673cae
FG
14019}
14020
14021/**
14022 * Traverse quota ancestors of the Inode, return true
14023 * if any of them passes the passed function
14024 */
14025bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14026 std::function<bool (const Inode &in)> test)
14027{
14028 while (true) {
11fdf7f2 14029 ceph_assert(in != NULL);
7c673cae
FG
14030 if (test(*in)) {
14031 return true;
14032 }
14033
14034 if (in == root_ancestor) {
14035 // We're done traversing, drop out
14036 return false;
14037 } else {
14038 // Continue up the tree
14039 in = get_quota_root(in, perms);
14040 }
14041 }
14042
14043 return false;
14044}
14045
14046bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14047{
14048 return check_quota_condition(in, perms,
14049 [](const Inode &in) {
14050 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14051 });
14052}
14053
14054bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14055 const UserPerm& perms)
7c673cae
FG
14056{
14057 return check_quota_condition(in, perms,
11fdf7f2 14058 [&new_bytes](const Inode &in) {
7c673cae
FG
14059 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14060 > in.quota.max_bytes;
14061 });
14062}
14063
11fdf7f2 14064bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 14065{
11fdf7f2
TL
14066 return check_quota_condition(in, perms,
14067 [](const Inode &in) {
14068 if (in.quota.max_bytes) {
14069 if (in.rstat.rbytes >= in.quota.max_bytes) {
14070 return true;
14071 }
14072
14073 ceph_assert(in.size >= in.reported_size);
14074 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14075 const uint64_t size = in.size - in.reported_size;
14076 return (space >> 4) < size;
14077 } else {
14078 return false;
14079 }
14080 });
7c673cae
FG
14081}
14082
// Bit flags cached per (pool id, pool namespace) in pool_perms,
// maintained by Client::check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // client has read access to the pool
  POOL_WRITE = 8,     // client has write access to the pool
};
14089
/*
 * Verify that the client's OSD caps allow the access implied by `need`
 * (CEPH_CAP_FILE_RD / CEPH_CAP_FILE_WR) on the inode's data pool, by
 * probing the pool with a stat (read) and an exclusive create (write).
 * Results are cached per (pool id, pool namespace) in pool_perms.
 *
 * Returns 0 on success, -EPERM when the needed access is missing, and
 * -EIO when the probe failed for an indeterminate reason. Called with
 * client_lock held; the lock is dropped around the blocking probes.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in flight so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: stat the file's first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create of the same object.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop client_lock while both probes complete.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT still proves read access (the object just doesn't exist yet).
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST still proves write access (exclusive create lost a race).
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the result and wake any callers blocked on POOL_CHECKING.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14192
14193int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14194{
14195 if (acl_type == POSIX_ACL) {
14196 if (in->xattrs.count(ACL_EA_ACCESS)) {
14197 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14198
14199 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14200 }
14201 }
14202 return -EAGAIN;
14203}
14204
/*
 * Rewrite the inode's cached POSIX access ACL to agree with a new file
 * mode. No-op when ACLs are disabled. Returns 0 on success or a
 * negative errno.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Force-fetch xattrs only if we've never seen an xattr version.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy; posix_acl_access_chmod edits the buffer in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;  // no access ACL cached: nothing to update
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14230
/*
 * Compute the ACL xattrs a new inode should be created with, based on
 * the parent directory's default ACL, adjusting *mode as required.
 *
 * On success the xattrs (if any) are encoded into xattrs_bl and their
 * count is returned; 0 means nothing to inherit (ACLs disabled, symlink,
 * or no default ACL), negative errno on failure. When no default ACL
 * exists, the registered umask callback (if any) is applied to *mode.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Force-fetch the directory's xattrs only if never fetched before.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      // Copy: posix_acl_inherit_mode modifies the ACL buffer in place.
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// Inherited ACL is non-trivial; keep it as an access ACL unless
	// it is fully representable by the mode bits alone (r == 0).
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // New directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: fall back to the process umask, if provided.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14278
14279void Client::set_filer_flags(int flags)
14280{
11fdf7f2
TL
14281 std::lock_guard l(client_lock);
14282 ceph_assert(flags == 0 ||
7c673cae
FG
14283 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14284 objecter->add_global_op_flags(flags);
14285}
14286
14287void Client::clear_filer_flags(int flags)
14288{
11fdf7f2
TL
14289 std::lock_guard l(client_lock);
14290 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
14291 objecter->clear_global_op_flag(flags);
14292}
14293
11fdf7f2
TL
14294// called before mount
14295void Client::set_uuid(const std::string& uuid)
14296{
14297 std::lock_guard l(client_lock);
14298 assert(initialized);
14299 assert(!uuid.empty());
14300
14301 metadata["uuid"] = uuid;
14302 _close_sessions();
14303}
14304
14305// called before mount. 0 means infinite
14306void Client::set_session_timeout(unsigned timeout)
14307{
14308 std::lock_guard l(client_lock);
14309 assert(initialized);
14310
14311 metadata["timeout"] = stringify(timeout);
14312}
14313
/*
 * Called before mount: reclaim the MDS session state of a previous
 * client instance identified by `uuid` (e.g. after a failover of an
 * NFS gateway that embeds this client).
 *
 * Iterates over all in-MDS ranks, opening a session to each and sending
 * MClientReclaim, retrying a rank until it reports success or failure.
 * Returns 0 on success; -EINVAL on bad arguments or our own uuid;
 * -ENOENT when CEPH_RECLAIM_RESET was requested but no target session
 * exists; -EOPNOTSUPP when an MDS lacks reclaim support;
 * -ENOTRECOVERABLE when the target session cannot be safely taken over;
 * or another negative errno.
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // Refuse to reclaim our own uuid.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Wait until we have received a real mdsmap.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: `mds` is only advanced once that rank's reclaim succeeds; every
  // other branch waits (or returns) and retries the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply().
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      // RECLAIM_OK: this rank is done, move on to the next one.
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // Remember the reclaimed uuid until finish_reclaim() adopts it.
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14412
14413void Client::finish_reclaim()
14414{
14415 auto it = metadata.find("reclaiming_uuid");
14416 if (it == metadata.end()) {
14417 for (auto &p : mds_sessions)
14418 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14419 return;
14420 }
14421
14422 for (auto &p : mds_sessions) {
14423 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14424 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14425 p.second.con->send_message2(std::move(m));
14426 }
14427
14428 metadata["uuid"] = it->second;
14429 metadata.erase(it);
14430}
14431
/*
 * Handle an MDS's reply to MClientReclaim: record per-session
 * success/failure, remember the target session's addrs and the highest
 * OSD epoch seen, then wake start_reclaim() blocked on
 * waiting_for_reclaim.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // Track the maximum epoch across ranks; start_reclaim() waits on it.
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14456
7c673cae
FG
14457/**
14458 * This is included in cap release messages, to cause
14459 * the MDS to wait until this OSD map epoch. It is necessary
14460 * in corner cases where we cancel RADOS ops, so that
14461 * nobody else tries to do IO to the same objects in
14462 * the same epoch as the cancelled ops.
14463 */
14464void Client::set_cap_epoch_barrier(epoch_t e)
14465{
14466 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14467 cap_epoch_barrier = e;
14468}
14469
// Config options this client observes; changes to any of them are
// delivered via handle_conf_change(). The array must stay
// NULL-terminated.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14482
11fdf7f2 14483void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
14484 const std::set <std::string> &changed)
14485{
11fdf7f2 14486 std::lock_guard lock(client_lock);
7c673cae 14487
181888fb 14488 if (changed.count("client_cache_mid")) {
7c673cae
FG
14489 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14490 }
14491 if (changed.count("client_acl_type")) {
14492 acl_type = NO_ACL;
14493 if (cct->_conf->client_acl_type == "posix_acl")
14494 acl_type = POSIX_ACL;
14495 }
14496}
14497
7c673cae
FG
// boost::intrusive_ptr hook: take a reference on the Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14502
// boost::intrusive_ptr hook: drop a reference through the owning client.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14507
14508mds_rank_t Client::_get_random_up_mds() const
14509{
11fdf7f2 14510 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
14511
14512 std::set<mds_rank_t> up;
14513 mdsmap->get_up_mds_set(up);
14514
14515 if (up.empty())
14516 return MDS_RANK_NONE;
14517 std::set<mds_rank_t>::const_iterator p = up.begin();
14518 for (int n = rand() % up.size(); n; n--)
14519 ++p;
14520 return *p;
14521}
14522
14523
// A Client that owns its own Objecter (allocated here) rather than
// sharing one provided by a larger process.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14530
StandaloneClient::~StandaloneClient()
{
  // The objecter was allocated by our constructor, so delete it here.
  delete objecter;
  objecter = nullptr;
}
14536
/*
 * Bring up the standalone client: timer, object cacher, objecter,
 * dispatchers, and the mon client. On mon client init failure,
 * everything started so far is torn down again; note that client_lock
 * is released before objecter/objectcacher shutdown on that path.
 * Returns 0 on success or a negative errno from monclient->init().
 */
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // Complete generic Client initialization outside client_lock.
  _finish_init();

  return 0;
}
14567
// Tear down: generic Client state first, then the objecter and mon
// client that this standalone variant drives itself.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}