]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph nautilus 14.2.2
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
73#include "common/Mutex.h"
74#include "common/perf_counters.h"
75#include "common/admin_socket.h"
76#include "common/errno.h"
77#include "include/str_list.h"
78
79#define dout_subsys ceph_subsys_client
80
81#include "include/lru.h"
82#include "include/compat.h"
83#include "include/stringify.h"
84
85#include "Client.h"
86#include "Inode.h"
87#include "Dentry.h"
b32b8144 88#include "Delegation.h"
7c673cae
FG
89#include "Dir.h"
90#include "ClientSnapRealm.h"
91#include "Fh.h"
92#include "MetaSession.h"
93#include "MetaRequest.h"
94#include "ObjecterWriteback.h"
95#include "posix_acl.h"
96
11fdf7f2 97#include "include/ceph_assert.h"
7c673cae
FG
98#include "include/stat.h"
99
100#include "include/cephfs/ceph_statx.h"
101
102#if HAVE_GETGROUPLIST
103#include <grp.h>
104#include <pwd.h>
105#include <unistd.h>
106#endif
107
108#undef dout_prefix
109#define dout_prefix *_dout << "client." << whoami << " "
110
111#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112
113// FreeBSD fails to define this
114#ifndef O_DSYNC
115#define O_DSYNC 0x0
116#endif
117// Darwin fails to define this
118#ifndef O_RSYNC
119#define O_RSYNC 0x0
120#endif
121
122#ifndef O_DIRECT
123#define O_DIRECT 0x0
124#endif
125
126#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129{
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132}
133
134
135// -------------
136
137Client::CommandHook::CommandHook(Client *client) :
138 m_client(client)
139{
140}
141
11fdf7f2
TL
142bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
7c673cae 145{
11fdf7f2 146 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae
FG
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
11fdf7f2 150 m_client->dump_mds_requests(f.get());
7c673cae 151 else if (command == "mds_sessions")
11fdf7f2 152 m_client->dump_mds_sessions(f.get());
7c673cae 153 else if (command == "dump_cache")
11fdf7f2 154 m_client->dump_cache(f.get());
7c673cae
FG
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
11fdf7f2 158 m_client->dump_status(f.get());
7c673cae 159 else
11fdf7f2 160 ceph_abort_msg("bad command registered");
7c673cae
FG
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
7c673cae
FG
164 return true;
165}
166
167
168// -------------
169
170dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
171 : inode(in), offset(0), next_offset(2),
172 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
173 perms(perms)
174 { }
175
176void Client::_reset_faked_inos()
177{
178 ino_t start = 1024;
179 free_faked_inos.clear();
180 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
181 last_used_faked_ino = 0;
11fdf7f2 182 last_used_faked_root = 0;
7c673cae
FG
183 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
184}
185
186void Client::_assign_faked_ino(Inode *in)
187{
11fdf7f2
TL
188 if (0 == last_used_faked_ino)
189 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
190 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
191 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 192 last_used_faked_ino = 2048;
7c673cae
FG
193 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
194 }
11fdf7f2 195 ceph_assert(it != free_faked_inos.end());
7c673cae 196 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 197 ceph_assert(it.get_len() > 0);
7c673cae
FG
198 last_used_faked_ino = it.get_start();
199 } else {
200 ++last_used_faked_ino;
11fdf7f2 201 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
202 }
203 in->faked_ino = last_used_faked_ino;
204 free_faked_inos.erase(in->faked_ino);
205 faked_ino_map[in->faked_ino] = in->vino();
206}
207
11fdf7f2
TL
208/*
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
214*/
215void Client::_assign_faked_root(Inode *in)
216{
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232}
233
7c673cae
FG
234void Client::_release_faked_ino(Inode *in)
235{
236 free_faked_inos.insert(in->faked_ino);
237 faked_ino_map.erase(in->faked_ino);
238}
239
240vinodeno_t Client::_map_faked_ino(ino_t ino)
241{
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
250 return vino;
251}
252
253vinodeno_t Client::map_faked_ino(ino_t ino)
254{
11fdf7f2 255 std::lock_guard lock(client_lock);
7c673cae
FG
256 return _map_faked_ino(ino);
257}
258
259// cons/des
260
261Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
262 : Dispatcher(m->cct),
7c673cae 263 timer(m->cct, client_lock),
11fdf7f2
TL
264 client_lock("Client::client_lock"),
265 messenger(m),
266 monclient(mc),
267 objecter(objecter_),
268 whoami(mc->get_global_id()),
7c673cae
FG
269 async_ino_invalidator(m->cct),
270 async_dentry_invalidator(m->cct),
271 interrupt_finisher(m->cct),
272 remount_finisher(m->cct),
273 objecter_finisher(m->cct),
11fdf7f2
TL
274 m_command_hook(this),
275 fscid(0)
7c673cae
FG
276{
277 _reset_faked_inos();
7c673cae 278
7c673cae
FG
279 user_id = cct->_conf->client_mount_uid;
280 group_id = cct->_conf->client_mount_gid;
281
7c673cae
FG
282 if (cct->_conf->client_acl_type == "posix_acl")
283 acl_type = POSIX_ACL;
284
7c673cae
FG
285 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
286
287 // file handles
288 free_fd_set.insert(10, 1<<30);
289
290 mdsmap.reset(new MDSMap);
291
292 // osd interfaces
293 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
294 &client_lock));
295 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
296 client_flush_set_callback, // all commit callback
297 (void*)this,
298 cct->_conf->client_oc_size,
299 cct->_conf->client_oc_max_objects,
300 cct->_conf->client_oc_max_dirty,
301 cct->_conf->client_oc_target_dirty,
302 cct->_conf->client_oc_max_dirty_age,
303 true));
304 objecter_finisher.start();
305 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 306 objecter->enable_blacklist_events();
7c673cae
FG
307}
308
309
310Client::~Client()
311{
11fdf7f2 312 ceph_assert(!client_lock.is_locked());
7c673cae 313
31f18b77
FG
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 client_lock.Lock();
7c673cae 318 tear_down_cache();
31f18b77 319 client_lock.Unlock();
7c673cae
FG
320}
321
322void Client::tear_down_cache()
323{
324 // fd's
325 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
326 it != fd_map.end();
327 ++it) {
328 Fh *fh = it->second;
11fdf7f2 329 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
330 _release_fh(fh);
331 }
332 fd_map.clear();
333
334 while (!opened_dirs.empty()) {
335 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 336 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
337 _closedir(dirp);
338 }
339
340 // caps!
341 // *** FIXME ***
342
343 // empty lru
7c673cae 344 trim_cache();
11fdf7f2 345 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
346
347 // close root ino
11fdf7f2 348 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
349 if (root && inode_map.size() == 1 + root_parents.size()) {
350 delete root;
351 root = 0;
352 root_ancestor = 0;
353 while (!root_parents.empty())
354 root_parents.erase(root_parents.begin());
355 inode_map.clear();
356 _reset_faked_inos();
357 }
358
11fdf7f2 359 ceph_assert(inode_map.empty());
7c673cae
FG
360}
361
362inodeno_t Client::get_root_ino()
363{
11fdf7f2 364 std::lock_guard l(client_lock);
7c673cae
FG
365 if (use_faked_inos())
366 return root->faked_ino;
367 else
368 return root->ino;
369}
370
371Inode *Client::get_root()
372{
11fdf7f2 373 std::lock_guard l(client_lock);
7c673cae
FG
374 root->ll_get();
375 return root;
376}
377
378
379// debug crapola
380
381void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
382{
383 filepath path;
384 in->make_long_path(path);
385 ldout(cct, 1) << "dump_inode: "
386 << (disconnected ? "DISCONNECTED ":"")
387 << "inode " << in->ino
388 << " " << path
389 << " ref " << in->get_num_ref()
390 << *in << dendl;
391
392 if (f) {
393 f->open_object_section("inode");
394 f->dump_stream("path") << path;
395 if (disconnected)
396 f->dump_int("disconnected", 1);
397 in->dump(f);
398 f->close_section();
399 }
400
401 did.insert(in);
402 if (in->dir) {
403 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
404 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
405 it != in->dir->dentries.end();
406 ++it) {
407 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
408 if (f) {
409 f->open_object_section("dentry");
410 it->second->dump(f);
411 f->close_section();
412 }
413 if (it->second->inode)
414 dump_inode(f, it->second->inode.get(), did, false);
415 }
416 }
417}
418
419void Client::dump_cache(Formatter *f)
420{
421 set<Inode*> did;
422
11fdf7f2 423 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
424
425 if (f)
426 f->open_array_section("cache");
427
428 if (root)
429 dump_inode(f, root, did, true);
430
431 // make a second pass to catch anything disconnected
432 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
433 it != inode_map.end();
434 ++it) {
435 if (did.count(it->second))
436 continue;
437 dump_inode(f, it->second, did, true);
438 }
439
440 if (f)
441 f->close_section();
442}
443
444void Client::dump_status(Formatter *f)
445{
11fdf7f2 446 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
447
448 ldout(cct, 1) << __func__ << dendl;
449
450 const epoch_t osd_epoch
451 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
452
453 if (f) {
454 f->open_object_section("metadata");
455 for (const auto& kv : metadata)
456 f->dump_string(kv.first.c_str(), kv.second);
457 f->close_section();
458
459 f->dump_int("dentry_count", lru.lru_get_size());
460 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
461 f->dump_int("id", get_nodeid().v);
11fdf7f2 462 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 463 f->dump_object("inst", inst);
11fdf7f2
TL
464 f->dump_object("addr", inst.addr);
465 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
466 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
467 f->dump_int("inode_count", inode_map.size());
468 f->dump_int("mds_epoch", mdsmap->get_epoch());
469 f->dump_int("osd_epoch", osd_epoch);
470 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f64942e4 471 f->dump_bool("blacklisted", blacklisted);
7c673cae
FG
472 }
473}
474
475int Client::init()
476{
477 timer.init();
478 objectcacher->start();
479
480 client_lock.Lock();
11fdf7f2 481 ceph_assert(!initialized);
7c673cae
FG
482
483 messenger->add_dispatcher_tail(this);
484 client_lock.Unlock();
485
486 _finish_init();
487 return 0;
488}
489
490void Client::_finish_init()
491{
492 client_lock.Lock();
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
11fdf7f2
TL
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
7c673cae
FG
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502
503 client_lock.Unlock();
504
11fdf7f2 505 cct->_conf.add_observer(this);
7c673cae
FG
506
507 AdminSocket* admin_socket = cct->get_admin_socket();
508 int ret = admin_socket->register_command("mds_requests",
509 "mds_requests",
510 &m_command_hook,
511 "show in-progress mds requests");
512 if (ret < 0) {
513 lderr(cct) << "error registering admin socket command: "
514 << cpp_strerror(-ret) << dendl;
515 }
516 ret = admin_socket->register_command("mds_sessions",
517 "mds_sessions",
518 &m_command_hook,
519 "show mds session state");
520 if (ret < 0) {
521 lderr(cct) << "error registering admin socket command: "
522 << cpp_strerror(-ret) << dendl;
523 }
524 ret = admin_socket->register_command("dump_cache",
525 "dump_cache",
526 &m_command_hook,
527 "show in-memory metadata cache contents");
528 if (ret < 0) {
529 lderr(cct) << "error registering admin socket command: "
530 << cpp_strerror(-ret) << dendl;
531 }
532 ret = admin_socket->register_command("kick_stale_sessions",
533 "kick_stale_sessions",
534 &m_command_hook,
535 "kick sessions that were remote reset");
536 if (ret < 0) {
537 lderr(cct) << "error registering admin socket command: "
538 << cpp_strerror(-ret) << dendl;
539 }
540 ret = admin_socket->register_command("status",
541 "status",
542 &m_command_hook,
543 "show overall client status");
544 if (ret < 0) {
545 lderr(cct) << "error registering admin socket command: "
546 << cpp_strerror(-ret) << dendl;
547 }
548
549 client_lock.Lock();
550 initialized = true;
551 client_lock.Unlock();
552}
553
554void Client::shutdown()
555{
11fdf7f2 556 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
557
558 // If we were not mounted, but were being used for sending
559 // MDS commands, we may have sessions that need closing.
560 client_lock.Lock();
561 _close_sessions();
562 client_lock.Unlock();
563
11fdf7f2 564 cct->_conf.remove_observer(this);
7c673cae 565
11fdf7f2 566 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
567
568 if (ino_invalidate_cb) {
569 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
570 async_ino_invalidator.wait_for_empty();
571 async_ino_invalidator.stop();
572 }
573
574 if (dentry_invalidate_cb) {
575 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
576 async_dentry_invalidator.wait_for_empty();
577 async_dentry_invalidator.stop();
578 }
579
580 if (switch_interrupt_cb) {
581 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
582 interrupt_finisher.wait_for_empty();
583 interrupt_finisher.stop();
584 }
585
586 if (remount_cb) {
587 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
588 remount_finisher.wait_for_empty();
589 remount_finisher.stop();
590 }
591
592 objectcacher->stop(); // outside of client_lock! this does a join.
593
594 client_lock.Lock();
11fdf7f2 595 ceph_assert(initialized);
7c673cae
FG
596 initialized = false;
597 timer.shutdown();
598 client_lock.Unlock();
599
600 objecter_finisher.wait_for_empty();
601 objecter_finisher.stop();
602
603 if (logger) {
604 cct->get_perfcounters_collection()->remove(logger.get());
605 logger.reset();
606 }
607}
608
609
610// ===================
611// metadata cache stuff
612
613void Client::trim_cache(bool trim_kernel_dcache)
614{
181888fb
FG
615 uint64_t max = cct->_conf->client_cache_size;
616 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
617 unsigned last = 0;
618 while (lru.lru_get_size() != last) {
619 last = lru.lru_get_size();
620
181888fb 621 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
622
623 // trim!
31f18b77 624 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
625 if (!dn)
626 break; // done
627
628 trim_dentry(dn);
629 }
630
181888fb 631 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
632 _invalidate_kernel_dcache();
633
634 // hose root?
635 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
636 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
637 delete root;
638 root = 0;
639 root_ancestor = 0;
640 while (!root_parents.empty())
641 root_parents.erase(root_parents.begin());
642 inode_map.clear();
643 _reset_faked_inos();
644 }
645}
646
647void Client::trim_cache_for_reconnect(MetaSession *s)
648{
649 mds_rank_t mds = s->mds_num;
11fdf7f2 650 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
651
652 int trimmed = 0;
653 list<Dentry*> skipped;
654 while (lru.lru_get_size() > 0) {
655 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
656 if (!dn)
657 break;
658
659 if ((dn->inode && dn->inode->caps.count(mds)) ||
660 dn->dir->parent_inode->caps.count(mds)) {
661 trim_dentry(dn);
662 trimmed++;
663 } else
664 skipped.push_back(dn);
665 }
666
667 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
668 lru.lru_insert_mid(*p);
669
11fdf7f2 670 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
671 << " trimmed " << trimmed << " dentries" << dendl;
672
673 if (s->caps.size() > 0)
674 _invalidate_kernel_dcache();
675}
676
677void Client::trim_dentry(Dentry *dn)
678{
679 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
680 << " in dir "
681 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
682 << dendl;
683 if (dn->inode) {
684 Inode *diri = dn->dir->parent_inode;
685 diri->dir_release_count++;
686 clear_dir_complete_and_ordered(diri, true);
687 }
688 unlink(dn, false, false); // drop dir, drop dentry
689}
690
691
1adf2230
AA
692void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
693 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 694{
7c673cae
FG
695 uint64_t prior_size = in->size;
696
7c673cae
FG
697 if (truncate_seq > in->truncate_seq ||
698 (truncate_seq == in->truncate_seq && size > in->size)) {
699 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
700 in->size = size;
701 in->reported_size = size;
702 if (truncate_seq != in->truncate_seq) {
703 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
704 << truncate_seq << dendl;
705 in->truncate_seq = truncate_seq;
706 in->oset.truncate_seq = truncate_seq;
707
708 // truncate cached file data
709 if (prior_size > size) {
710 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
711 }
712 }
713
714 // truncate inline data
715 if (in->inline_version < CEPH_INLINE_NONE) {
716 uint32_t len = in->inline_data.length();
717 if (size < len)
718 in->inline_data.splice(size, len - size);
719 }
720 }
721 if (truncate_seq >= in->truncate_seq &&
722 in->truncate_size != truncate_size) {
723 if (in->is_file()) {
724 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
725 << truncate_size << dendl;
726 in->truncate_size = truncate_size;
727 in->oset.truncate_size = truncate_size;
728 } else {
729 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
730 }
731 }
1adf2230
AA
732}
733
734void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
735 utime_t ctime, utime_t mtime, utime_t atime)
736{
737 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
738 << " ctime " << ctime << " mtime " << mtime << dendl;
739
740 if (time_warp_seq > in->time_warp_seq)
741 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
742 << " is higher than local time_warp_seq "
743 << in->time_warp_seq << dendl;
744
745 int warn = false;
7c673cae
FG
746 // be careful with size, mtime, atime
747 if (issued & (CEPH_CAP_FILE_EXCL|
748 CEPH_CAP_FILE_WR|
749 CEPH_CAP_FILE_BUFFER|
750 CEPH_CAP_AUTH_EXCL|
751 CEPH_CAP_XATTR_EXCL)) {
752 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
753 if (ctime > in->ctime)
754 in->ctime = ctime;
755 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
756 //the mds updated times, so take those!
757 in->mtime = mtime;
758 in->atime = atime;
759 in->time_warp_seq = time_warp_seq;
760 } else if (time_warp_seq == in->time_warp_seq) {
761 //take max times
762 if (mtime > in->mtime)
763 in->mtime = mtime;
764 if (atime > in->atime)
765 in->atime = atime;
766 } else if (issued & CEPH_CAP_FILE_EXCL) {
767 //ignore mds values as we have a higher seq
768 } else warn = true;
769 } else {
770 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
771 if (time_warp_seq >= in->time_warp_seq) {
772 in->ctime = ctime;
773 in->mtime = mtime;
774 in->atime = atime;
775 in->time_warp_seq = time_warp_seq;
776 } else warn = true;
777 }
778 if (warn) {
779 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
780 << time_warp_seq << " is lower than local time_warp_seq "
781 << in->time_warp_seq
782 << dendl;
783 }
784}
785
786void Client::_fragmap_remove_non_leaves(Inode *in)
787{
788 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
789 if (!in->dirfragtree.is_leaf(p->first))
790 in->fragmap.erase(p++);
791 else
792 ++p;
793}
794
795void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
796{
797 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
798 if (p->second == mds)
799 in->fragmap.erase(p++);
800 else
801 ++p;
802}
803
804Inode * Client::add_update_inode(InodeStat *st, utime_t from,
805 MetaSession *session,
806 const UserPerm& request_perms)
807{
808 Inode *in;
809 bool was_new = false;
810 if (inode_map.count(st->vino)) {
811 in = inode_map[st->vino];
11fdf7f2 812 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
813 } else {
814 in = new Inode(this, st->vino, &st->layout);
815 inode_map[st->vino] = in;
816
817 if (use_faked_inos())
818 _assign_faked_ino(in);
819
820 if (!root) {
821 root = in;
11fdf7f2
TL
822 if (use_faked_inos())
823 _assign_faked_root(root);
7c673cae
FG
824 root_ancestor = in;
825 cwd = root;
826 } else if (!mounted) {
827 root_parents[root_ancestor] = in;
828 root_ancestor = in;
829 }
830
831 // immutable bits
832 in->ino = st->vino.ino;
833 in->snapid = st->vino.snapid;
834 in->mode = st->mode & S_IFMT;
835 was_new = true;
836 }
837
838 in->rdev = st->rdev;
839 if (in->is_symlink())
840 in->symlink = st->symlink;
841
7c673cae 842 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
843 bool new_version = false;
844 if (in->version == 0 ||
845 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
846 (in->version & ~1) < st->version))
847 new_version = true;
7c673cae 848
1adf2230
AA
849 int issued;
850 in->caps_issued(&issued);
851 issued |= in->caps_dirty();
852 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 853
1adf2230
AA
854 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
855 !(issued & CEPH_CAP_AUTH_EXCL)) {
856 in->mode = st->mode;
857 in->uid = st->uid;
858 in->gid = st->gid;
859 in->btime = st->btime;
81eedcae 860 in->snap_btime = st->snap_btime;
1adf2230 861 }
7c673cae 862
1adf2230
AA
863 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
864 !(issued & CEPH_CAP_LINK_EXCL)) {
865 in->nlink = st->nlink;
866 }
7c673cae 867
1adf2230
AA
868 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
869 update_inode_file_time(in, issued, st->time_warp_seq,
870 st->ctime, st->mtime, st->atime);
871 }
7c673cae 872
1adf2230
AA
873 if (new_version ||
874 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 875 in->layout = st->layout;
1adf2230
AA
876 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
877 }
7c673cae 878
1adf2230
AA
879 if (in->is_dir()) {
880 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
881 in->dirstat = st->dirstat;
882 }
883 // dir_layout/rstat/quota are not tracked by capability, update them only if
884 // the inode stat is from auth mds
885 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
886 in->dir_layout = st->dir_layout;
887 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
888 in->rstat = st->rstat;
889 in->quota = st->quota;
11fdf7f2 890 in->dir_pin = st->dir_pin;
1adf2230
AA
891 }
892 // move me if/when version reflects fragtree changes.
893 if (in->dirfragtree != st->dirfragtree) {
894 in->dirfragtree = st->dirfragtree;
895 _fragmap_remove_non_leaves(in);
7c673cae 896 }
7c673cae
FG
897 }
898
899 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
900 st->xattrbl.length() &&
901 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
902 auto p = st->xattrbl.cbegin();
903 decode(in->xattrs, p);
7c673cae
FG
904 in->xattr_version = st->xattr_version;
905 }
906
1adf2230
AA
907 if (st->inline_version > in->inline_version) {
908 in->inline_data = st->inline_data;
909 in->inline_version = st->inline_version;
7c673cae
FG
910 }
911
1adf2230
AA
912 /* always take a newer change attr */
913 if (st->change_attr > in->change_attr)
914 in->change_attr = st->change_attr;
915
916 if (st->version > in->version)
917 in->version = st->version;
918
919 if (was_new)
920 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
921
922 if (!st->cap.caps)
923 return in; // as with readdir returning indoes in different snaprealms (no caps!)
924
7c673cae 925 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
926 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
927 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
928 st->cap.flags, request_perms);
28e407b8 929 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 930 in->max_size = st->max_size;
28e407b8
AA
931 in->rstat = st->rstat;
932 }
7c673cae 933
1adf2230
AA
934 // setting I_COMPLETE needs to happen after adding the cap
935 if (in->is_dir() &&
936 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
937 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
938 in->dirstat.nfiles == 0 &&
939 in->dirstat.nsubdirs == 0) {
940 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
941 in->flags |= I_COMPLETE | I_DIR_ORDERED;
942 if (in->dir) {
943 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
944 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
945 in->dir->readdir_cache.clear();
946 for (const auto& p : in->dir->dentries) {
947 unlink(p.second, true, true); // keep dir, keep dentry
948 }
949 if (in->dir->dentries.empty())
950 close_dir(in->dir);
7c673cae 951 }
7c673cae 952 }
1adf2230
AA
953 } else {
954 in->snap_caps |= st->cap.caps;
7c673cae
FG
955 }
956
957 return in;
958}
959
960
961/*
962 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
963 */
964Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
965 Inode *in, utime_t from, MetaSession *session,
966 Dentry *old_dentry)
967{
968 Dentry *dn = NULL;
969 if (dir->dentries.count(dname))
970 dn = dir->dentries[dname];
971
11fdf7f2 972 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
973 << " in dir " << dir->parent_inode->vino() << " dn " << dn
974 << dendl;
975
976 if (dn && dn->inode) {
977 if (dn->inode->vino() == in->vino()) {
978 touch_dn(dn);
979 ldout(cct, 12) << " had dentry " << dname
980 << " with correct vino " << dn->inode->vino()
981 << dendl;
982 } else {
983 ldout(cct, 12) << " had dentry " << dname
984 << " with WRONG vino " << dn->inode->vino()
985 << dendl;
986 unlink(dn, true, true); // keep dir, keep dentry
987 }
988 }
989
990 if (!dn || !dn->inode) {
991 InodeRef tmp_ref(in);
992 if (old_dentry) {
993 if (old_dentry->dir != dir) {
994 Inode *old_diri = old_dentry->dir->parent_inode;
995 old_diri->dir_ordered_count++;
996 clear_dir_complete_and_ordered(old_diri, false);
997 }
998 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
999 }
1000 Inode *diri = dir->parent_inode;
1001 diri->dir_ordered_count++;
1002 clear_dir_complete_and_ordered(diri, false);
1003 dn = link(dir, dname, in, dn);
1004 }
1005
1006 update_dentry_lease(dn, dlease, from, session);
1007 return dn;
1008}
1009
1010void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1011{
1012 utime_t dttl = from;
1013 dttl += (float)dlease->duration_ms / 1000.0;
1014
11fdf7f2 1015 ceph_assert(dn);
7c673cae
FG
1016
1017 if (dlease->mask & CEPH_LOCK_DN) {
1018 if (dttl > dn->lease_ttl) {
1019 ldout(cct, 10) << "got dentry lease on " << dn->name
1020 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1021 dn->lease_ttl = dttl;
1022 dn->lease_mds = session->mds_num;
1023 dn->lease_seq = dlease->seq;
1024 dn->lease_gen = session->cap_gen;
1025 }
1026 }
1027 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1028}
1029
1030
/*
 * update MDS location cache for a single inode
 *
 * Records which MDS rank is authoritative for the directory fragment
 * described by @dst, and keeps the client's fragtree/fragmap consistent.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // Negative auth means "unknown" — drop any stale mapping.
    in->fragmap.erase(dst->frag);
  }
  // If the MDS reports a frag that our cached fragtree splits further,
  // collapse the tree to match and purge fragmap entries for non-leaves.
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
              << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
              << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1065
1066void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1067{
1068 if (diri->flags & I_COMPLETE) {
1069 if (complete) {
1070 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1071 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1072 } else {
1073 if (diri->flags & I_DIR_ORDERED) {
1074 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1075 diri->flags &= ~I_DIR_ORDERED;
1076 }
1077 }
1078 if (diri->dir)
1079 diri->dir->readdir_cache.clear();
1080 }
1081}
1082
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the "extra" bufferlist of an MDS readdir/lssnap reply (DirStat,
 * then a sequence of dentry name + LeaseStat + InodeStat records), links
 * the resulting dentries/inodes into the cache, and fills the dir_result_t
 * buffer used to serve the in-progress readdir.  Decode order must match
 * the MDS encoding exactly.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // Newer MDSs use a version-tagged encoding; signal that with an
  // all-ones feature mask, otherwise fall back to the connection features.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      // lssnap results belong under the virtual .snap directory.
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0/1).
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      // The MDS answered for a different fragment (e.g. after a
      // split/merge); restart offsets for the fragment it gave us.
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // If this is the very beginning of the directory listing, snapshot the
    // directory's change counters so we can tell later whether the readdir
    // cache we build stays valid.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // Per-entry wire format: name, dentry lease, inode stat.
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// Offsets restart whenever the name hash changes.
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache -- only while the snapshot taken above is
      // still current (directory unchanged since we started).
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;  // wrapped: next fragment starts at the first real entry
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1244
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the (optional) dentry and target-inode records from the reply's
 * trace bufferlist and merges them into the client cache.  Handles the
 * "traceless reply" case (no trace: invalidate affected cached state),
 * snap-trace updates, and the readdir/lssnap follow-on decoding.
 * Returns the target inode, or NULL when the reply carried no trace.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // The unsafe (early) reply already populated the cache; the safe reply
    // must not carry a trace.
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    // Traceless reply: we can't tell exactly what changed, so invalidate
    // the completeness of the affected directory.
    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // See insert_readdir_results(): all-ones feature mask selects the
  // version-tagged reply encoding on newer MDSs.
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    // Dentry trace: parent dir inode stat, dir stat, name, dentry lease.
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // Debug check: if we asked for xattrs, the reply must carry them.
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // Dentry trace without a target inode: a negative dentry.  Drop any
      // cached positive link and optionally record the (negative) lease.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    // Synthesized lease with zero duration: link but don't cache a lease.
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1420
1421// -------
1422
/*
 * Pick the MDS rank a request should be sent to.
 *
 * Preference order: an explicitly requested resend target, then (unless
 * random placement is configured) the dirfrag hash of the path/dentry
 * being operated on, then the inode's caps (auth cap if the op requires
 * the auth MDS), and finally a random active MDS.  When the choice came
 * from a dirfrag hash, *phash_diri is set to the directory inode so the
 * caller can invalidate the fragmap if that MDS turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    // A previous forward/retry pinned the target; use it once.
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // Hash the first path component to find the owning dirfrag.
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // Negative dentry: hash its name within the parent directory.
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // Snapped inodes have no caps of their own; walk up to the nearest
      // non-snap ancestor and target its MDS instead.
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      // Map the hash through the fragtree to a fragment, then to the MDS
      // we last saw as authoritative for that fragment.
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // Fall back to caps: auth cap when the op needs the auth MDS,
    // otherwise any MDS we hold caps from.
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1525
1526
1527void Client::connect_mds_targets(mds_rank_t mds)
1528{
11fdf7f2
TL
1529 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1530 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1531 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1532 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1533 q != info.export_targets.end();
1534 ++q) {
1535 if (mds_sessions.count(*q) == 0 &&
1536 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1537 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1538 << " export target mds." << *q << dendl;
1539 _open_mds_session(*q);
1540 }
1541 }
1542}
1543
/*
 * Dump this client's identity and all MDS sessions to a Formatter
 * (admin socket "mds_sessions" output).
 */
void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}
1560void Client::dump_mds_requests(Formatter *f)
1561{
1562 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1563 p != mds_requests.end();
1564 ++p) {
1565 f->open_object_section("request");
1566 p->second->dump(f);
1567 f->close_section();
1568 }
1569}
1570
/*
 * Resolve the target inode of a completed request, coping with traceless
 * replies (e.g. after MDS restart/replay).
 *
 * Sets *pcreated to whether this request actually created the inode (the
 * reply's extra bufferlist carries the created ino).  If the reply had no
 * trace, re-resolves the target by lookup/getattr; returns -EINTR when the
 * created ino and the re-looked-up ino disagree.
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace() already resolved the target for us.
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen?  i want logs!");
	}
      } else {
	// No dentry: refresh the inode we operated on directly.
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1643
1644
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests can block indefinitely, so they are excluded
  // from the oldest-tid tracking reported to the MDS.
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // Retry loop: choose an MDS, ensure a session, send, and wait.  Each
  // `continue` re-evaluates the target after a state change.
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// The chosen rank no longer exists; forget the stale fragmap
	// entry (if the choice came from one) or retry a random MDS.
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // We only get here via abort (explicit or blacklist).
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1808
1809void Client::unregister_request(MetaRequest *req)
1810{
1811 mds_requests.erase(req->tid);
1812 if (req->tid == oldest_tid) {
1813 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1814 while (true) {
1815 if (p == mds_requests.end()) {
1816 oldest_tid = 0;
1817 break;
1818 }
1819 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1820 oldest_tid = p->first;
1821 break;
1822 }
1823 ++p;
1824 }
1825 }
1826 put_request(req);
1827}
1828
1829void Client::put_request(MetaRequest *request)
1830{
1831 if (request->_put()) {
1832 int op = -1;
1833 if (request->success)
1834 op = request->get_op();
1835 InodeRef other_in;
1836 request->take_other_inode(&other_in);
1837 delete request;
1838
1839 if (other_in &&
1840 (op == CEPH_MDS_OP_RMDIR ||
1841 op == CEPH_MDS_OP_RENAME ||
1842 op == CEPH_MDS_OP_RMSNAP)) {
1843 _try_to_trim_inode(other_in.get(), false);
1844 }
1845 }
1846}
1847
/*
 * Encode a cap release for @in into @req, to be piggybacked on the request.
 *
 * Drops the bits of @drop that we hold for @mds (unless any @unless bit is
 * issued, or the caps are dirty/in use).  With @force, a release record is
 * encoded even when nothing could actually be dropped.  Returns nonzero if
 * a release record was appended to req->cap_releases.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", have:" << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or actively in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      // Build the wire release record mirroring our (updated) cap state.
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1888
/*
 * Encode a dentry-lease release for @dn into @req.
 *
 * First (force-)releases caps on the parent directory inode; if that
 * appended a release record and we hold a lease on this dentry from @mds,
 * attach the dentry name/seq to that record so the MDS can drop the lease.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // Fill the dentry fields of the release record just appended by
    // encode_inode_release() above.
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
1908
1909
/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 *
 * Encodes cap/lease releases for every inode/dentry the request flagged
 * for dropping (inode, old_inode, other_inode, dentry, old_dentry).
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  // NOTE(review): exit log is missing the closing ")" present in the
  // matching enter log -- cosmetic only, left unchanged here.
  ldout(cct, 25) << __func__ << " exit (req: "
		 << req << ", mds " << mds <<dendl;
}
1946
1947bool Client::have_open_session(mds_rank_t mds)
1948{
11fdf7f2
TL
1949 const auto &it = mds_sessions.find(mds);
1950 return it != mds_sessions.end() &&
1951 (it->second.state == MetaSession::STATE_OPEN ||
1952 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1953}
1954
1955MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1956{
11fdf7f2
TL
1957 const auto &it = mds_sessions.find(mds);
1958 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1959 return NULL;
11fdf7f2
TL
1960 } else {
1961 return &it->second;
1962 }
7c673cae
FG
1963}
1964
1965MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1966{
11fdf7f2
TL
1967 auto it = mds_sessions.find(mds);
1968 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1969}
1970
1971/**
1972 * Populate a map of strings with client-identifying metadata,
1973 * such as the hostname. Call this once at initialization.
1974 */
1975void Client::populate_metadata(const std::string &mount_root)
1976{
1977 // Hostname
1978 struct utsname u;
1979 int r = uname(&u);
1980 if (r >= 0) {
1981 metadata["hostname"] = u.nodename;
1982 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1983 } else {
1984 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1985 }
1986
1987 metadata["pid"] = stringify(getpid());
1988
1989 // Ceph entity id (the '0' in "client.0")
1990 metadata["entity_id"] = cct->_conf->name.get_id();
1991
1992 // Our mount position
1993 if (!mount_root.empty()) {
1994 metadata["root"] = mount_root;
1995 }
1996
1997 // Ceph version
1998 metadata["ceph_version"] = pretty_version_to_str();
1999 metadata["ceph_sha1"] = git_version_to_str();
2000
2001 // Apply any metadata from the user's configured overrides
2002 std::vector<std::string> tokens;
2003 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2004 for (const auto &i : tokens) {
2005 auto eqpos = i.find("=");
2006 // Throw out anything that isn't of the form "<str>=<str>"
2007 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2008 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2009 continue;
2010 }
2011 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2012 }
2013}
2014
2015/**
2016 * Optionally add or override client metadata fields.
2017 */
2018void Client::update_metadata(std::string const &k, std::string const &v)
2019{
11fdf7f2
TL
2020 std::lock_guard l(client_lock);
2021 ceph_assert(initialized);
7c673cae 2022
11fdf7f2
TL
2023 auto it = metadata.find(k);
2024 if (it != metadata.end()) {
7c673cae 2025 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2026 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2027 }
2028
2029 metadata[k] = v;
2030}
2031
// Create a new MetaSession for the given rank and (normally) send the
// session-open request.  Precondition: no session for this rank exists
// yet.  Returns a pointer into mds_sessions (valid until erased).
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // Construct the session in place; emplace must succeed because callers
  // only get here when the rank has no session.
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // The daemon moved to a new address; the old rejection no longer
      // applies, so forget it and try opening again.
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  // Advertise our metadata and supported feature bits in the open request.
  auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2062
2063void Client::_close_mds_session(MetaSession *s)
2064{
11fdf7f2 2065 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2066 s->state = MetaSession::STATE_CLOSING;
11fdf7f2 2067 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2068}
2069
2070void Client::_closed_mds_session(MetaSession *s)
2071{
11fdf7f2 2072 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae
FG
2073 s->state = MetaSession::STATE_CLOSED;
2074 s->con->mark_down();
2075 signal_context_list(s->waiting_for_open);
2076 mount_cond.Signal();
2077 remove_session_caps(s);
2078 kick_requests_closed(s);
2079 mds_sessions.erase(s->mds_num);
7c673cae
FG
2080}
2081
// Dispatch a session-control message from an MDS: open/close acks, cap
// renewals, staleness notifications, recall/flush demands and rejections.
// Messages from connections with no matching session are dropped.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse the session if the MDS lacks features we require, and
      // remember the rejection so we do not immediately reopen.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	rejected_by_mds[session->mds_num] = session->addrs;
	_close_mds_session(session);
	_closed_mds_session(session);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.Signal();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the ack that matches our latest renew request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // Force the ttl into the past so all caps read as expired.
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NOTE: the inner 'm' deliberately shadows the message; it refers to
    // the pending cap-release message stashed on the session.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      // Remember the rejecting instance so _open_mds_session() will not
      // retry until the MDS restarts at a different address.
      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2175
2176bool Client::_any_stale_sessions() const
2177{
11fdf7f2 2178 ceph_assert(client_lock.is_locked_by_me());
7c673cae 2179
11fdf7f2
TL
2180 for (const auto &p : mds_sessions) {
2181 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2182 return true;
2183 }
2184 }
2185
2186 return false;
2187}
2188
2189void Client::_kick_stale_sessions()
2190{
11fdf7f2 2191 ldout(cct, 1) << __func__ << dendl;
7c673cae 2192
11fdf7f2
TL
2193 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2194 MetaSession &s = it->second;
2195 ++it;
2196 if (s.state == MetaSession::STATE_STALE)
2197 _closed_mds_session(&s);
7c673cae
FG
2198 }
2199}
2200
// Build a fresh wire request from 'request' and send it on 'session'.
// drop_cap_releases: discard (rather than piggy-back) cap releases when a
// cap reconnect has not been sent yet.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // Replaying an op the MDS already applied but has not committed.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool information, so pin the osdmap epoch
    // we validated it against.
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // First transmission of this request; stamp it for latency tracking.
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Record the cap migration seq seen at send time; used later to decide
  // whether retrying after an ESTALE reply could possibly help.
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2249
// Translate our in-memory MetaRequest into a wire-format MClientRequest,
// reconstructing the filepath from the inode/dentry when it was not set.
MClientRequest::ref Client::build_client_request(MetaRequest *request)
{
  auto req = MClientRequest::create(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// Null dentry: path of the parent dir plus the dentry's name.
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
			 << " No path, inode, or appropriately-endowed dentry given!"
			 << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
			 << " No path, inode, or dentry given!"
			 << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // Bump retry_attempt here so each (re)send is distinguishable by the MDS.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2287
2288
2289
// The MDS forwarded our request to another rank: detach the request from
// its old session and wake the caller thread so it resends to the new one.
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
		 << " fwd " << fwd->get_num_fwd()
		 << " to mds." << fwd->get_dest_mds()
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;

  // mds == -1 marks the request as not-yet-sent; the caller thread picks
  // resend_mds as the next target when it wakes up.
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();
}
2324
2325bool Client::is_dir_operation(MetaRequest *req)
2326{
2327 int op = req->get_op();
2328 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2329 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2330 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2331 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2332 return true;
2333 return false;
2334}
2335
11fdf7f2 2336void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2337{
2338 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2339 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2340 if (!session) {
7c673cae
FG
2341 return;
2342 }
2343
2344 ceph_tid_t tid = reply->get_tid();
2345 bool is_safe = reply->is_safe();
2346
2347 if (mds_requests.count(tid) == 0) {
11fdf7f2 2348 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2349 << " safe is:" << is_safe << dendl;
7c673cae
FG
2350 return;
2351 }
2352 MetaRequest *request = mds_requests.at(tid);
2353
11fdf7f2 2354 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2355 << " tid " << tid << dendl;
2356
2357 if (request->got_unsafe && !is_safe) {
2358 //duplicate response
2359 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2360 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2361 return;
2362 }
2363
2364 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2365 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2366 << " from mds." << request->mds << dendl;
2367 request->send_to_auth = true;
2368 request->resend_mds = choose_target_mds(request);
2369 Inode *in = request->inode();
11fdf7f2 2370 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2371 if (request->resend_mds >= 0 &&
2372 request->resend_mds == request->mds &&
2373 (in == NULL ||
11fdf7f2
TL
2374 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2375 request->sent_on_mseq == it->second.mseq)) {
2376 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae
FG
2377 } else {
2378 request->caller_cond->Signal();
7c673cae
FG
2379 return;
2380 }
7c673cae
FG
2381 }
2382
11fdf7f2 2383 ceph_assert(!request->reply);
7c673cae
FG
2384 request->reply = reply;
2385 insert_trace(request, session);
2386
2387 // Handle unsafe reply
2388 if (!is_safe) {
2389 request->got_unsafe = true;
2390 session->unsafe_requests.push_back(&request->unsafe_item);
2391 if (is_dir_operation(request)) {
2392 Inode *dir = request->inode();
11fdf7f2 2393 ceph_assert(dir);
7c673cae
FG
2394 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2395 }
2396 if (request->target) {
2397 InodeRef &in = request->target;
2398 in->unsafe_ops.push_back(&request->unsafe_target_item);
2399 }
2400 }
2401
2402 // Only signal the caller once (on the first reply):
2403 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2404 if (!is_safe || !request->got_unsafe) {
2405 Cond cond;
2406 request->dispatch_cond = &cond;
2407
2408 // wake up waiter
11fdf7f2 2409 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
7c673cae
FG
2410 request->caller_cond->Signal();
2411
2412 // wake for kick back
2413 while (request->dispatch_cond) {
11fdf7f2 2414 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
7c673cae
FG
2415 cond.Wait(client_lock);
2416 }
2417 }
2418
2419 if (is_safe) {
2420 // the filesystem change is committed to disk
2421 // we're done, clean up
2422 if (request->got_unsafe) {
2423 request->unsafe_item.remove_myself();
2424 request->unsafe_dir_item.remove_myself();
2425 request->unsafe_target_item.remove_myself();
2426 signal_cond_list(request->waitfor_safe);
2427 }
2428 request->item.remove_myself();
2429 unregister_request(request);
2430 }
2431 if (unmounting)
2432 mount_cond.Signal();
2433}
2434
2435void Client::_handle_full_flag(int64_t pool)
2436{
2437 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2438 << "on " << pool << dendl;
2439 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2440 // to do this rather than blocking, because otherwise when we fill up we
2441 // potentially lock caps forever on files with dirty pages, and we need
2442 // to be able to release those caps to the MDS so that it can delete files
2443 // and free up space.
2444 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2445
2446 // For all inodes with layouts in this pool and a pending flush write op
2447 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2448 // from ObjectCacher so that it doesn't re-issue the write in response to
2449 // the ENOSPC error.
2450 // Fortunately since we're cancelling everything in a given pool, we don't
2451 // need to know which ops belong to which ObjectSet, we can just blow all
2452 // the un-flushed cached data away and mark any dirty inodes' async_err
2453 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2454 // affecting this pool, and all the objectsets we're purging were also
2455 // in this pool.
2456 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2457 i != inode_map.end(); ++i)
2458 {
2459 Inode *inode = i->second;
2460 if (inode->oset.dirty_or_tx
2461 && (pool == -1 || inode->layout.pool_id == pool)) {
2462 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2463 << " has dirty objects, purging and setting ENOSPC" << dendl;
2464 objectcacher->purge_set(&inode->oset);
2465 inode->set_async_err(-ENOSPC);
2466 }
2467 }
2468
2469 if (cancelled_epoch != (epoch_t)-1) {
2470 set_cap_epoch_barrier(cancelled_epoch);
2471 }
2472}
2473
// Consume a new OSDMap: detect blacklisting of our own addresses (and
// recovery from it) and react to cluster/pool FULL flags.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
    });
  if (!blacklisted) {
    // Check each of our addresses against the new blacklist entries.
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Whole cluster is full; -1 means "all pools".
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2557
2558
2559// ------------------------
2560// incoming messages
2561
2562
// Messenger entry point: route each incoming message to its handler under
// client_lock.  Returns false for message types we do not own so the next
// dispatcher can try.  Also opportunistically trims the cache while an
// unmount is in progress.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(MMDSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(MFSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(MFSMapUser::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(MOSDMap::msgref_cast(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(MClientRequestForward::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(MClientReply::msgref_cast(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(MClientSnap::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(MClientCaps::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(MClientLease::msgref_cast(m));
    break;
  case MSG_COMMAND_REPLY:
    // Only command replies from an MDS are ours; others belong to a
    // different dispatcher.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(MCommandReply::msgref_cast(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(MClientQuota::msgref_cast(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      // Cache shrank; poke unmount() so it can re-check for completion.
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2644
// Install a newly received FSMap, wake threads waiting on it, and ack the
// subscription with the monitor.
void Client::handle_fs_map(const MConstRef<MFSMap>& m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}
2653
// Install a newly received user-visible FSMap, ack the subscription, and
// wake threads waiting on it.
void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
2662
11fdf7f2 2663void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
7c673cae 2664{
f64942e4 2665 mds_gid_t old_inc, new_inc;
7c673cae 2666 if (m->get_epoch() <= mdsmap->get_epoch()) {
11fdf7f2 2667 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
7c673cae
FG
2668 << " is identical to or older than our "
2669 << mdsmap->get_epoch() << dendl;
7c673cae 2670 return;
f64942e4 2671 }
7c673cae 2672
11fdf7f2 2673 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
7c673cae
FG
2674
2675 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2676 oldmap.swap(mdsmap);
2677
2678 mdsmap->decode(m->get_encoded());
2679
2680 // Cancel any commands for missing or laggy GIDs
2681 std::list<ceph_tid_t> cancel_ops;
2682 auto &commands = command_table.get_commands();
2683 for (const auto &i : commands) {
2684 auto &op = i.second;
2685 const mds_gid_t op_mds_gid = op.mds_gid;
2686 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2687 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2688 cancel_ops.push_back(i.first);
2689 if (op.outs) {
2690 std::ostringstream ss;
2691 ss << "MDS " << op_mds_gid << " went away";
2692 *(op.outs) = ss.str();
2693 }
2694 op.con->mark_down();
2695 if (op.on_finish) {
2696 op.on_finish->complete(-ETIMEDOUT);
2697 }
2698 }
2699 }
2700
2701 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2702 i != cancel_ops.end(); ++i) {
2703 command_table.erase(*i);
2704 }
2705
2706 // reset session
11fdf7f2 2707 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 2708 mds_rank_t mds = p->first;
11fdf7f2 2709 MetaSession *session = &p->second;
7c673cae
FG
2710 ++p;
2711
2712 int oldstate = oldmap->get_state(mds);
2713 int newstate = mdsmap->get_state(mds);
2714 if (!mdsmap->is_up(mds)) {
2715 session->con->mark_down();
11fdf7f2 2716 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f64942e4
AA
2717 old_inc = oldmap->get_incarnation(mds);
2718 new_inc = mdsmap->get_incarnation(mds);
2719 if (old_inc != new_inc) {
2720 ldout(cct, 1) << "mds incarnation changed from "
2721 << old_inc << " to " << new_inc << dendl;
2722 oldstate = MDSMap::STATE_NULL;
2723 }
7c673cae 2724 session->con->mark_down();
11fdf7f2 2725 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
2726 // When new MDS starts to take over, notify kernel to trim unused entries
2727 // in its dcache/icache. Hopefully, the kernel will release some unused
2728 // inodes before the new MDS enters reconnect state.
2729 trim_cache_for_reconnect(session);
2730 } else if (oldstate == newstate)
2731 continue; // no change
2732
2733 session->mds_state = newstate;
2734 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 2735 session->con = messenger->connect_to_mds(session->addrs);
7c673cae 2736 send_reconnect(session);
81eedcae
TL
2737 } else if (newstate > MDSMap::STATE_RECONNECT) {
2738 if (oldstate < MDSMap::STATE_RECONNECT) {
2739 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2740 _closed_mds_session(session);
2741 continue;
2742 }
2743 if (newstate >= MDSMap::STATE_ACTIVE) {
2744 if (oldstate < MDSMap::STATE_ACTIVE) {
2745 // kick new requests
2746 kick_requests(session);
2747 kick_flushing_caps(session);
2748 signal_context_list(session->waiting_for_open);
2749 wake_up_session_caps(session, true);
2750 }
2751 connect_mds_targets(mds);
7c673cae 2752 }
7c673cae
FG
2753 } else if (newstate == MDSMap::STATE_NULL &&
2754 mds >= mdsmap->get_max_mds()) {
2755 _closed_mds_session(session);
2756 }
2757 }
2758
2759 // kick any waiting threads
2760 signal_cond_list(waiting_for_mdsmap);
2761
7c673cae
FG
2762 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2763}
2764
// Send our cap/snaprealm state to an MDS entering RECONNECT, so it can
// rebuild its session for us.  May emit multiple messages when the MDS
// supports multi-reconnect and the state is large.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Any queued cap-release message is obsolete now.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      if (allow_multi &&
	  m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
	// Message getting too large: flush this chunk and continue in a
	// follow-up reconnect message.
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	// Cap was invalidated while the session was stale; keep only PIN.
	cap.gen = session->cap_gen;
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(), // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // Describe each snaprealm once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2857
2858
2859void Client::kick_requests(MetaSession *session)
2860{
11fdf7f2 2861 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2862 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2863 p != mds_requests.end();
2864 ++p) {
31f18b77
FG
2865 MetaRequest *req = p->second;
2866 if (req->got_unsafe)
2867 continue;
2868 if (req->aborted()) {
2869 if (req->caller_cond) {
2870 req->kick = true;
2871 req->caller_cond->Signal();
2872 }
7c673cae 2873 continue;
31f18b77
FG
2874 }
2875 if (req->retry_attempt > 0)
7c673cae 2876 continue; // new requests only
31f18b77 2877 if (req->mds == session->mds_num) {
7c673cae
FG
2878 send_request(p->second, session);
2879 }
2880 }
2881}
2882
// Replay state to an MDS entering reconnect: first the unsafe (applied
// but uncommitted) requests, then previously-sent requests so the MDS can
// process completed ones during its clientreplay stage.
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);
  }
}
2906
// Block until every currently-unsafe request has been committed by its
// MDS.  Waiting on the *last* unsafe request per session is sufficient
// because each session commits its requests in order.
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (const auto &p : mds_sessions) {
    const MetaSession &s = p.second;
    if (!s.unsafe_requests.empty()) {
      MetaRequest *req = s.unsafe_requests.back();
      // Take a ref so the request outlives its removal from the list.
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2928
/*
 * Abort all in-flight requests bound to a session that is being closed.
 * Waiting callers are kicked awake; requests that already received an
 * unsafe reply can never be made safe on a dead session, so they are
 * forcibly unregistered here.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    // advance before touching req: unregister_request() below may erase
    // this map entry and would invalidate p
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
2955
2956
2957
2958
2959/************
2960 * leases
2961 */
2962
2963void Client::got_mds_push(MetaSession *s)
2964{
2965 s->seq++;
2966 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2967 if (s->state == MetaSession::STATE_CLOSING) {
11fdf7f2 2968 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2969 }
2970}
2971
/*
 * Handle a dentry-lease revoke from the MDS: drop the cached lease if we
 * still have it, and always acknowledge with a LEASE_RELEASE reply.
 */
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // the MDS only ever pushes revokes; releases originate client-side
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // invalidate the cached lease
  }

 revoke:
  // always ack, even if we no longer cache the inode/dentry in question
  {
    auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3012
/*
 * Drop n references on an inode and free it when the count reaches zero.
 * Before deletion, remaining caps are released and the object cacher set
 * is torn down (it must already be clean at this point).
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // dropping the root inode: clear all root bookkeeping too
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
	root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3038
3039void Client::close_dir(Dir *dir)
3040{
3041 Inode *in = dir->parent_inode;
11fdf7f2
TL
3042 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3043 ceph_assert(dir->is_empty());
3044 ceph_assert(in->dir == dir);
3045 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3046 if (!in->dentries.empty())
7c673cae
FG
3047 in->get_first_parent()->put(); // unpin dentry
3048
3049 delete in->dir;
3050 in->dir = 0;
3051 put_inode(in); // unpin inode
3052}
3053
 /**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the dentry (newly created or the one passed in), linked to
 * 'in' when an inode was supplied.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      // a directory inode is being re-linked under a new name: detach it
      // from its old parent dentry first
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3094
/*
 * Detach a dentry from its inode and (optionally) from its directory.
 * keepdir: do not close the containing Dir even if it becomes empty.
 * keepdentry: keep the (now negative) dentry cached, dropping its lease.
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in(dn->inode);  // hold a ref so the inode survives this call
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;  // dentry stays cached but its lease is invalid now
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3124
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // holds the inode alive until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // must run under client_lock so the inode state update is safe
    ceph_assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // record the error so a later fsync/close can report it
      inode->set_async_err(r);
    }
  }
};
3146
3147
3148/****
3149 * caps
3150 */
3151
3152void Client::get_cap_ref(Inode *in, int cap)
3153{
3154 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3155 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3156 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3157 in->get();
3158 }
3159 if ((cap & CEPH_CAP_FILE_CACHE) &&
3160 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3161 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3162 in->get();
3163 }
3164 in->get_cap_ref(cap);
3165}
3166
/*
 * Drop references on the given cap bits.  When the last reference on a bit
 * goes away this may finish a pending cap snap, wake blocked writers and
 * committers, run check_caps() for newly-droppable caps, and unpin the
 * inode refs taken by get_cap_ref().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we held but are no longer issued can be dropped to the MDS
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// buffered data is gone; cap snaps no longer have dirty data
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3200
/*
 * Block until we hold the 'need' caps (plus whatever of 'want' is not being
 * revoked).  On success *phave is filled in and a cap reference is taken.
 * Returns 0 on success, -EBADF if open file handles no longer want the
 * needed caps, -EROFS for a write on a read-only session, or an error from
 * cap renewal.  endoff (>0) is the write end offset, used to negotiate
 * max_size with the MDS before allowing a write past it.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a bigger max_size before we need it
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// writes must not proceed while a cap snap is being captured or
	// still has dirty buffered data
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		       << " need " << ccap_string(need) << " want " << ccap_string(want)
		       << " revoking " << ccap_string(revoking)
		       << dendl;
	// only take 'want' bits that are not in the middle of being revoked
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped earlier; make sure the MDS knows what we want
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3293
3294int Client::get_caps_used(Inode *in)
3295{
3296 unsigned used = in->caps_used();
3297 if (!(used & CEPH_CAP_FILE_CACHE) &&
3298 !objectcacher->set_is_empty(&in->oset))
3299 used |= CEPH_CAP_FILE_CACHE;
3300 return used;
3301}
3302
3303void Client::cap_delay_requeue(Inode *in)
3304{
11fdf7f2 3305 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3306 in->hold_caps_until = ceph_clock_now();
3307 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3308 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3309}
3310
/*
 * Send a cap update for one cap to its MDS.  Locally records which bits
 * are being released (issued/implemented are trimmed to 'retain' before
 * sending); 'flush'/'flush_tid' describe dirty metadata flushed with this
 * message.  'sync' requests a synchronous ack from the MDS.
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = MClientCaps::create(op,
			       in->ino,
			       0,
			       cap->cap_id, cap->seq,
			       cap->implemented,
			       want,
			       flush,
			       cap->mseq,
			       cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    // flushing dirty xattrs: ship the whole xattr map
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= MClientCaps::FLAG_SYNC;
  if (!in->cap_snaps.empty())
    m->flags |= MClientCaps::FLAG_PENDING_CAPSNAP;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS negotiates max_size
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3422
31f18b77
FG
3423static bool is_max_size_approaching(Inode *in)
3424{
3425 /* mds will adjust max size according to the reported size */
3426 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3427 return false;
3428 if (in->size >= in->max_size)
3429 return true;
3430 /* half of previous max_size increment has been used */
3431 if (in->max_size > in->reported_size &&
3432 (in->size << 1) >= in->max_size + in->reported_size)
3433 return true;
3434 return false;
3435}
7c673cae 3436
11fdf7f2
TL
3437static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3438{
3439 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3440 return used;
3441 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3442 return used;
3443
3444 if (issued & CEPH_CAP_FILE_LAZYIO) {
3445 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3446 used &= ~CEPH_CAP_FILE_CACHE;
3447 used |= CEPH_CAP_FILE_LAZYIO;
3448 }
3449 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3450 used &= ~CEPH_CAP_FILE_BUFFER;
3451 used |= CEPH_CAP_FILE_LAZYIO;
3452 }
3453 } else {
3454 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3455 used &= ~CEPH_CAP_FILE_CACHE;
3456 used |= CEPH_CAP_FILE_LAZYIO;
3457 }
3458 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3459 used &= ~CEPH_CAP_FILE_BUFFER;
3460 used |= CEPH_CAP_FILE_LAZYIO;
3461 }
3462 }
3463 return used;
3464}
3465
7c673cae
FG
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // compute which bits we'd like to keep
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // if Fc/Fl is being revoked and no buffered writes hold it, try dropping
  // the cached data so the revocation can complete
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    // don't count usage covered by the auth cap against non-auth caps
    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
	session->mds_state < MDSMap::STATE_ACTIVE &&
	session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
	flush_snaps(in, true);
      if (in->flushing_caps)
	flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
	     retain, flushing, flush_tid);
  }
}
3614
3615
/*
 * Capture the inode's dirty state for a snapshot under the old snap
 * context.  If a write is in flight the snap is marked 'writing' and will
 * be finished by put_cap_ref(); otherwise it is finalized immediately.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered data still needs to be written back
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata as of this snap context
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3657
/*
 * Finalize a cap snap's captured state (size/times/dirty bits) once no
 * write is in flight.  If buffered data remains, the actual flush to the
 * MDS is deferred until the data is written back; otherwise flush now.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // flush happens later, from _flushed_cap_snap()/put_cap_ref()
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3688
// Called when the buffered data belonging to cap snap 'seq' has been
// written back: clear its dirty flag and try flushing snap caps to the MDS.
void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
{
  ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
  in->cap_snaps.at(seq).dirty_data = 0;
  flush_snaps(in);
}
3695
/*
 * Send FLUSHSNAP messages for this inode's cap snaps to the auth MDS.
 * Snaps that are still writing or still have dirty buffered data are
 * skipped.  With all_again=true, snaps that were already flushed are
 * re-sent (used when re-flushing after MDS reconnect).
 */
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // first flush of this snap: assign a tid and track it on the session
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				 cap_epoch_barrier);
    m->caller_uid = capsnap.cap_dirtier_uid;
    m->caller_gid = capsnap.cap_dirtier_gid;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    ceph_assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message2(std::move(m));
  }
}
3770
3771
3772
3773void Client::wait_on_list(list<Cond*>& ls)
3774{
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779}
3780
3781void Client::signal_cond_list(list<Cond*>& ls)
3782{
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785}
3786
3787void Client::wait_on_context_list(list<Context*>& ls)
3788{
3789 Cond cond;
3790 bool done = false;
3791 int r;
3792 ls.push_back(new C_Cond(&cond, &done, &r));
3793 while (!done)
3794 cond.Wait(client_lock);
3795}
3796
// Complete (and thereby free) every queued Context with result 0; this
// wakes threads blocked in wait_on_context_list().  Note the context is
// completed while still at the front of the list, then popped.
void Client::signal_context_list(list<Context*>& ls)
{
  while (!ls.empty()) {
    ls.front()->complete(0);
    ls.pop_front();
  }
}
3804
/*
 * Wake cap waiters on every inode holding a cap in this session.
 * On reconnect, max_size negotiation state is reset so it gets redone.
 * Otherwise (stale session renewed), caps the MDS did not re-issue are
 * downgraded to PIN and flagged so we re-request what we still want.
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3824
3825
3826// flush dirty data (from objectcache)
3827
// Completion that invokes the client's cache-invalidate upcall from the
// async invalidator thread, outside client_lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;            // captured at queue time; the Inode may be freed before we run
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3847
// Invoke the registered cache-invalidation callback for [off, off+len).
// Runs on the async invalidator thread without client_lock held; skipped
// once unmounting has begun.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3855
// Queue an asynchronous cache-invalidation upcall for [off, off+len),
// if an invalidation callback has been registered.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3862
// Drop ALL cached data for an inode from the userspace object cache and
// schedule an upcall invalidation (offset 0, length 0 = whole inode).
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
3876
// Drop cached data for the byte range [off, off+len) of *in — including any
// buffered writes in that range — and schedule the invalidate upcall.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file byte range onto object extents per the file layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    // discard even dirty/writeback buffers covering those extents
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3890
3891bool Client::_release(Inode *in)
3892{
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899}
3900
3901bool Client::_flush(Inode *in, Context *onfinish)
3902{
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921}
3922
// Synchronously flush dirty buffers in [offset, offset+size) for *in.
// Caller must hold client_lock; it is dropped while waiting for the
// objectcacher to finish the writeback.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush; drop client_lock so completions can make progress
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3941
// ObjectCacher completion hook: writeback for an object set finished.
// Reached via dispatch() -> objecter, so client_lock is already held.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  // std::lock_guard l(client_lock);
  ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3950
// Writeback for *in completed: drop the cap references that pinned the
// cached/buffered data while it was dirty.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3957
3958
3959
3960// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3961void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3962{
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976}
3977
// Install or refresh the cap for *in held via mds_session.  Handles snaprealm
// membership, auth-cap migration, and the export/import race where a message
// sent before the cap import arrives late.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap: attach the inode to its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // the auth MDS says we belong to a different realm now; move over
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // pre-existing cap; if its session generation is stale its issued bits lapsed
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if there is none, or its migration seq
    // is newer than the current auth cap's
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // only a newer migration seq may shrink wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
           << " from mds." << mds
           << " on " << *in
           << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4072
// Tear down *cap: optionally queue a cap-release message to the MDS, fix up
// auth-cap/flushing bookkeeping, and close the snaprealm when the last cap
// on the inode goes away.  *cap is destroyed by the erase below.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    // tell the MDS we no longer hold this cap (releases are batched)
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing the map entry destroys the Cap that `cap` points at
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;  // now dangling; null it to make misuse obvious

  if (!in.is_any_caps()) {
    // last cap gone: detach the inode from its snap realm
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4108
4109void Client::remove_all_caps(Inode *in)
4110{
4111 while (!in->caps.empty())
11fdf7f2 4112 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4113}
4114
// Drop every cap held through session s (session close/eviction), discarding
// any dirty or in-flight flushing state those caps covered.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // keep the inode alive across remove_cap
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      // bitwise-or of the two bitmasks, coerced to bool: "any dirty or flushing"
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;  // remember we lost caps we wanted/held
    remove_cap(cap, false);
    if (cap_snaps) {
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      // dirty data is lost along with the session; complain loudly
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());  // drop the reference the dirty state held
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();  // wake wait_sync_caps() waiters
}
4150
91327a77 4151int Client::_do_remount(bool retry_on_error)
b32b8144 4152{
11fdf7f2 4153 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4154
b32b8144
FG
4155 errno = 0;
4156 int r = remount_cb(callback_handle);
91327a77
AA
4157 if (r == 0) {
4158 retries_on_invalidate = 0;
4159 } else {
b32b8144
FG
4160 int e = errno;
4161 client_t whoami = get_nodeid();
4162 if (r == -1) {
4163 lderr(cct) <<
4164 "failed to remount (to trim kernel dentries): "
4165 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4166 } else {
4167 lderr(cct) <<
4168 "failed to remount (to trim kernel dentries): "
4169 "return code = " << r << dendl;
4170 }
91327a77 4171 bool should_abort =
11fdf7f2
TL
4172 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4173 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4174 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4175 if (should_abort && !unmounting) {
4176 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4177 ceph_abort();
4178 }
4179 }
4180 return r;
4181}
4182
7c673cae
FG
// Finisher context that trims kernel dentries by triggering a remount;
// queued by _invalidate_kernel_dcache when dentry invalidation is unavailable.
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);  // allow retries when run from the finisher
  }
};
4193
4194void Client::_invalidate_kernel_dcache()
4195{
4196 if (unmounting)
4197 return;
94b18763
FG
4198 if (can_invalidate_dentries) {
4199 if (dentry_invalidate_cb && root->dir) {
4200 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4201 p != root->dir->dentries.end();
4202 ++p) {
4203 if (p->second->inode)
4204 _schedule_invalidate_dentry_callback(p->second, false);
4205 }
7c673cae
FG
4206 }
4207 } else if (remount_cb) {
4208 // Hacky:
4209 // when remounting a file system, linux kernel trims all unused dentries in the fs
4210 remount_finisher.queue(new C_Client_Remount(this));
4211 }
4212}
4213
91327a77
AA
// Drop expireable negative (NULL-inode) child dentries of directory *in; if
// that empties the directory, close it.  Recurses into an open snapdir.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // only bother when every dentry in the dir is negative
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase the current entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4238
// Trim the session's cap count down toward max: drop disposable non-auth caps
// outright, and expire unused dentries so unreferenced inodes (and their caps)
// can be released.  If the count still exceeds max afterwards, fall back to
// invalidating the kernel dcache.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // hold the inode across potential cap removal

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // did every dentry of this inode turn out expireable?
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before dn may be queued/trimmed
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4306
// Mark session s read-only and wake any waiters wanting write caps so they
// can observe the new state.
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    auto &in = (*p)->inode;
    if (in.caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in.waitfor_caps);
  }
}
4316
7c673cae
FG
// Transition in's dirty caps to "flushing": allocate a flush tid, record it
// on the inode and the auth session, and clear the dirty bits.
// Returns the cap bits being flushed; *ptid receives the new flush tid.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);  // caller only invokes this with dirty caps

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;  // first in-flight flush for this inode
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4344
4345void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4346{
4347 for (auto &p : in->cap_snaps) {
4348 CapSnap &capsnap = p.second;
4349 if (capsnap.flush_tid > 0) {
4350 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4351 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4352 }
4353 }
4354 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4355 it != in->flushing_cap_tids.end();
4356 ++it) {
4357 old_s->flushing_caps_tids.erase(it->first);
4358 new_s->flushing_caps_tids.insert(it->first);
4359 }
4360 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4361}
4362
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list...
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;  // advance before pop_front removes the current entry
    delayed_list.pop_front();
    // the overall last check_caps (delayed done, dirty empty) is synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4395
4396void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4397{
11fdf7f2 4398 ldout(cct, 10) << __func__ << " " << in << " mds." << session->mds_num << dendl;
7c673cae 4399 Cap *cap = in->auth_cap;
11fdf7f2 4400 ceph_assert(cap->session == session);
7c673cae
FG
4401
4402 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4403 p != in->flushing_cap_tids.end();
4404 ++p) {
4405 bool req_sync = false;
4406
4407 /* If this is a synchronous request, then flush the journal on last one */
4408 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4409 req_sync = true;
4410
4411 send_cap(in, session, cap, req_sync,
4412 (get_caps_used(in) | in->caps_dirty()),
4413 in->caps_wanted(), (cap->issued | cap->implemented),
4414 p->second, p->first);
4415 }
4416}
4417
// Block until all cap flushes on *in with tid <= want have been acked.
// Caller holds client_lock; wait_on_list drops it while sleeping.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything at or below want has been acked
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4431
// Block until every session's oldest in-flight cap-flush tid is newer than
// want.  Restarts the scan after each wakeup, since sessions and their tid
// sets may have changed while we slept.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
           << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
        continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);  // drops client_lock while waiting
      goto retry;  // iterators are stale after waiting; rescan all sessions
    }
  }
}
4450
// After session reset/reconnect, re-send pending snap and cap flushes for the
// session, skipping inodes already handled by early_kick_flushing_caps().
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;  // already reflushed in the early (pre-reconnect) pass
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
4469
// Before sending the reconnect message, re-send cap flushes whose flushing
// bits were revoked, so the MDS processes them before granting those caps to
// another client.  Inodes handled here are remembered in early_flushing_caps
// so kick_flushing_caps() will not flush them again.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4504
7c673cae
FG
4505void SnapRealm::build_snap_context()
4506{
4507 set<snapid_t> snaps;
4508 snapid_t max_seq = seq;
4509
4510 // start with prior_parents?
4511 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4512 snaps.insert(prior_parent_snaps[i]);
4513
4514 // current parent's snaps
4515 if (pparent) {
4516 const SnapContext& psnapc = pparent->get_snap_context();
4517 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4518 if (psnapc.snaps[i] >= parent_since)
4519 snaps.insert(psnapc.snaps[i]);
4520 if (psnapc.seq > max_seq)
4521 max_seq = psnapc.seq;
4522 }
4523
4524 // my snaps
4525 for (unsigned i=0; i<my_snaps.size(); i++)
4526 snaps.insert(my_snaps[i]);
4527
4528 // ok!
4529 cached_snap_context.seq = max_seq;
4530 cached_snap_context.snaps.resize(0);
4531 cached_snap_context.snaps.reserve(snaps.size());
4532 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4533 cached_snap_context.snaps.push_back(*p);
4534}
4535
4536void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4537{
4538 list<SnapRealm*> q;
4539 q.push_back(realm);
4540
4541 while (!q.empty()) {
4542 realm = q.front();
4543 q.pop_front();
4544
11fdf7f2 4545 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4546 realm->invalidate_cache();
4547
4548 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4549 p != realm->pchildren.end();
4550 ++p)
4551 q.push_back(*p);
4552 }
4553}
4554
4555SnapRealm *Client::get_snap_realm(inodeno_t r)
4556{
4557 SnapRealm *realm = snap_realms[r];
4558 if (!realm)
4559 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4560 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4561 realm->nref++;
4562 return realm;
4563}
4564
4565SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4566{
4567 if (snap_realms.count(r) == 0) {
11fdf7f2 4568 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4569 return NULL;
4570 }
4571 SnapRealm *realm = snap_realms[r];
11fdf7f2 4572 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4573 realm->nref++;
4574 return realm;
4575}
4576
4577void Client::put_snap_realm(SnapRealm *realm)
4578{
11fdf7f2 4579 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
7c673cae
FG
4580 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4581 if (--realm->nref == 0) {
4582 snap_realms.erase(realm->ino);
4583 if (realm->pparent) {
4584 realm->pparent->pchildren.erase(realm);
4585 put_snap_realm(realm->pparent);
4586 }
4587 delete realm;
4588 }
4589}
4590
4591bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4592{
4593 if (realm->parent != parent) {
11fdf7f2 4594 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4595 << " " << realm->parent << " -> " << parent << dendl;
4596 realm->parent = parent;
4597 if (realm->pparent) {
4598 realm->pparent->pchildren.erase(realm);
4599 put_snap_realm(realm->pparent);
4600 }
4601 realm->pparent = get_snap_realm(parent);
4602 realm->pparent->pchildren.insert(realm);
4603 return true;
4604 }
4605 return false;
4606}
4607
4608static bool has_new_snaps(const SnapContext& old_snapc,
4609 const SnapContext& new_snapc)
4610{
4611 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4612}
4613
4614
11fdf7f2 4615void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4616{
4617 SnapRealm *first_realm = NULL;
11fdf7f2 4618 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4619
4620 map<SnapRealm*, SnapContext> dirty_realms;
4621
11fdf7f2 4622 auto p = bl.cbegin();
7c673cae
FG
4623 while (!p.end()) {
4624 SnapRealmInfo info;
11fdf7f2 4625 decode(info, p);
7c673cae
FG
4626 SnapRealm *realm = get_snap_realm(info.ino());
4627
4628 bool invalidate = false;
4629
4630 if (info.seq() > realm->seq) {
11fdf7f2 4631 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4632 << dendl;
4633
4634 if (flush) {
4635 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4636 // flush me + children
4637 list<SnapRealm*> q;
4638 q.push_back(realm);
4639 while (!q.empty()) {
4640 SnapRealm *realm = q.front();
4641 q.pop_front();
4642
4643 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4644 p != realm->pchildren.end();
4645 ++p)
4646 q.push_back(*p);
4647
4648 if (dirty_realms.count(realm) == 0) {
4649 realm->nref++;
4650 dirty_realms[realm] = realm->get_snap_context();
4651 }
4652 }
4653 }
4654
4655 // update
4656 realm->seq = info.seq();
4657 realm->created = info.created();
4658 realm->parent_since = info.parent_since();
4659 realm->prior_parent_snaps = info.prior_parent_snaps;
4660 realm->my_snaps = info.my_snaps;
4661 invalidate = true;
4662 }
4663
4664 // _always_ verify parent
4665 if (adjust_realm_parent(realm, info.parent()))
4666 invalidate = true;
4667
4668 if (invalidate) {
4669 invalidate_snaprealm_and_children(realm);
11fdf7f2 4670 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4671 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4672 } else {
11fdf7f2 4673 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4674 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4675 }
4676
4677 if (!first_realm)
4678 first_realm = realm;
4679 else
4680 put_snap_realm(realm);
4681 }
4682
4683 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4684 q != dirty_realms.end();
4685 ++q) {
4686 SnapRealm *realm = q->first;
4687 // if there are new snaps ?
4688 if (has_new_snaps(q->second, realm->get_snap_context())) {
4689 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4690 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4691 while (!r.end()) {
4692 Inode *in = *r;
4693 ++r;
4694 queue_cap_snap(in, q->second);
4695 }
4696 } else {
4697 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4698 }
4699 put_snap_realm(realm);
4700 }
4701
4702 if (realm_ret)
4703 *realm_ret = first_realm;
4704 else
4705 put_snap_realm(first_realm);
4706}
4707
// Handle an MClientSnap notification from an MDS.  For a SPLIT op, move the
// listed inodes (and child realms) into the newly created realm before
// applying the snap trace; afterwards queue cap snaps for any moved inode
// that gained snapshots.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;  // no matching session; ignore the message
  }

  got_mds_push(session);

  map<Inode*, SnapContext> to_move;  // inode -> its pre-move snap context
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        if (in->snaprealm->created > info.created()) {
          // already in a realm newer than the split target; leave it be
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm " 
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // for DESTROY, do not flush against the (now-gone) old snap context
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // finish the split: attach the moved inodes to the new realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4778
11fdf7f2 4779void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4780{
4781 mds_rank_t mds = mds_rank_t(m->get_source().num());
4782 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4783 if (!session) {
7c673cae
FG
4784 return;
4785 }
4786
4787 got_mds_push(session);
4788
11fdf7f2 4789 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4790
4791 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4792 if (inode_map.count(vino)) {
4793 Inode *in = NULL;
4794 in = inode_map[vino];
4795
4796 if (in) {
4797 in->quota = m->quota;
4798 in->rstat = m->rstat;
4799 }
4800 }
7c673cae
FG
4801}
4802
// Top-level dispatcher for cap messages from an MDS.  Validates the
// session and resolves the inode, then routes to the per-op handler.
// Messages for unknown inodes are released back to the MDS (IMPORT) or
// dropped; either way pending cap releases are flushed so a waiting MDS
// is not stalled.
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // message from a connection we no longer track; ignore
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      // We don't know this inode, so hand the imported cap straight back.
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    // EXPORT and FLUSHSNAP_ACK do not require us to hold a cap from this
    // MDS, so they are handled (and returned from) before the cap lookup.
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    // IMPORT installs the cap first, then intentionally falls through to
    // the grant handling below -- hence no return here.
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  // The remaining ops require a cap issued by this MDS.
  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
    case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
    case CEPH_CAP_OP_IMPORT:
    case CEPH_CAP_OP_REVOKE:
    case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
    case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4866
// An MDS imported this inode's caps (e.g. after subtree migration).
// Install/refresh the cap from the importing MDS, then remove the old cap
// held from the exporting peer, carrying over the perms it was issued
// under.  If we became the auth cap holder, re-flush outstanding dirty
// state to the new authority.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // Remember the exporting peer's cap (if any) so its credentials can be
  // applied to the new cap and the old cap removed afterwards.
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // Drop the exporting MDS's cap now that the import is applied; honour
  // the RELEASE flag so the peer MDS is told when required.
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4905
// The sending MDS is exporting our cap on this inode to a peer MDS.
// If we already hold a cap from the peer, fold the exported state into it
// (and move auth/flushing bookkeeping); otherwise install a placeholder
// cap for the peer.  The exporting MDS's cap is removed in all matched
// cases.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Ignore the message entirely if the cap id doesn't match (stale export).
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // Merge only if the peer cap is older than what the export
	  // message describes.
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    // Transfer auth status and any in-flight flushes to the peer.
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // No existing peer cap: create one carrying the exported state.
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// No peer: the cap is simply dropped.  Record that we lost
	// wanted/issued caps so later messages aren't misinterpreted.
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4950
11fdf7f2 4951void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4952{
4953 mds_rank_t mds = session->mds_num;
11fdf7f2 4954 ceph_assert(in->caps.count(mds));
7c673cae 4955
11fdf7f2 4956 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
4957 << " size " << in->size << " -> " << m->get_size()
4958 << dendl;
4959
1adf2230
AA
4960 int issued;
4961 in->caps_issued(&issued);
4962 issued |= in->caps_dirty();
4963 update_inode_file_size(in, issued, m->get_size(),
4964 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4965}
4966
// The MDS acknowledged a cap flush.  Retire every flush tid up to and
// including flush_ack_tid, clear the corresponding dirty bits, and wake
// anyone waiting for caps or for a sync to complete.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;   // dirty bits this ack confirms as clean
  int flushed = 0;   // number of flush tids retired by this ack

  // NOTE(review): in->flushing_cap_tids.begin() is dereferenced without an
  // emptiness check -- presumably the protocol guarantees an outstanding
  // flush exists whenever a FLUSH_ACK arrives; verify before changing.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
		  << " got unexpected flush ack tid " << flush_ack_tid
		  << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // This tid (and anything older) is covered by the ack: retire it.
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // Bits still in flight under a newer tid are not clean yet.
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // Wake sync waiters once no older flush remains outstanding on this
    // session.
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    // nothing dirty anymore: forget who dirtied the caps
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // Drop the reference held while the inode had dirty/flushing caps.
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5029
5030
// The MDS acknowledged a FLUSHSNAP for this inode: drop the matching
// cap_snap record and its session bookkeeping.  Mismatched tids and
// already-erased snaps are logged and ignored (duplicate acks happen).
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (m->get_client_tid() != capsnap.flush_tid) {
      // Ack for a different (older) flush attempt; ignore.
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(it);
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5058
5059class C_Client_DentryInvalidate : public Context {
5060private:
5061 Client *client;
5062 vinodeno_t dirino;
5063 vinodeno_t ino;
5064 string name;
5065public:
5066 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5067 client(c), name(dn->name) {
5068 if (client->use_faked_inos()) {
5069 dirino.ino = dn->dir->parent_inode->faked_ino;
5070 if (del)
5071 ino.ino = dn->inode->faked_ino;
5072 } else {
5073 dirino = dn->dir->parent_inode->vino();
5074 if (del)
5075 ino = dn->inode->vino();
5076 }
5077 if (!del)
5078 ino.ino = inodeno_t();
5079 }
5080 void finish(int r) override {
5081 // _async_dentry_invalidate is responsible for its own locking
11fdf7f2 5082 ceph_assert(!client->client_lock.is_locked_by_me());
7c673cae
FG
5083 client->_async_dentry_invalidate(dirino, ino, name);
5084 }
5085};
5086
5087void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5088{
5089 if (unmounting)
5090 return;
11fdf7f2 5091 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae
FG
5092 << " in dir " << dirino << dendl;
5093 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5094}
5095
5096void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5097{
5098 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5099 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5100}
5101
// Best-effort attempt to shed cached state (child dentries, open snapdir,
// kernel dcache references) so the inode's reference count can drop.
// sched_inval controls whether kernel dcache invalidations are queued.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;
    }
  }

  // An open ".snap" dir pins the inode; trim it too.
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  // Still referenced by the kernel: schedule dcache invalidations.
  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() erases the current entry
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dentries doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5145
11fdf7f2 5146void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5147{
5148 mds_rank_t mds = session->mds_num;
5149 int used = get_caps_used(in);
5150 int wanted = in->caps_wanted();
5151
a8e16298
TL
5152 const unsigned new_caps = m->get_caps();
5153 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5154 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5155 << " mds." << mds << " seq " << m->get_seq()
5156 << " caps now " << ccap_string(new_caps)
a8e16298
TL
5157 << " was " << ccap_string(cap->issued)
5158 << (was_stale ? "" : " (stale)") << dendl;
5159
5160 if (was_stale)
5161 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5162 cap->seq = m->get_seq();
28e407b8 5163 cap->gen = session->cap_gen;
7c673cae 5164
11fdf7f2 5165 check_cap_issue(in, new_caps);
a8e16298 5166
7c673cae 5167 // update inode
1adf2230
AA
5168 int issued;
5169 in->caps_issued(&issued);
5170 issued |= in->caps_dirty();
7c673cae 5171
1adf2230
AA
5172 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5173 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5174 in->mode = m->head.mode;
5175 in->uid = m->head.uid;
5176 in->gid = m->head.gid;
5177 in->btime = m->btime;
5178 }
5179 bool deleted_inode = false;
1adf2230
AA
5180 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5181 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5182 in->nlink = m->head.nlink;
5183 if (in->nlink == 0 &&
5184 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5185 deleted_inode = true;
5186 }
1adf2230 5187 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5188 m->xattrbl.length() &&
5189 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5190 auto p = m->xattrbl.cbegin();
5191 decode(in->xattrs, p);
7c673cae
FG
5192 in->xattr_version = m->head.xattr_version;
5193 }
28e407b8
AA
5194
5195 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5196 in->dirstat.nfiles = m->get_nfiles();
5197 in->dirstat.nsubdirs = m->get_nsubdirs();
5198 }
5199
1adf2230
AA
5200 if (new_caps & CEPH_CAP_ANY_RD) {
5201 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5202 m->get_ctime(), m->get_mtime(), m->get_atime());
5203 }
5204
5205 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5206 in->layout = m->get_layout();
5207 update_inode_file_size(in, issued, m->get_size(),
5208 m->get_truncate_seq(), m->get_truncate_size());
5209 }
5210
5211 if (m->inline_version > in->inline_version) {
5212 in->inline_data = m->inline_data;
5213 in->inline_version = m->inline_version;
5214 }
5215
5216 /* always take a newer change attr */
5217 if (m->get_change_attr() > in->change_attr)
5218 in->change_attr = m->get_change_attr();
7c673cae
FG
5219
5220 // max_size
5221 if (cap == in->auth_cap &&
1adf2230
AA
5222 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5223 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5224 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5225 in->max_size = m->get_max_size();
5226 if (in->max_size > in->wanted_max_size) {
5227 in->wanted_max_size = 0;
5228 in->requested_max_size = 0;
5229 }
5230 }
5231
5232 bool check = false;
a8e16298
TL
5233 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5234 (wanted & ~(cap->wanted | new_caps))) {
5235 // If mds is importing cap, prior cap messages that update 'wanted'
5236 // may get dropped by mds (migrate seq mismatch).
5237 //
5238 // We don't send cap message to update 'wanted' if what we want are
5239 // already issued. If mds revokes caps, cap message that releases caps
5240 // also tells mds what we want. But if caps got revoked by mds forcedly
5241 // (session stale). We may haven't told mds what we want.
7c673cae 5242 check = true;
a8e16298 5243 }
7c673cae 5244
7c673cae
FG
5245
5246 // update caps
a8e16298 5247 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5248 if (revoked) {
5249 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5250 cap->issued = new_caps;
5251 cap->implemented |= new_caps;
5252
b32b8144
FG
5253 // recall delegations if we're losing caps necessary for them
5254 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5255 in->recall_deleg(false);
5256 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5257 in->recall_deleg(true);
5258
11fdf7f2
TL
5259 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5260 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5261 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5262 // waitin' for flush
11fdf7f2 5263 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
7c673cae
FG
5264 if (_release(in))
5265 check = true;
5266 } else {
5267 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5268 check = true;
5269 }
a8e16298
TL
5270 } else if (cap->issued == new_caps) {
5271 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5272 } else {
a8e16298 5273 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5274 cap->issued = new_caps;
5275 cap->implemented |= new_caps;
5276
5277 if (cap == in->auth_cap) {
5278 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5279 for (const auto &p : in->caps) {
5280 if (&p.second == cap)
7c673cae 5281 continue;
11fdf7f2 5282 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5283 check = true;
5284 break;
5285 }
5286 }
5287 }
5288 }
5289
5290 if (check)
5291 check_caps(in, 0);
5292
5293 // wake up waiters
5294 if (new_caps)
5295 signal_cond_list(in->waitfor_caps);
5296
5297 // may drop inode's last ref
5298 if (deleted_inode)
5299 _try_to_trim_inode(in, true);
7c673cae
FG
5300}
5301
7c673cae
FG
5302int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5303{
5304 if (perms.uid() == 0)
5305 return 0;
5306
5307 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5308 int ret = _posix_acl_permission(in, perms, want);
5309 if (ret != -EAGAIN)
5310 return ret;
5311 }
5312
5313 // check permissions before doing anything else
5314 if (!in->check_mode(perms, want))
5315 return -EACCES;
5316 return 0;
5317}
5318
5319int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5320 const UserPerm& perms)
5321{
5322 int r = _getattr_for_perm(in, perms);
5323 if (r < 0)
5324 goto out;
5325
5326 r = 0;
5327 if (strncmp(name, "system.", 7) == 0) {
5328 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5329 r = -EPERM;
5330 } else {
5331 r = inode_permission(in, perms, want);
5332 }
5333out:
1adf2230 5334 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5335 return r;
5336}
5337
5338ostream& operator<<(ostream &out, const UserPerm& perm) {
5339 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5340 return out;
5341}
5342
// Check whether `perms` may apply the setattr described by (stx, mask)
// to `in`, mirroring POSIX chown/chmod/utimes rules.  May clear S_ISGID
// in stx->stx_mode (like chmod(2) does for non-group-members).
// Returns 0 on success, -EPERM/-EACCES or a getattr error otherwise.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner (a no-op chown by the owner is fine)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may only change the group to one they belong to
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-members of the file's (new) group silently lose the setgid bit,
    // matching chmod(2) semantics
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamps require ownership; "set to now" only needs
      // write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5399
5400int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5401{
181888fb 5402 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5403 unsigned want = 0;
5404
5405 if ((flags & O_ACCMODE) == O_WRONLY)
5406 want = MAY_WRITE;
5407 else if ((flags & O_ACCMODE) == O_RDWR)
5408 want = MAY_READ | MAY_WRITE;
5409 else if ((flags & O_ACCMODE) == O_RDONLY)
5410 want = MAY_READ;
5411 if (flags & O_TRUNC)
5412 want |= MAY_WRITE;
5413
5414 int r = 0;
5415 switch (in->mode & S_IFMT) {
5416 case S_IFLNK:
5417 r = -ELOOP;
5418 goto out;
5419 case S_IFDIR:
5420 if (want & MAY_WRITE) {
5421 r = -EISDIR;
5422 goto out;
5423 }
5424 break;
5425 }
5426
5427 r = _getattr_for_perm(in, perms);
5428 if (r < 0)
5429 goto out;
5430
5431 r = inode_permission(in, perms, want);
5432out:
5433 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5434 return r;
5435}
5436
5437int Client::may_lookup(Inode *dir, const UserPerm& perms)
5438{
181888fb 5439 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5440 int r = _getattr_for_perm(dir, perms);
5441 if (r < 0)
5442 goto out;
5443
5444 r = inode_permission(dir, perms, MAY_EXEC);
5445out:
5446 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5447 return r;
5448}
5449
5450int Client::may_create(Inode *dir, const UserPerm& perms)
5451{
181888fb 5452 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5453 int r = _getattr_for_perm(dir, perms);
5454 if (r < 0)
5455 goto out;
5456
5457 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5458out:
5459 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5460 return r;
5461}
5462
5463int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5464{
181888fb 5465 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5466 int r = _getattr_for_perm(dir, perms);
5467 if (r < 0)
5468 goto out;
5469
5470 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5471 if (r < 0)
5472 goto out;
5473
5474 /* 'name == NULL' means rmsnap */
5475 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5476 InodeRef otherin;
5477 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5478 if (r < 0)
5479 goto out;
5480 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5481 r = -EPERM;
5482 }
5483out:
5484 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5485 return r;
5486}
5487
5488int Client::may_hardlink(Inode *in, const UserPerm& perms)
5489{
181888fb 5490 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5491 int r = _getattr_for_perm(in, perms);
5492 if (r < 0)
5493 goto out;
5494
5495 if (perms.uid() == 0 || perms.uid() == in->uid) {
5496 r = 0;
5497 goto out;
5498 }
5499
5500 r = -EPERM;
5501 if (!S_ISREG(in->mode))
5502 goto out;
5503
5504 if (in->mode & S_ISUID)
5505 goto out;
5506
5507 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5508 goto out;
5509
5510 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5511out:
5512 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5513 return r;
5514}
5515
5516int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5517{
5518 int mask = CEPH_STAT_CAP_MODE;
5519 bool force = false;
5520 if (acl_type != NO_ACL) {
5521 mask |= CEPH_STAT_CAP_XATTR;
5522 force = in->xattr_version == 0;
5523 }
5524 return _getattr(in, mask, perms, force);
5525}
5526
// Build the (ino, snapid) pair that keys inode_map for this inode.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5532
7c673cae
FG
5533/**
5534 * Resolve an MDS spec to a list of MDS daemon GIDs.
5535 *
5536 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5537 * It may be '*' in which case it matches all GIDs.
5538 *
5539 * If no error is returned, the `targets` vector will be populated with at least
5540 * one MDS.
5541 */
5542int Client::resolve_mds(
5543 const std::string &mds_spec,
5544 std::vector<mds_gid_t> *targets)
5545{
11fdf7f2
TL
5546 ceph_assert(fsmap);
5547 ceph_assert(targets != nullptr);
7c673cae
FG
5548
5549 mds_role_t role;
5550 std::stringstream ss;
5551 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5552 if (role_r == 0) {
5553 // We got a role, resolve it to a GID
5554 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5555 << role << "'" << dendl;
5556 targets->push_back(
5557 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5558 return 0;
5559 }
5560
5561 std::string strtol_err;
5562 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5563 if (strtol_err.empty()) {
5564 // It is a possible GID
5565 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5566 if (fsmap->gid_exists(mds_gid)) {
5567 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5568 targets->push_back(mds_gid);
5569 } else {
5570 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5571 << dendl;
5572 return -ENOENT;
5573 }
5574 } else if (mds_spec == "*") {
5575 // It is a wildcard: use all MDSs
5576 const auto mds_info = fsmap->get_mds_info();
5577
5578 if (mds_info.empty()) {
5579 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5580 return -ENOENT;
5581 }
5582
5583 for (const auto i : mds_info) {
5584 targets->push_back(i.first);
5585 }
5586 } else {
5587 // It did not parse as an integer, it is not a wildcard, it must be a name
5588 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5589 if (mds_gid == 0) {
5590 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5591
5592 lderr(cct) << "FSMap: " << *fsmap << dendl;
5593
5594 return -ENOENT;
5595 } else {
5596 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5597 << "' to GID " << mds_gid << dendl;
5598 targets->push_back(mds_gid);
5599 }
5600 }
5601
5602 return 0;
5603}
5604
5605
/**
 * Authenticate with mon and establish global ID
 *
 * Must be called with client_lock held.  Returns 0 on success (including
 * the already-authenticated fast path) or the monclient error.
 */
int Client::authenticate()
{
  ceph_assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on the monitor round-trip: drop the
  // client lock across the call, then retake it before touching state.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // Our entity name derives from the mon-assigned global id.
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5629
// Fetch the latest FSMap from the monitors and wait until our local copy
// is at least that new.  user=true requests the trimmed "fsmap.user"
// variant, otherwise the full "fsmap".  Called with client_lock held;
// the lock is dropped while waiting on the monitors.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop client_lock for the blocking monitor round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until our fsmap_user catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5673
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Send an admin command to every MDS daemon matched by mds_spec.  Laggy
 * daemons are skipped; if all matches are laggy the call fails with
 * -ENOENT and an explanation in *outs.  onfinish completes once every
 * targeted daemon has replied (see handle_command_reply).
 */
int Client::mds_command(
  const std::string &mds_spec,
  const vector<string>& cmd,
  const bufferlist& inbl,
  bufferlist *outbl,
  string *outs,
  Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // Need the full fsmap to resolve specs and find daemon addresses.
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
		  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5760
11fdf7f2 5761void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5762{
5763 ceph_tid_t const tid = m->get_tid();
5764
5765 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5766
5767 if (!command_table.exists(tid)) {
5768 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5769 return;
5770 }
5771
5772 auto &op = command_table.get_command(tid);
5773 if (op.outbl) {
11fdf7f2 5774 *op.outbl = m->get_data();
7c673cae
FG
5775 }
5776 if (op.outs) {
5777 *op.outs = m->rs;
5778 }
5779
5780 if (op.on_finish) {
5781 op.on_finish->complete(m->r);
5782 }
5783
5784 command_table.erase(tid);
7c673cae
FG
5785}
5786
5787// -------------------
5788// MOUNT
5789
11fdf7f2 5790int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 5791{
7c673cae
FG
5792 int r = authenticate();
5793 if (r < 0) {
5794 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5795 return r;
5796 }
5797
11fdf7f2
TL
5798 std::string resolved_fs_name;
5799 if (fs_name.empty()) {
5800 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5801 } else {
5802 resolved_fs_name = fs_name;
5803 }
5804
7c673cae 5805 std::string want = "mdsmap";
11fdf7f2 5806 if (!resolved_fs_name.empty()) {
7c673cae
FG
5807 r = fetch_fsmap(true);
5808 if (r < 0)
5809 return r;
11fdf7f2
TL
5810 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5811 if (fscid == FS_CLUSTER_ID_NONE) {
7c673cae 5812 return -ENOENT;
11fdf7f2 5813 }
7c673cae
FG
5814
5815 std::ostringstream oss;
11fdf7f2 5816 oss << want << "." << fscid;
7c673cae
FG
5817 want = oss.str();
5818 }
5819 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5820
5821 monclient->sub_want(want, 0, 0);
5822 monclient->renew_subs();
5823
11fdf7f2
TL
5824 return 0;
5825}
5826
/**
 * Mount the filesystem: subscribe to the MDS map, optionally wait for
 * an available MDS cluster, then walk up from the mount root fetching
 * attributes so quota/ancestor information is cached.
 *
 * @param mount_root  subtree to mount ("" means "/").
 * @param perms       credentials used for the initial getattr requests.
 * @param require_mds when true, fail with CEPH_FUSE_NO_MDS_UP if the
 *                    MDS cluster is stuck unavailable.
 * @param fs_name     filesystem to mount (see subscribe_mdsmap()).
 * @return 0 on success (idempotent if already mounted); negative errno
 *         or CEPH_FUSE_NO_MDS_UP on failure.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // Walk from the mount root up towards "/", issuing a getattr at each
  // level.  This primes the cache with the mount point's ancestors so
  // quota checks can see them.  EACCES partway up is tolerated (we may
  // not be allowed to stat an ancestor) as long as the root inode was
  // already instantiated.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5920
5921// UNMOUNT
5922
/**
 * Request close of every open MDS session and block until all of them
 * are gone.  Sessions are erased by the reply handlers, which signal
 * mount_cond; we re-scan after each wakeup because new close requests
 * may still be needed (e.g. a session that reconnected meanwhile).
 * Caller must hold client_lock.
 */
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5938
31f18b77
FG
5939void Client::flush_mdlog_sync()
5940{
5941 if (mds_requests.empty())
5942 return;
11fdf7f2
TL
5943 for (auto &p : mds_sessions) {
5944 flush_mdlog(&p.second);
31f18b77
FG
5945 }
5946}
5947
5948void Client::flush_mdlog(MetaSession *session)
5949{
5950 // Only send this to Luminous or newer MDS daemons, older daemons
5951 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5952 const uint64_t features = session->con->get_features();
5953 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
11fdf7f2
TL
5954 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5955 session->con->send_message2(std::move(m));
31f18b77
FG
5956 }
5957}
5958
5959
11fdf7f2
TL
/**
 * Abort every in-flight MDS request with @err and force-close all MDS
 * sessions without the normal close handshake (used when we are
 * blacklisted or the caller wants an immediate teardown).
 * Caller must hold client_lock.
 */
void Client::_abort_mds_sessions(int err)
{
  // Iterator is advanced before the request may be torn down by the
  // abort/signal path.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
5987
/**
 * Tear down the mount.  With abort=false this is an orderly shutdown:
 * flush journals, wait for in-flight requests, write back dirty data
 * and caps, then close sessions.  With abort=true (or when we have
 * been blacklisted) all pending work is cancelled/discarded instead.
 * Caller must hold client_lock.  Idempotent once unmounting is set.
 */
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for outstanding metadata requests to drain (aborted above in
  // the abort case); reply handlers signal mount_cond.
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // Stop the periodic tick so it does not fire during/after teardown.
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        // data cannot be written back; drop it
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // Cannot flush caps to the MDS; discard dirty state so the inodes
    // can be released below.
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // Wait for every cached inode to be released; log the cache contents
  // every 5s while stuck so hangs are diagnosable.
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6119
b32b8144
FG
6120void Client::unmount()
6121{
11fdf7f2
TL
6122 std::lock_guard lock(client_lock);
6123 _unmount(false);
6124}
6125
6126void Client::abort_conn()
6127{
6128 std::lock_guard lock(client_lock);
6129 _unmount(true);
b32b8144
FG
6130}
6131
7c673cae
FG
6132void Client::flush_cap_releases()
6133{
6134 // send any cap releases
11fdf7f2
TL
6135 for (auto &p : mds_sessions) {
6136 auto &session = p.second;
6137 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6138 p.first)) {
7c673cae
FG
6139 if (cct->_conf->client_inject_release_failure) {
6140 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6141 } else {
11fdf7f2 6142 session.con->send_message2(std::move(session.release));
7c673cae 6143 }
11fdf7f2 6144 session.release.reset();
7c673cae
FG
6145 }
6146 }
6147}
6148
/**
 * Periodic housekeeping, re-armed on every invocation via the timer:
 *  - time out pending requests while not yet mounted,
 *  - renew MDS capabilities and flush queued cap releases,
 *  - process inodes whose delayed cap checks have come due,
 *  - trim the metadata cache.
 * Runs under client_lock (taken by the Timer callback path).
 */
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // Test hook: stall one tick, then self-reset the option.
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // Schedule the next tick before doing any work.
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  // While mounting, abort the oldest request once it exceeds
  // client_mount_timeout and wake everyone who may be blocked on it.
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: the list is ordered by hold_caps_until, so stop at
  // the first inode whose deadline has not yet passed.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6204
6205void Client::renew_caps()
6206{
6207 ldout(cct, 10) << "renew_caps()" << dendl;
6208 last_cap_renew = ceph_clock_now();
6209
11fdf7f2
TL
6210 for (auto &p : mds_sessions) {
6211 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6212 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6213 renew_caps(&p.second);
7c673cae
FG
6214 }
6215}
6216
6217void Client::renew_caps(MetaSession *session)
6218{
6219 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6220 session->last_cap_renew_request = ceph_clock_now();
6221 uint64_t seq = ++session->cap_renew_seq;
11fdf7f2 6222 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6223}
6224
6225
6226// ===============================================================
6227// high level (POSIXy) interface
6228
6229int Client::_do_lookup(Inode *dir, const string& name, int mask,
6230 InodeRef *target, const UserPerm& perms)
6231{
6232 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6233 MetaRequest *req = new MetaRequest(op);
6234 filepath path;
6235 dir->make_nosnap_relative_path(path);
6236 path.push_dentry(name);
6237 req->set_filepath(path);
6238 req->set_inode(dir);
6239 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6240 mask |= DEBUG_GETATTR_CAPS;
6241 req->head.args.getattr.mask = mask;
6242
11fdf7f2 6243 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6244
6245 int r = make_request(req, perms, target);
11fdf7f2 6246 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6247 return r;
6248}
6249
/**
 * Resolve one dentry under @dir, preferring cached state over an MDS
 * round trip.  Handles "." and ".." specially, validates cached
 * dentries via dentry leases and directory FILE_SHARED caps, and can
 * conclude ENOENT locally when the directory is known complete.
 * Falls back to _do_lookup() (an MDS request) otherwise.
 *
 * @param mask caps required on the target for a cached hit to count.
 * @return 0 with *target set, or negative errno.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No cached parent link: ask a random in-MDS for the parent.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // The magic snapshot directory (".snap" by default) is synthesized
  // locally rather than looked up on the MDS.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	   << " seq " << dn->lease_seq
	   << dendl;

    // A cached dentry only counts if its inode (when present) carries
    // the caps the caller asked for.
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// A null dentry in a complete directory proves absence.
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // Cache miss or stale: go to the MDS.
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6366
6367int Client::get_or_create(Inode *dir, const char* name,
6368 Dentry **pdn, bool expect_null)
6369{
6370 // lookup
11fdf7f2 6371 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6372 dir->open_dir();
6373 if (dir->dir->dentries.count(name)) {
6374 Dentry *dn = dir->dir->dentries[name];
6375
6376 // is dn lease valid?
6377 utime_t now = ceph_clock_now();
6378 if (dn->inode &&
6379 dn->lease_mds >= 0 &&
6380 dn->lease_ttl > now &&
6381 mds_sessions.count(dn->lease_mds)) {
11fdf7f2
TL
6382 MetaSession &s = mds_sessions.at(dn->lease_mds);
6383 if (s.cap_ttl > now &&
6384 s.cap_gen == dn->lease_gen) {
7c673cae
FG
6385 if (expect_null)
6386 return -EEXIST;
6387 }
6388 }
6389 *pdn = dn;
6390 } else {
6391 // otherwise link up a new one
6392 *pdn = link(dir->dir, name, NULL, NULL);
6393 }
6394
6395 // success
6396 return 0;
6397}
6398
/**
 * Walk @origpath component by component from root (absolute) or cwd
 * (relative), resolving each name with _lookup().
 *
 * Symlinks in the middle of the path are always followed; a trailing
 * symlink is followed only when @followsym is true.  Resolution is
 * capped at MAXSYMLINKS (-ELOOP beyond that).
 *
 * @param end   on success, receives the final inode (may be NULL if
 *              the caller only wants the walk's side effects).
 * @param mask  extra caps requested on the last path component.
 * @return 0 on success, negative errno otherwise.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // Client-side permission checks need AUTH_SHARED on each dir.
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6480
6481
6482// namespace ops
6483
6484int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6485{
11fdf7f2 6486 std::lock_guard lock(client_lock);
7c673cae
FG
6487 tout(cct) << "link" << std::endl;
6488 tout(cct) << relexisting << std::endl;
6489 tout(cct) << relpath << std::endl;
6490
181888fb
FG
6491 if (unmounting)
6492 return -ENOTCONN;
6493
7c673cae
FG
6494 filepath existing(relexisting);
6495
6496 InodeRef in, dir;
6497 int r = path_walk(existing, &in, perm, true);
6498 if (r < 0)
6499 return r;
6500 if (std::string(relpath) == "/") {
6501 r = -EEXIST;
6502 return r;
6503 }
6504 filepath path(relpath);
6505 string name = path.last_dentry();
6506 path.pop_dentry();
6507
6508 r = path_walk(path, &dir, perm, true);
6509 if (r < 0)
6510 return r;
6511 if (cct->_conf->client_permissions) {
6512 if (S_ISDIR(in->mode)) {
6513 r = -EPERM;
6514 return r;
6515 }
6516 r = may_hardlink(in.get(), perm);
6517 if (r < 0)
6518 return r;
6519 r = may_create(dir.get(), perm);
6520 if (r < 0)
6521 return r;
6522 }
6523 r = _link(in.get(), dir.get(), name.c_str(), perm);
6524 return r;
6525}
6526
6527int Client::unlink(const char *relpath, const UserPerm& perm)
6528{
11fdf7f2
TL
6529 std::lock_guard lock(client_lock);
6530 tout(cct) << __func__ << std::endl;
7c673cae
FG
6531 tout(cct) << relpath << std::endl;
6532
181888fb
FG
6533 if (unmounting)
6534 return -ENOTCONN;
6535
7c673cae
FG
6536 if (std::string(relpath) == "/")
6537 return -EISDIR;
6538
6539 filepath path(relpath);
6540 string name = path.last_dentry();
6541 path.pop_dentry();
6542 InodeRef dir;
6543 int r = path_walk(path, &dir, perm);
6544 if (r < 0)
6545 return r;
6546 if (cct->_conf->client_permissions) {
6547 r = may_delete(dir.get(), name.c_str(), perm);
6548 if (r < 0)
6549 return r;
6550 }
6551 return _unlink(dir.get(), name.c_str(), perm);
6552}
6553
/**
 * Rename @relfrom to @relto.  Renaming "/" (either side) is rejected
 * with -EBUSY.  When client-side permission checks are enabled, the
 * caller must be allowed to delete the source entry and (if it
 * exists) the destination entry; a missing destination (-ENOENT from
 * may_delete) is fine.
 */
int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relfrom << std::endl;
  tout(cct) << relto << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relfrom) == "/" || std::string(relto) == "/")
    return -EBUSY;

  // Split both paths into parent directory + final name.
  filepath from(relfrom);
  filepath to(relto);
  string fromname = from.last_dentry();
  from.pop_dentry();
  string toname = to.last_dentry();
  to.pop_dentry();

  InodeRef fromdir, todir;
  int r = path_walk(from, &fromdir, perm);
  if (r < 0)
    goto out;
  r = path_walk(to, &todir, perm);
  if (r < 0)
    goto out;

  if (cct->_conf->client_permissions) {
    // NOTE: this inner 'r' shadows the outer one; the early returns
    // below bypass the 'out' label on purpose.
    int r = may_delete(fromdir.get(), fromname.c_str(), perm);
    if (r < 0)
      return r;
    r = may_delete(todir.get(), toname.c_str(), perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }
  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
out:
  return r;
}
6594
6595// dirs
6596
6597int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6598{
11fdf7f2
TL
6599 std::lock_guard lock(client_lock);
6600 tout(cct) << __func__ << std::endl;
7c673cae
FG
6601 tout(cct) << relpath << std::endl;
6602 tout(cct) << mode << std::endl;
11fdf7f2 6603 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6604
181888fb
FG
6605 if (unmounting)
6606 return -ENOTCONN;
6607
7c673cae
FG
6608 if (std::string(relpath) == "/")
6609 return -EEXIST;
6610
6611 filepath path(relpath);
6612 string name = path.last_dentry();
6613 path.pop_dentry();
6614 InodeRef dir;
6615 int r = path_walk(path, &dir, perm);
6616 if (r < 0)
6617 return r;
6618 if (cct->_conf->client_permissions) {
6619 r = may_create(dir.get(), perm);
6620 if (r < 0)
6621 return r;
6622 }
6623 return _mkdir(dir.get(), name.c_str(), mode, perm);
6624}
6625
/**
 * mkdir -p equivalent: walk as far along @relpath as already exists,
 * then create each remaining component with @mode.
 *
 * Phase 1 resolves existing components starting from cwd; it stops at
 * the first -ENOENT.  -EEXIST is returned if the whole path already
 * exists; any other lookup error is propagated.  Phase 2 creates the
 * remaining directories; an -EEXIST on an intermediate component
 * (raced with another creator) is resolved by re-looking it up.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-EEXIST == r && i < path.depth() - 1) {
      // someone else created it first; just look it up and continue
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6682
6683int Client::rmdir(const char *relpath, const UserPerm& perms)
6684{
11fdf7f2
TL
6685 std::lock_guard lock(client_lock);
6686 tout(cct) << __func__ << std::endl;
7c673cae
FG
6687 tout(cct) << relpath << std::endl;
6688
181888fb
FG
6689 if (unmounting)
6690 return -ENOTCONN;
6691
7c673cae
FG
6692 if (std::string(relpath) == "/")
6693 return -EBUSY;
6694
6695 filepath path(relpath);
6696 string name = path.last_dentry();
6697 path.pop_dentry();
6698 InodeRef dir;
6699 int r = path_walk(path, &dir, perms);
6700 if (r < 0)
6701 return r;
6702 if (cct->_conf->client_permissions) {
6703 int r = may_delete(dir.get(), name.c_str(), perms);
6704 if (r < 0)
6705 return r;
6706 }
6707 return _rmdir(dir.get(), name.c_str(), perms);
6708}
6709
6710int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6711{
11fdf7f2
TL
6712 std::lock_guard lock(client_lock);
6713 tout(cct) << __func__ << std::endl;
7c673cae
FG
6714 tout(cct) << relpath << std::endl;
6715 tout(cct) << mode << std::endl;
6716 tout(cct) << rdev << std::endl;
6717
181888fb
FG
6718 if (unmounting)
6719 return -ENOTCONN;
6720
7c673cae
FG
6721 if (std::string(relpath) == "/")
6722 return -EEXIST;
6723
6724 filepath path(relpath);
6725 string name = path.last_dentry();
6726 path.pop_dentry();
6727 InodeRef dir;
6728 int r = path_walk(path, &dir, perms);
6729 if (r < 0)
6730 return r;
6731 if (cct->_conf->client_permissions) {
6732 int r = may_create(dir.get(), perms);
6733 if (r < 0)
6734 return r;
6735 }
6736 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6737}
6738
6739// symlinks
6740
6741int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6742{
11fdf7f2
TL
6743 std::lock_guard lock(client_lock);
6744 tout(cct) << __func__ << std::endl;
7c673cae
FG
6745 tout(cct) << target << std::endl;
6746 tout(cct) << relpath << std::endl;
6747
181888fb
FG
6748 if (unmounting)
6749 return -ENOTCONN;
6750
7c673cae
FG
6751 if (std::string(relpath) == "/")
6752 return -EEXIST;
6753
6754 filepath path(relpath);
6755 string name = path.last_dentry();
6756 path.pop_dentry();
6757 InodeRef dir;
6758 int r = path_walk(path, &dir, perms);
6759 if (r < 0)
6760 return r;
6761 if (cct->_conf->client_permissions) {
6762 int r = may_create(dir.get(), perms);
6763 if (r < 0)
6764 return r;
6765 }
6766 return _symlink(dir.get(), name.c_str(), target, perms);
6767}
6768
6769int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6770{
11fdf7f2
TL
6771 std::lock_guard lock(client_lock);
6772 tout(cct) << __func__ << std::endl;
7c673cae
FG
6773 tout(cct) << relpath << std::endl;
6774
181888fb
FG
6775 if (unmounting)
6776 return -ENOTCONN;
6777
7c673cae
FG
6778 filepath path(relpath);
6779 InodeRef in;
6780 int r = path_walk(path, &in, perms, false);
6781 if (r < 0)
6782 return r;
6783
6784 return _readlink(in.get(), buf, size);
6785}
6786
6787int Client::_readlink(Inode *in, char *buf, size_t size)
6788{
6789 if (!in->is_symlink())
6790 return -EINVAL;
6791
6792 // copy into buf (at most size bytes)
6793 int r = in->symlink.length();
6794 if (r > (int)size)
6795 r = size;
6796 memcpy(buf, in->symlink.c_str(), r);
6797 return r;
6798}
6799
6800
6801// inode stuff
6802
6803int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6804{
94b18763 6805 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6806
11fdf7f2 6807 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6808 if (yes && !force)
6809 return 0;
6810
6811 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6812 filepath path;
6813 in->make_nosnap_relative_path(path);
6814 req->set_filepath(path);
6815 req->set_inode(in);
6816 req->head.args.getattr.mask = mask;
6817
6818 int res = make_request(req, perms);
11fdf7f2 6819 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6820 return res;
6821}
6822
6823int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6824 const UserPerm& perms, InodeRef *inp)
6825{
6826 int issued = in->caps_issued();
6827
11fdf7f2 6828 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6829 ccap_string(issued) << dendl;
6830
6831 if (in->snapid != CEPH_NOSNAP) {
6832 return -EROFS;
6833 }
6834 if ((mask & CEPH_SETATTR_SIZE) &&
6835 (unsigned long)stx->stx_size > in->size &&
6836 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6837 perms)) {
6838 return -EDQUOT;
6839 }
6840
6841 // make the change locally?
6842 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6843 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6844 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6845 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6846 << in->cap_dirtier_gid << ", forcing sync setattr"
6847 << dendl;
6848 /*
6849 * This works because we implicitly flush the caps as part of the
6850 * request, so the cap update check will happen with the writeback
6851 * cap context, and then the setattr check will happen with the
6852 * caller's context.
6853 *
6854 * In reality this pattern is likely pretty rare (different users
6855 * setattr'ing the same file). If that turns out not to be the
6856 * case later, we can build a more complex pipelined cap writeback
6857 * infrastructure...
6858 */
6859 if (!mask)
6860 mask |= CEPH_SETATTR_CTIME;
6861 goto force_request;
6862 }
6863
6864 if (!mask) {
6865 // caller just needs us to bump the ctime
6866 in->ctime = ceph_clock_now();
6867 in->cap_dirtier_uid = perms.uid();
6868 in->cap_dirtier_gid = perms.gid();
6869 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 6870 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 6871 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 6872 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 6873 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 6874 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
6875 else
6876 mask |= CEPH_SETATTR_CTIME;
6877 }
6878
6879 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6880 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6881
6882 mask &= ~CEPH_SETATTR_KILL_SGUID;
6883
6884 if (mask & CEPH_SETATTR_UID) {
6885 in->ctime = ceph_clock_now();
6886 in->cap_dirtier_uid = perms.uid();
6887 in->cap_dirtier_gid = perms.gid();
6888 in->uid = stx->stx_uid;
28e407b8 6889 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6890 mask &= ~CEPH_SETATTR_UID;
6891 kill_sguid = true;
6892 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6893 }
6894 if (mask & CEPH_SETATTR_GID) {
6895 in->ctime = ceph_clock_now();
6896 in->cap_dirtier_uid = perms.uid();
6897 in->cap_dirtier_gid = perms.gid();
6898 in->gid = stx->stx_gid;
28e407b8 6899 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6900 mask &= ~CEPH_SETATTR_GID;
6901 kill_sguid = true;
6902 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6903 }
6904
6905 if (mask & CEPH_SETATTR_MODE) {
6906 in->ctime = ceph_clock_now();
6907 in->cap_dirtier_uid = perms.uid();
6908 in->cap_dirtier_gid = perms.gid();
6909 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 6910 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6911 mask &= ~CEPH_SETATTR_MODE;
6912 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6913 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6914 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6915 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 6916 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6917 }
6918
6919 if (mask & CEPH_SETATTR_BTIME) {
6920 in->ctime = ceph_clock_now();
6921 in->cap_dirtier_uid = perms.uid();
6922 in->cap_dirtier_gid = perms.gid();
6923 in->btime = utime_t(stx->stx_btime);
28e407b8 6924 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6925 mask &= ~CEPH_SETATTR_BTIME;
6926 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6927 }
6928 } else if (mask & CEPH_SETATTR_SIZE) {
6929 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6930 mask |= CEPH_SETATTR_KILL_SGUID;
6931 }
6932
6933 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6934 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6935 if (mask & CEPH_SETATTR_MTIME)
6936 in->mtime = utime_t(stx->stx_mtime);
6937 if (mask & CEPH_SETATTR_ATIME)
6938 in->atime = utime_t(stx->stx_atime);
6939 in->ctime = ceph_clock_now();
6940 in->cap_dirtier_uid = perms.uid();
6941 in->cap_dirtier_gid = perms.gid();
6942 in->time_warp_seq++;
28e407b8 6943 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
6944 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6945 }
6946 }
6947 if (!mask) {
6948 in->change_attr++;
6949 return 0;
6950 }
6951
6952force_request:
6953 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6954
6955 filepath path;
6956
6957 in->make_nosnap_relative_path(path);
6958 req->set_filepath(path);
6959 req->set_inode(in);
6960
6961 if (mask & CEPH_SETATTR_KILL_SGUID) {
6962 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6963 }
6964 if (mask & CEPH_SETATTR_MODE) {
6965 req->head.args.setattr.mode = stx->stx_mode;
6966 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6967 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6968 }
6969 if (mask & CEPH_SETATTR_UID) {
6970 req->head.args.setattr.uid = stx->stx_uid;
6971 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6972 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6973 }
6974 if (mask & CEPH_SETATTR_GID) {
6975 req->head.args.setattr.gid = stx->stx_gid;
6976 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6977 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6978 }
6979 if (mask & CEPH_SETATTR_BTIME) {
6980 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6981 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6982 }
6983 if (mask & CEPH_SETATTR_MTIME) {
6984 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 6985 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
6986 CEPH_CAP_FILE_WR;
6987 }
6988 if (mask & CEPH_SETATTR_ATIME) {
6989 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6990 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6991 CEPH_CAP_FILE_WR;
6992 }
6993 if (mask & CEPH_SETATTR_SIZE) {
6994 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6995 req->head.args.setattr.size = stx->stx_size;
6996 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6997 } else { //too big!
6998 put_request(req);
6999 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7000 return -EFBIG;
7001 }
94b18763 7002 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7003 CEPH_CAP_FILE_WR;
7004 }
7005 req->head.args.setattr.mask = mask;
7006
7007 req->regetattr_mask = mask;
7008
7009 int res = make_request(req, perms, inp);
7010 ldout(cct, 10) << "_setattr result=" << res << dendl;
7011 return res;
7012}
7013
7014/* Note that we only care about attrs that setattr cares about */
7015void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7016{
7017 stx->stx_size = st->st_size;
7018 stx->stx_mode = st->st_mode;
7019 stx->stx_uid = st->st_uid;
7020 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7021#ifdef __APPLE__
7022 stx->stx_mtime = st->st_mtimespec;
7023 stx->stx_atime = st->st_atimespec;
7024#else
7c673cae
FG
7025 stx->stx_mtime = st->st_mtim;
7026 stx->stx_atime = st->st_atim;
11fdf7f2 7027#endif
7c673cae
FG
7028}
7029
7030int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7031 const UserPerm& perms, InodeRef *inp)
7032{
7033 int ret = _do_setattr(in, stx, mask, perms, inp);
7034 if (ret < 0)
7035 return ret;
7036 if (mask & CEPH_SETATTR_MODE)
7037 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7038 return ret;
7039}
7040
7041int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7042 const UserPerm& perms)
7043{
7044 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7045 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7046 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7047 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7048 if (cct->_conf->client_permissions) {
7049 int r = may_setattr(in.get(), stx, mask, perms);
7050 if (r < 0)
7051 return r;
7052 }
7053 return __setattrx(in.get(), stx, mask, perms);
7054}
7055
7056int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7057 const UserPerm& perms)
7058{
7059 struct ceph_statx stx;
7060
7061 stat_to_statx(attr, &stx);
7062 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7063
7064 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7065 mask &= ~CEPH_SETATTR_UID;
7066 }
7067 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7068 mask &= ~CEPH_SETATTR_GID;
7069 }
7070
7c673cae
FG
7071 return _setattrx(in, &stx, mask, perms);
7072}
7073
7074int Client::setattr(const char *relpath, struct stat *attr, int mask,
7075 const UserPerm& perms)
7076{
11fdf7f2
TL
7077 std::lock_guard lock(client_lock);
7078 tout(cct) << __func__ << std::endl;
7c673cae
FG
7079 tout(cct) << relpath << std::endl;
7080 tout(cct) << mask << std::endl;
7081
181888fb
FG
7082 if (unmounting)
7083 return -ENOTCONN;
7084
7c673cae
FG
7085 filepath path(relpath);
7086 InodeRef in;
7087 int r = path_walk(path, &in, perms);
7088 if (r < 0)
7089 return r;
7090 return _setattr(in, attr, mask, perms);
7091}
7092
7093int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7094 const UserPerm& perms, int flags)
7095{
11fdf7f2
TL
7096 std::lock_guard lock(client_lock);
7097 tout(cct) << __func__ << std::endl;
7c673cae
FG
7098 tout(cct) << relpath << std::endl;
7099 tout(cct) << mask << std::endl;
7100
181888fb
FG
7101 if (unmounting)
7102 return -ENOTCONN;
7103
7c673cae
FG
7104 filepath path(relpath);
7105 InodeRef in;
7106 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7107 if (r < 0)
7108 return r;
7109 return _setattrx(in, stx, mask, perms);
7110}
7111
7112int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7113{
11fdf7f2
TL
7114 std::lock_guard lock(client_lock);
7115 tout(cct) << __func__ << std::endl;
7c673cae
FG
7116 tout(cct) << fd << std::endl;
7117 tout(cct) << mask << std::endl;
7118
181888fb
FG
7119 if (unmounting)
7120 return -ENOTCONN;
7121
7c673cae
FG
7122 Fh *f = get_filehandle(fd);
7123 if (!f)
7124 return -EBADF;
7125#if defined(__linux__) && defined(O_PATH)
7126 if (f->flags & O_PATH)
7127 return -EBADF;
7128#endif
7129 return _setattr(f->inode, attr, mask, perms);
7130}
7131
7132int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7133{
11fdf7f2
TL
7134 std::lock_guard lock(client_lock);
7135 tout(cct) << __func__ << std::endl;
7c673cae
FG
7136 tout(cct) << fd << std::endl;
7137 tout(cct) << mask << std::endl;
7138
181888fb
FG
7139 if (unmounting)
7140 return -ENOTCONN;
7141
7c673cae
FG
7142 Fh *f = get_filehandle(fd);
7143 if (!f)
7144 return -EBADF;
7145#if defined(__linux__) && defined(O_PATH)
7146 if (f->flags & O_PATH)
7147 return -EBADF;
7148#endif
7149 return _setattrx(f->inode, stx, mask, perms);
7150}
7151
7152int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7153 frag_info_t *dirstat, int mask)
7154{
11fdf7f2
TL
7155 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7156 std::lock_guard lock(client_lock);
7c673cae
FG
7157 tout(cct) << "stat" << std::endl;
7158 tout(cct) << relpath << std::endl;
181888fb
FG
7159
7160 if (unmounting)
7161 return -ENOTCONN;
7162
7c673cae
FG
7163 filepath path(relpath);
7164 InodeRef in;
7165 int r = path_walk(path, &in, perms, true, mask);
7166 if (r < 0)
7167 return r;
7168 r = _getattr(in, mask, perms);
7169 if (r < 0) {
11fdf7f2 7170 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7171 return r;
7172 }
7173 fill_stat(in, stbuf, dirstat);
11fdf7f2 7174 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7175 return r;
7176}
7177
7178unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7179{
7180 unsigned mask = 0;
7181
7182 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7183 if (flags & AT_NO_ATTR_SYNC)
7184 goto out;
7185
7186 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7187 mask |= CEPH_CAP_PIN;
7188 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7189 mask |= CEPH_CAP_AUTH_SHARED;
7190 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7191 mask |= CEPH_CAP_LINK_SHARED;
7192 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7193 mask |= CEPH_CAP_FILE_SHARED;
7194 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7195 mask |= CEPH_CAP_XATTR_SHARED;
7196out:
7197 return mask;
7198}
7199
7200int Client::statx(const char *relpath, struct ceph_statx *stx,
7201 const UserPerm& perms,
7202 unsigned int want, unsigned int flags)
7203{
11fdf7f2
TL
7204 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7205 std::lock_guard lock(client_lock);
7c673cae
FG
7206 tout(cct) << "statx" << std::endl;
7207 tout(cct) << relpath << std::endl;
181888fb
FG
7208
7209 if (unmounting)
7210 return -ENOTCONN;
7211
7c673cae
FG
7212 filepath path(relpath);
7213 InodeRef in;
7214
7215 unsigned mask = statx_to_mask(flags, want);
7216
7217 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7218 if (r < 0)
7219 return r;
7220
7221 r = _getattr(in, mask, perms);
7222 if (r < 0) {
11fdf7f2 7223 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7224 return r;
7225 }
7226
7227 fill_statx(in, mask, stx);
11fdf7f2 7228 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7229 return r;
7230}
7231
7232int Client::lstat(const char *relpath, struct stat *stbuf,
7233 const UserPerm& perms, frag_info_t *dirstat, int mask)
7234{
11fdf7f2
TL
7235 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7236 std::lock_guard lock(client_lock);
7237 tout(cct) << __func__ << std::endl;
7c673cae 7238 tout(cct) << relpath << std::endl;
181888fb
FG
7239
7240 if (unmounting)
7241 return -ENOTCONN;
7242
7c673cae
FG
7243 filepath path(relpath);
7244 InodeRef in;
7245 // don't follow symlinks
7246 int r = path_walk(path, &in, perms, false, mask);
7247 if (r < 0)
7248 return r;
7249 r = _getattr(in, mask, perms);
7250 if (r < 0) {
11fdf7f2 7251 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7252 return r;
7253 }
7254 fill_stat(in, stbuf, dirstat);
11fdf7f2 7255 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7256 return r;
7257}
7258
/*
 * Populate a struct stat from the client's cached inode state.
 * Returns the cap bits currently issued on the inode, so the caller
 * can judge how fresh the snapshot is. No locking is done here;
 * presumably the caller holds client_lock — TODO confirm.
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  /* The snapshot id doubles as the device number so snapshots of the
   * same file get distinct (dev, ino) pairs. */
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      /* CephFS keeps a directory's nlink at 1; synthesize the POSIX
       * count from the parent link, "." and one link per subdir. */
      st->st_nlink = 1 /* parent dentry */
	+ 1 /* <dir>/. */
	+ in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      ceph_abort();  /* any other nlink on a dir indicates corruption */
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  /* ctime is reported as the later of ctime and mtime */
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    /* Directory "size" is either the recursive byte count or the
     * entry count, depending on configuration. */
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  /* 512-byte blocks, rounded up */
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7320
/*
 * Populate a ceph_statx from cached inode state. 'mask' is the cap
 * mask from statx_to_mask(); each group of fields is filled (and its
 * CEPH_STATX_* bit set in stx_mask) only when the corresponding cap
 * bit is present, so callers see which fields are valid.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  /* Ownership, full mode and birth time come from AUTH caps */
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      /* Synthesize the POSIX nlink for a directory (see fill_stat) */
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
	  + 1 /* <dir>/. */
	  + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      /* Directory size: recursive bytes or entry count, by config */
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  /* 512-byte blocks */
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    /* Report the later of ctime and mtime as ctime */
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7403
// Mark a dentry as recently used in the client's dentry LRU so it is
// evicted later than colder entries.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7408
7409int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7410{
11fdf7f2
TL
7411 std::lock_guard lock(client_lock);
7412 tout(cct) << __func__ << std::endl;
7c673cae
FG
7413 tout(cct) << relpath << std::endl;
7414 tout(cct) << mode << std::endl;
181888fb
FG
7415
7416 if (unmounting)
7417 return -ENOTCONN;
7418
7c673cae
FG
7419 filepath path(relpath);
7420 InodeRef in;
7421 int r = path_walk(path, &in, perms);
7422 if (r < 0)
7423 return r;
7424 struct stat attr;
7425 attr.st_mode = mode;
7426 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7427}
7428
7429int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7430{
11fdf7f2
TL
7431 std::lock_guard lock(client_lock);
7432 tout(cct) << __func__ << std::endl;
7c673cae
FG
7433 tout(cct) << fd << std::endl;
7434 tout(cct) << mode << std::endl;
181888fb
FG
7435
7436 if (unmounting)
7437 return -ENOTCONN;
7438
7c673cae
FG
7439 Fh *f = get_filehandle(fd);
7440 if (!f)
7441 return -EBADF;
7442#if defined(__linux__) && defined(O_PATH)
7443 if (f->flags & O_PATH)
7444 return -EBADF;
7445#endif
7446 struct stat attr;
7447 attr.st_mode = mode;
7448 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7449}
7450
7451int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7452{
11fdf7f2
TL
7453 std::lock_guard lock(client_lock);
7454 tout(cct) << __func__ << std::endl;
7c673cae
FG
7455 tout(cct) << relpath << std::endl;
7456 tout(cct) << mode << std::endl;
181888fb
FG
7457
7458 if (unmounting)
7459 return -ENOTCONN;
7460
7c673cae
FG
7461 filepath path(relpath);
7462 InodeRef in;
7463 // don't follow symlinks
7464 int r = path_walk(path, &in, perms, false);
7465 if (r < 0)
7466 return r;
7467 struct stat attr;
7468 attr.st_mode = mode;
7469 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7470}
7471
7472int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7473 const UserPerm& perms)
7474{
11fdf7f2
TL
7475 std::lock_guard lock(client_lock);
7476 tout(cct) << __func__ << std::endl;
7c673cae
FG
7477 tout(cct) << relpath << std::endl;
7478 tout(cct) << new_uid << std::endl;
7479 tout(cct) << new_gid << std::endl;
181888fb
FG
7480
7481 if (unmounting)
7482 return -ENOTCONN;
7483
7c673cae
FG
7484 filepath path(relpath);
7485 InodeRef in;
7486 int r = path_walk(path, &in, perms);
7487 if (r < 0)
7488 return r;
7489 struct stat attr;
7490 attr.st_uid = new_uid;
7491 attr.st_gid = new_gid;
181888fb 7492 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7493}
7494
7495int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7496{
11fdf7f2
TL
7497 std::lock_guard lock(client_lock);
7498 tout(cct) << __func__ << std::endl;
7c673cae
FG
7499 tout(cct) << fd << std::endl;
7500 tout(cct) << new_uid << std::endl;
7501 tout(cct) << new_gid << std::endl;
181888fb
FG
7502
7503 if (unmounting)
7504 return -ENOTCONN;
7505
7c673cae
FG
7506 Fh *f = get_filehandle(fd);
7507 if (!f)
7508 return -EBADF;
7509#if defined(__linux__) && defined(O_PATH)
7510 if (f->flags & O_PATH)
7511 return -EBADF;
7512#endif
7513 struct stat attr;
7514 attr.st_uid = new_uid;
7515 attr.st_gid = new_gid;
7516 int mask = 0;
7517 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7518 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7519 return _setattr(f->inode, &attr, mask, perms);
7520}
7521
7522int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7523 const UserPerm& perms)
7524{
11fdf7f2
TL
7525 std::lock_guard lock(client_lock);
7526 tout(cct) << __func__ << std::endl;
7c673cae
FG
7527 tout(cct) << relpath << std::endl;
7528 tout(cct) << new_uid << std::endl;
7529 tout(cct) << new_gid << std::endl;
181888fb
FG
7530
7531 if (unmounting)
7532 return -ENOTCONN;
7533
7c673cae
FG
7534 filepath path(relpath);
7535 InodeRef in;
7536 // don't follow symlinks
7537 int r = path_walk(path, &in, perms, false);
7538 if (r < 0)
7539 return r;
7540 struct stat attr;
7541 attr.st_uid = new_uid;
7542 attr.st_gid = new_gid;
7543 int mask = 0;
7544 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7545 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7546 return _setattr(in, &attr, mask, perms);
7547}
7548
11fdf7f2
TL
7549static void attr_set_atime_and_mtime(struct stat *attr,
7550 const utime_t &atime,
7551 const utime_t &mtime)
7552{
7553 stat_set_atime_sec(attr, atime.tv.tv_sec);
7554 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7555 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7556 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7557}
7558
7559// for [l]utime() invoke the timeval variant as the timespec
7560// variant are not yet implemented. for futime[s](), invoke
7561// the timespec variant.
7c673cae
FG
7562int Client::utime(const char *relpath, struct utimbuf *buf,
7563 const UserPerm& perms)
7564{
11fdf7f2
TL
7565 struct timeval tv[2];
7566 tv[0].tv_sec = buf->actime;
7567 tv[0].tv_usec = 0;
7568 tv[1].tv_sec = buf->modtime;
7569 tv[1].tv_usec = 0;
7570
7571 return utimes(relpath, tv, perms);
7572}
7573
7574int Client::lutime(const char *relpath, struct utimbuf *buf,
7575 const UserPerm& perms)
7576{
7577 struct timeval tv[2];
7578 tv[0].tv_sec = buf->actime;
7579 tv[0].tv_usec = 0;
7580 tv[1].tv_sec = buf->modtime;
7581 tv[1].tv_usec = 0;
7582
7583 return lutimes(relpath, tv, perms);
7584}
7585
7586int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7587{
7588 struct timespec ts[2];
7589 ts[0].tv_sec = buf->actime;
7590 ts[0].tv_nsec = 0;
7591 ts[1].tv_sec = buf->modtime;
7592 ts[1].tv_nsec = 0;
7593
7594 return futimens(fd, ts, perms);
7595}
7596
7597int Client::utimes(const char *relpath, struct timeval times[2],
7598 const UserPerm& perms)
7599{
7600 std::lock_guard lock(client_lock);
7601 tout(cct) << __func__ << std::endl;
7c673cae 7602 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7603 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7604 << std::endl;
7605 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7606 << std::endl;
181888fb
FG
7607
7608 if (unmounting)
7609 return -ENOTCONN;
7610
7c673cae
FG
7611 filepath path(relpath);
7612 InodeRef in;
7613 int r = path_walk(path, &in, perms);
7614 if (r < 0)
7615 return r;
7616 struct stat attr;
11fdf7f2
TL
7617 utime_t atime(times[0]);
7618 utime_t mtime(times[1]);
7619
7620 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7621 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7622}
7623
11fdf7f2
TL
7624int Client::lutimes(const char *relpath, struct timeval times[2],
7625 const UserPerm& perms)
7c673cae 7626{
11fdf7f2
TL
7627 std::lock_guard lock(client_lock);
7628 tout(cct) << __func__ << std::endl;
7c673cae 7629 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7630 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7631 << std::endl;
7632 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7633 << std::endl;
181888fb
FG
7634
7635 if (unmounting)
7636 return -ENOTCONN;
7637
7c673cae
FG
7638 filepath path(relpath);
7639 InodeRef in;
7c673cae
FG
7640 int r = path_walk(path, &in, perms, false);
7641 if (r < 0)
7642 return r;
7643 struct stat attr;
11fdf7f2
TL
7644 utime_t atime(times[0]);
7645 utime_t mtime(times[1]);
7646
7647 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7648 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7649}
7650
11fdf7f2
TL
7651int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7652{
7653 struct timespec ts[2];
7654 ts[0].tv_sec = times[0].tv_sec;
7655 ts[0].tv_nsec = times[0].tv_usec * 1000;
7656 ts[1].tv_sec = times[1].tv_sec;
7657 ts[1].tv_nsec = times[1].tv_usec * 1000;
7658
7659 return futimens(fd, ts, perms);
7660}
7661
7662int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7663{
7664 std::lock_guard lock(client_lock);
7665 tout(cct) << __func__ << std::endl;
7666 tout(cct) << fd << std::endl;
7667 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7668 << std::endl;
7669 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7670 << std::endl;
7671
7672 if (unmounting)
7673 return -ENOTCONN;
7674
7675 Fh *f = get_filehandle(fd);
7676 if (!f)
7677 return -EBADF;
7678#if defined(__linux__) && defined(O_PATH)
7679 if (f->flags & O_PATH)
7680 return -EBADF;
7681#endif
7682 struct stat attr;
7683 utime_t atime(times[0]);
7684 utime_t mtime(times[1]);
7685
7686 attr_set_atime_and_mtime(&attr, atime, mtime);
7687 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7688}
7689
7c673cae
FG
7690int Client::flock(int fd, int operation, uint64_t owner)
7691{
11fdf7f2
TL
7692 std::lock_guard lock(client_lock);
7693 tout(cct) << __func__ << std::endl;
7c673cae
FG
7694 tout(cct) << fd << std::endl;
7695 tout(cct) << operation << std::endl;
7696 tout(cct) << owner << std::endl;
181888fb
FG
7697
7698 if (unmounting)
7699 return -ENOTCONN;
7700
7c673cae
FG
7701 Fh *f = get_filehandle(fd);
7702 if (!f)
7703 return -EBADF;
7704
7705 return _flock(f, operation, owner);
7706}
7707
7708int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7709{
11fdf7f2
TL
7710 std::lock_guard lock(client_lock);
7711 tout(cct) << __func__ << std::endl;
7c673cae 7712 tout(cct) << relpath << std::endl;
181888fb
FG
7713
7714 if (unmounting)
7715 return -ENOTCONN;
7716
7c673cae
FG
7717 filepath path(relpath);
7718 InodeRef in;
7719 int r = path_walk(path, &in, perms, true);
7720 if (r < 0)
7721 return r;
7722 if (cct->_conf->client_permissions) {
7723 int r = may_open(in.get(), O_RDONLY, perms);
7724 if (r < 0)
7725 return r;
7726 }
7727 r = _opendir(in.get(), dirpp, perms);
7728 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7729 if (r != -ENOTDIR)
7730 tout(cct) << (unsigned long)*dirpp << std::endl;
7731 return r;
7732}
7733
7734int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7735{
7736 if (!in->is_dir())
7737 return -ENOTDIR;
7738 *dirpp = new dir_result_t(in, perms);
7739 opened_dirs.insert(*dirpp);
11fdf7f2 7740 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7741 return 0;
7742}
7743
7744
// Public closedir: logs the call and tears down the handle. Always
// returns 0 (the handle is destroyed unconditionally).
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7755
// Tear down a dir_result_t: drop the inode reference, free any
// buffered readdir entries, deregister, and delete the handle.
// The dirp pointer is invalid after this returns.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();  // release the InodeRef before freeing buffers
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7767
7768void Client::rewinddir(dir_result_t *dirp)
7769{
11fdf7f2
TL
7770 std::lock_guard lock(client_lock);
7771 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb
FG
7772
7773 if (unmounting)
7774 return;
7775
7c673cae
FG
7776 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7777 _readdir_drop_dirp_buffer(d);
7778 d->reset();
7779}
7780
7781loff_t Client::telldir(dir_result_t *dirp)
7782{
7783 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 7784 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
7785 return d->offset;
7786}
7787
// Seek a directory handle to a packed fpos offset (frag in the high
// bits, position within the frag in the low bits). Invalidates the
// readdir buffer whenever the cached window cannot serve the target.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash order: only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: refetch on rewind-to-start, a different frag, or a
    // backward move within the current frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7821
7822
7823//struct dirent {
7824// ino_t d_ino; /* inode number */
7825// off_t d_off; /* offset to the next dirent */
7826// unsigned short d_reclen; /* length of this record */
7827// unsigned char d_type; /* type of file */
7828// char d_name[256]; /* filename */
7829//};
// Fill a struct dirent for readdir. The name is truncated to 255
// bytes and always NUL-terminated; d_off is only populated on
// platforms whose dirent has that member.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;  // nominal record length; not a real byte count here
  de->d_type = IFTODT(type);  // convert S_IF* mode bits to DT_* type
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7845
// Advance a directory handle past the frag whose contents are
// currently buffered. At the rightmost frag the handle is marked
// at-end instead.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag order: restart at position 2 (after . and ..) in the next
    // frag, and re-resolve it against the current fragtree
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7871
// Re-resolve the handle's current frag against the (possibly changed)
// directory fragtree; if the frag was split/merged, restart at the
// containing frag's beginning. No-op in hash order, where positions
// are stable across frag changes.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;  // skip the synthetic . and .. slots
  }
}
7888
// Discard the buffered readdir entries for this handle; the next read
// will refetch from the MDS (or the dentry cache).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7894
// Fetch one directory frag's worth of entries from the MDS into the
// handle's buffer. On -EAGAIN (frag changed under us) it re-resolves
// the frag and retries; any other error marks the handle at-end.
// Returns 0 on success or a negative errno.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  // listing a snapshot directory uses LSSNAP instead of READDIR
  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // resume after the last entry we already returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // frag mapping changed (split/merge); re-resolve and retry.
    // NOTE(review): recursion depth is bounded only by how often the
    // fragtree keeps changing under us.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7949
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by readdir offset using dir_result_t::fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7955
// Serve readdir entries straight from the local readdir_cache, invoking
// `cb` once per entry.  Returns 0 at end-of-directory, -EAGAIN if the
// cached view stops being complete+ordered mid-walk (caller then falls
// back to an MDS readdir), <0 on other errors, or >0 if the callback
// asked to stop early.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(client_lock.is_locked());
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume where we left off: first cached dentry at/after dirp->offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN; // cache invalidated under us; caller must refetch
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // stale dentry from an older cap generation
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop client_lock across the callback; it may block or re-enter
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer matches the cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8039
// Core readdir loop: emits ".", "..", then real entries, calling `cb` once
// per entry.  Serves from the local cache when the directory is known
// complete+ordered and we hold FILE_SHARED; otherwise pages fragments in
// from the MDS.  Returns 0 at end of directory, <0 on error, or the
// callback's positive value if it requested an early stop.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 -> synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the client lock around the callback; it may block or re-enter
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 -> synthesize ".."
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri; // no parent linkage: ".." resolves to the dir itself
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
		 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
		 << dirp->inode->is_complete_and_ordered()
		 << " issued " << ccap_string(dirp->inode->caps_issued())
		 << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache went stale mid-walk; fall through to MDS readdir
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag
      // differs from the requested one (our dirfragtree was outdated)
      check_caps = false; // attrs in the fresh reply are already current
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // walked every frag; if the dir did not change underneath us, mark it
    // complete so future readdirs can be served from cache
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort(); // not reached: the loop always returns
  return 0;
}
8232
8233
8234int Client::readdir_r(dir_result_t *d, struct dirent *de)
8235{
8236 return readdirplus_r(d, de, 0, 0, 0, NULL);
8237}
8238
8239/*
8240 * readdirplus_r
8241 *
8242 * returns
8243 * 1 if we got a dirent
8244 * 0 for end of directory
8245 * <0 on error
8246 */
8247
// Callback context for the "return exactly one entry" readdir helpers.
struct single_readdir {
  struct dirent *de;      // caller's dirent to fill
  struct ceph_statx *stx; // optional statx to fill (may be NULL)
  Inode *inode;           // inode handed back by the callback
  bool full;              // set once an entry has been captured
};
8254
8255static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8256 struct ceph_statx *stx, off_t off,
8257 Inode *in)
8258{
8259 single_readdir *c = static_cast<single_readdir *>(p);
8260
8261 if (c->full)
8262 return -1; // already filled this dirent
8263
8264 *c->de = *de;
8265 if (c->stx)
8266 *c->stx = *stx;
8267 c->inode = in;
8268 c->full = true;
8269 return 1;
8270}
8271
// readdir(3)-style interface: returns a pointer to a static dirent for the
// next entry, NULL at end of directory, or NULL with errno set on error.
// NOTE: the static buffer makes this non-reentrant, matching the classic
// readdir(3) contract.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8294
8295int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8296 struct ceph_statx *stx, unsigned want,
8297 unsigned flags, Inode **out)
8298{
8299 single_readdir sr;
8300 sr.de = de;
8301 sr.stx = stx;
8302 sr.inode = NULL;
8303 sr.full = false;
8304
8305 // our callback fills the dirent and sets sr.full=true on first
8306 // call, and returns -1 the second time around.
8307 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8308 if (r < -1)
8309 return r;
8310 if (out)
8311 *out = sr.inode;
8312 if (sr.full)
8313 return 1;
8314 return 0;
8315}
8316
8317
8318/* getdents */
/* getdents */
// Accumulator for _getdents: packs entries into the caller's flat buffer.
struct getdents_result {
  char *buf;    // destination buffer
  int buflen;   // total capacity of buf
  int pos;      // bytes written so far
  bool fullent; // true: copy whole dirents; false: just NUL-terminated names
};
8325
8326static int _readdir_getdent_cb(void *p, struct dirent *de,
8327 struct ceph_statx *stx, off_t off, Inode *in)
8328{
8329 struct getdents_result *c = static_cast<getdents_result *>(p);
8330
8331 int dlen;
8332 if (c->fullent)
8333 dlen = sizeof(*de);
8334 else
8335 dlen = strlen(de->d_name) + 1;
8336
8337 if (c->pos + dlen > c->buflen)
8338 return -1; // doesn't fit
8339
8340 if (c->fullent) {
8341 memcpy(c->buf + c->pos, de, sizeof(*de));
8342 } else {
8343 memcpy(c->buf + c->pos, de->d_name, dlen);
8344 }
8345 c->pos += dlen;
8346 return 0;
8347}
8348
8349int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8350{
8351 getdents_result gr;
8352 gr.buf = buf;
8353 gr.buflen = buflen;
8354 gr.fullent = fullent;
8355 gr.pos = 0;
8356
8357 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8358
8359 if (r < 0) { // some error
8360 if (r == -1) { // buffer ran out of space
8361 if (gr.pos) { // but we got some entries already!
8362 return gr.pos;
8363 } // or we need a larger buffer
8364 return -ERANGE;
8365 } else { // actual error, return it
8366 return r;
8367 }
8368 }
8369 return gr.pos;
8370}
8371
8372
8373/* getdir */
// Accumulator for getdir(): collects entry names and counts them.
struct getdir_result {
  list<string> *contents; // receives each entry's d_name
  int num;                // number of entries appended
};
8378
8379static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8380{
8381 getdir_result *r = static_cast<getdir_result *>(p);
8382
8383 r->contents->push_back(de->d_name);
8384 r->num++;
8385 return 0;
8386}
8387
// Convenience wrapper: list every entry name of `relpath` into `contents`.
// Returns the number of entries collected, or -errno.
int Client::getdir(const char *relpath, list<string>& contents,
		   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    std::lock_guard lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d); // always close, even if the walk failed

  if (r < 0)
    return r;
  return gr.num;
}
8414
8415
8416/****** file i/o **********/
/****** file i/o **********/
// Path-based open.  The striping parameters (stripe_unit/count,
// object_size, data_pool) only apply when the open creates the file.
// Returns a client fd (>= 0) on success or -errno.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file must fail
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and O_CREAT given: walk to the parent and create there
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may have already produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8501
8502int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8503{
8504 /* Use default file striping parameters */
8505 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8506}
8507
8508int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8509 const UserPerm& perms)
8510{
11fdf7f2
TL
8511 std::lock_guard lock(client_lock);
8512 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8513
181888fb
FG
8514 if (unmounting)
8515 return -ENOTCONN;
8516
7c673cae
FG
8517 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8518 filepath path(ino);
8519 req->set_filepath(path);
8520
8521 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8522 char f[30];
8523 sprintf(f, "%u", h);
8524 filepath path2(dirino);
8525 path2.push_dentry(string(f));
8526 req->set_filepath2(path2);
8527
8528 int r = make_request(req, perms, NULL, NULL,
8529 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8530 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8531 return r;
8532}
8533
8534
8535/**
8536 * Load inode into local cache.
8537 *
8538 * If inode pointer is non-NULL, and take a reference on
8539 * the resulting Inode object in one operation, so that caller
8540 * can safely assume inode will still be there after return.
8541 */
1adf2230 8542int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8543{
11fdf7f2 8544 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
7c673cae 8545
181888fb
FG
8546 if (unmounting)
8547 return -ENOTCONN;
8548
7c673cae
FG
8549 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8550 filepath path(ino);
8551 req->set_filepath(path);
8552
8553 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8554 if (r == 0 && inode != NULL) {
8555 vinodeno_t vino(ino, CEPH_NOSNAP);
8556 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 8557 ceph_assert(p != inode_map.end());
7c673cae
FG
8558 *inode = p->second;
8559 _ll_get(*inode);
8560 }
11fdf7f2 8561 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8562 return r;
8563}
8564
1adf2230
AA
// Public wrapper: take client_lock, then perform the inode lookup.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8570
8571/**
8572 * Find the parent inode of `ino` and insert it into
8573 * our cache. Conditionally also set `parent` to a referenced
8574 * Inode* if caller provides non-NULL value.
8575 */
1adf2230 8576int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8577{
11fdf7f2 8578 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8579
7c673cae
FG
8580 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8581 filepath path(ino->ino);
8582 req->set_filepath(path);
8583
8584 InodeRef target;
8585 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8586 // Give caller a reference to the parent ino if they provided a pointer.
8587 if (parent != NULL) {
8588 if (r == 0) {
8589 *parent = target.get();
8590 _ll_get(*parent);
11fdf7f2 8591 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
8592 } else {
8593 *parent = NULL;
8594 }
8595 }
11fdf7f2 8596 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8597 return r;
8598}
8599
7c673cae
FG
8600/**
8601 * Populate the parent dentry for `ino`, provided it is
8602 * a child of `parent`.
8603 */
1adf2230 8604int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 8605{
11fdf7f2
TL
8606 ceph_assert(parent->is_dir());
8607 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8608
181888fb
FG
8609 if (unmounting)
8610 return -ENOTCONN;
8611
7c673cae
FG
8612 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8613 req->set_filepath2(filepath(parent->ino));
8614 req->set_filepath(filepath(ino->ino));
8615 req->set_inode(ino);
8616
8617 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 8618 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8619 return r;
8620}
8621
1adf2230
AA
// Public wrapper: take client_lock, then perform the name lookup.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae 8627
11fdf7f2 8628Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 8629{
11fdf7f2
TL
8630 ceph_assert(in);
8631 Fh *f = new Fh(in, flags, cmode, perms);
7c673cae 8632
11fdf7f2 8633 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
8634
8635 if (in->snapid != CEPH_NOSNAP) {
8636 in->snap_cap_refs++;
8637 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8638 << ccap_string(in->caps_issued()) << dendl;
8639 }
8640
11fdf7f2 8641 const auto& conf = cct->_conf;
7c673cae
FG
8642 f->readahead.set_trigger_requests(1);
8643 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8644 uint64_t max_readahead = Readahead::NO_LIMIT;
8645 if (conf->client_readahead_max_bytes) {
11fdf7f2 8646 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
8647 }
8648 if (conf->client_readahead_max_periods) {
11fdf7f2 8649 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
8650 }
8651 f->readahead.set_max_readahead_size(max_readahead);
8652 vector<uint64_t> alignments;
8653 alignments.push_back(in->layout.get_period());
8654 alignments.push_back(in->layout.stripe_unit);
8655 f->readahead.set_alignments(alignments);
8656
8657 return f;
8658}
8659
// Tear down a file handle: drop delegations, release open refs (flushing
// dirty data when the last ref in this mode goes away), release file
// locks, and surface any deferred async write-back error to the caller.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes only track a simple open-handle count
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8694
8695void Client::_put_fh(Fh *f)
8696{
8697 int left = f->put();
8698 if (!left) {
8699 delete f;
8700 }
8701}
8702
// Core open path for an already-resolved inode: negotiate caps with the
// MDS (or satisfy the open locally if we already hold what this open mode
// wants), then build the Fh.  Snapshot inodes are read-only.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode); // undo the pending-open ref on failure
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8783
// Re-request file caps from the MDS for this inode.  If we already hold
// some caps and either want no write caps or still know the auth cap, a
// plain cap check/update suffices; otherwise re-issue an OPEN with flags
// matching the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted caps back into open flags for the OPEN request
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8821
// Close a client fd: release the underlying Fh and free the fd slot.
// Returns any deferred asynchronous I/O error reported by _release_fh.
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
8841
8842
8843// ------------
8844// read, write
8845
// fd-based seek: validate the handle, then defer to _lseek.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF; // O_PATH handles allow no I/O operations
#endif
  return _lseek(f, offset, whence);
}
8866
8867loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8868{
8869 Inode *in = f->inode.get();
8870 int r;
11fdf7f2 8871 loff_t pos = -1;
7c673cae
FG
8872
8873 switch (whence) {
8874 case SEEK_SET:
11fdf7f2 8875 pos = offset;
7c673cae
FG
8876 break;
8877
8878 case SEEK_CUR:
11fdf7f2 8879 pos += offset;
7c673cae
FG
8880 break;
8881
8882 case SEEK_END:
8883 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8884 if (r < 0)
8885 return r;
11fdf7f2 8886 pos = in->size + offset;
7c673cae
FG
8887 break;
8888
8889 default:
8890 ceph_abort();
8891 }
8892
11fdf7f2
TL
8893 if (pos < 0) {
8894 return -EINVAL;
8895 } else {
8896 f->pos = pos;
8897 }
8898
1adf2230 8899 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8900 return f->pos;
8901}
8902
8903
// Serialize access to f->pos.  Each waiter queues its own condition
// variable in FIFO order and blocks until it reaches the front with the
// lock free.  client_lock must be held on entry (Cond::Wait drops and
// reacquires it).
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8921
// Release the file-position lock taken by lock_fh_pos().
void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
}
8927
// Migrate inline file data out to the file's first RADOS object
// (<ino>.00000000).  `onfinish` fires when the final mutation completes;
// if there is no inline data it completes immediately with 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // First mutation: make sure the object exists (non-exclusive create).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Second mutation: write the inline data, guarded by a cmpxattr so we
  // only win if our inline_version is newer than what the object records.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8972
8973//
8974
8975// blocking osd interface
8976
8977int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8978{
11fdf7f2 8979 std::lock_guard lock(client_lock);
7c673cae
FG
8980 tout(cct) << "read" << std::endl;
8981 tout(cct) << fd << std::endl;
8982 tout(cct) << size << std::endl;
8983 tout(cct) << offset << std::endl;
8984
181888fb
FG
8985 if (unmounting)
8986 return -ENOTCONN;
8987
7c673cae
FG
8988 Fh *f = get_filehandle(fd);
8989 if (!f)
8990 return -EBADF;
8991#if defined(__linux__) && defined(O_PATH)
8992 if (f->flags & O_PATH)
8993 return -EBADF;
8994#endif
8995 bufferlist bl;
11fdf7f2
TL
8996 /* We can't return bytes written larger than INT_MAX, clamp size to that */
8997 size = std::min(size, (loff_t)INT_MAX);
7c673cae
FG
8998 int r = _read(f, offset, size, &bl);
8999 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9000 if (r >= 0) {
9001 bl.copy(0, bl.length(), buf);
9002 r = bl.length();
9003 }
9004 return r;
9005}
9006
9007int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9008{
9009 if (iovcnt < 0)
9010 return -EINVAL;
9011 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9012}
9013
/*
 * Core read path shared by read()/preadv()/ll reads.
 *
 * @param f      open file handle (must have been opened for reading)
 * @param offset byte offset to read from; negative means "use and advance
 *               the fh position" (serialized via lock_fh_pos)
 * @param size   maximum number of bytes to read
 * @param bl     out: the data read
 * @return bytes read on success, negative errno on failure
 *
 * Called with client_lock held.  May drop and re-acquire it while waiting
 * for an uninline of inline data to complete.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now(); 

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // implicit offset: take the fh position lock and read from f->pos
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it from the MDS before deciding a path
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the object cache entirely
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the CACHE cap: kick off an
      // uninline and fall through to a normal object read below
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read straight out of the inline blob
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          // read straddles the end of the inline blob; zero-fill the tail
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        // hole past the inline blob but still inside the file: all zeros
        bl->append_zero(endoff - offset);
        r = endoff - offset;
      } else {
        r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: our cached size may be stale; drop caps, refresh the
      // size from the MDS, and retry if there is more file past 'offset'
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait (lock dropped) for the uninline kicked off above to complete
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      // inline data now lives in the object; forget the local copy
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9153
// Completion context for a background readahead: pins the Fh and accounts
// one in-flight readahead for the lifetime of the context.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();                    // keep the file handle alive until we finish
  f->readahead.inc_pending();  // record the readahead as in flight
}
9159
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();  // readahead no longer in flight
  client->_put_fh(f);          // drop the ref taken in the constructor
}
9164
// Runs when the readahead I/O completes: releases the cap refs taken when
// the readahead was initiated in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9169
/*
 * Read through the object cacher, possibly triggering configured
 * readahead of subsequent data.
 *
 * @param f   open file handle
 * @param off byte offset (non-negative; resolved by the caller)
 * @param len number of bytes wanted (trimmed to the cached file size)
 * @param bl  out: the data read
 * @return bytes read on success, negative errno on failure
 *
 * Called with client_lock held; drops and re-acquires it while waiting on
 * a cache miss.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: the read was queued; wait for it with the lock dropped,
    // holding a CACHE cap ref for the duration
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    // ask the readahead engine whether (and how much) to read ahead
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead queued; cap refs are released by C_Readahead::finish
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9224
9225int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9226 bool *checkeof)
9227{
9228 Inode *in = f->inode.get();
9229 uint64_t pos = off;
9230 int left = len;
9231 int read = 0;
9232
11fdf7f2 9233 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
9234
9235 Mutex flock("Client::_read_sync flock");
9236 Cond cond;
9237 while (left > 0) {
11fdf7f2 9238 C_SaferCond onfinish("Client::_read_sync flock");
7c673cae
FG
9239 bufferlist tbl;
9240
9241 int wanted = left;
9242 filer->read_trunc(in->ino, &in->layout, in->snapid,
9243 pos, left, &tbl, 0,
9244 in->truncate_size, in->truncate_seq,
11fdf7f2 9245 &onfinish);
7c673cae 9246 client_lock.Unlock();
11fdf7f2 9247 int r = onfinish.wait();
7c673cae
FG
9248 client_lock.Lock();
9249
9250 // if we get ENOENT from OSD, assume 0 bytes returned
9251 if (r == -ENOENT)
9252 r = 0;
9253 if (r < 0)
9254 return r;
9255 if (tbl.length()) {
9256 r = tbl.length();
9257
9258 read += r;
9259 pos += r;
9260 left -= r;
9261 bl->claim_append(tbl);
9262 }
9263 // short read?
9264 if (r >= 0 && r < wanted) {
9265 if (pos < in->size) {
9266 // zero up to known EOF
9267 int64_t some = in->size - pos;
9268 if (some > left)
9269 some = left;
11fdf7f2
TL
9270 auto z = buffer::ptr_node::create(some);
9271 z->zero();
9272 bl->push_back(std::move(z));
7c673cae
FG
9273 read += some;
9274 pos += some;
9275 left -= some;
9276 if (left == 0)
9277 return read;
9278 }
9279
9280 *checkeof = true;
9281 return read;
9282 }
9283 }
9284 return read;
9285}
9286
9287
/*
 * We keep a count of uncommitted synchronous writes on the inode
 * (unsafe_sync_write), so that fsync can wait for them all to commit
 * before returning.
 */
// Bookkeeping for completion of one synchronous (uncached) write: drops
// the unsafe-write count and the BUFFER cap ref taken in _write(), and
// wakes the unmount path if it was waiting for the last unsafe write.
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // unmount is blocked on mount_cond until unsafe writes drain
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9305
9306int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9307{
11fdf7f2 9308 std::lock_guard lock(client_lock);
7c673cae
FG
9309 tout(cct) << "write" << std::endl;
9310 tout(cct) << fd << std::endl;
9311 tout(cct) << size << std::endl;
9312 tout(cct) << offset << std::endl;
9313
181888fb
FG
9314 if (unmounting)
9315 return -ENOTCONN;
9316
7c673cae
FG
9317 Fh *fh = get_filehandle(fd);
9318 if (!fh)
9319 return -EBADF;
9320#if defined(__linux__) && defined(O_PATH)
9321 if (fh->flags & O_PATH)
9322 return -EBADF;
9323#endif
11fdf7f2
TL
9324 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9325 size = std::min(size, (loff_t)INT_MAX);
9326 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
9327 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9328 return r;
9329}
9330
9331int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9332{
9333 if (iovcnt < 0)
9334 return -EINVAL;
9335 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9336}
9337
11fdf7f2
TL
/*
 * Shared scatter/gather I/O implementation behind preadv/pwritev.
 * Caller must hold client_lock and have validated the file handle.
 *
 * @param fh           open file handle
 * @param iov          iovec array to read into / write from
 * @param iovcnt       number of entries in iov
 * @param offset       file offset (negative = use the fh position)
 * @param write        true for the write path, false for the read path
 * @param clamp_to_int clamp the total I/O to INT_MAX for public callers
 *                     that can only return a 32-bit result
 * @return bytes transferred on success, negative errno on failure
 */
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				   unsigned iovcnt, int64_t offset, bool write,
				   bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the bytes we read across the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9388
11fdf7f2
TL
9389int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9390{
9391 std::lock_guard lock(client_lock);
9392 tout(cct) << fd << std::endl;
9393 tout(cct) << offset << std::endl;
9394
9395 if (unmounting)
9396 return -ENOTCONN;
9397
9398 Fh *fh = get_filehandle(fd);
9399 if (!fh)
9400 return -EBADF;
9401 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9402}
9403
/*
 * Core write path shared by write()/pwritev()/ll writes.
 *
 * Exactly one of (buf, iov) supplies the data: a flat buffer of 'size'
 * bytes, or 'iovcnt' iovecs (concatenated into one bufferlist).
 *
 * @param f      open file handle (must have been opened for writing)
 * @param offset byte offset; negative means "use and advance the fh
 *               position" (honoring O_APPEND via _lseek to EOF)
 * @param size   total number of bytes to write
 * @return bytes written on success, negative errno on failure
 *
 * Called with client_lock held.  May drop and re-acquire it while waiting
 * for a sync write or an uninline to complete.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  // NOTE(review): this check runs before a negative 'offset' is resolved
  // to the fh position below, so it evaluates (offset+size) with the
  // sentinel value — confirm intended for offset<0 callers.
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;  // new fh position to install after a successful write
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // inline state unknown; fetch it from the MDS before deciding a path
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed to inspect in->mode for the SUID/SGID clearing
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses buffering entirely
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result too big to stay inline (or we lack BUFFER): uninline first,
      // then do a normal object write below
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // update the inline blob in place: keep the tail past endoff,
      // drop/extend up to offset, then append the new data
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    // implicit-offset write: advance the fh position
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // wait (lock dropped) for the uninline kicked off above to complete
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9634
9635int Client::_flush(Fh *f)
9636{
9637 Inode *in = f->inode.get();
9638 int err = f->take_async_err();
9639 if (err != 0) {
9640 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9641 << cpp_strerror(err) << dendl;
9642 } else {
9643 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9644 }
9645
9646 return err;
9647}
9648
// Truncate the file at 'relpath' to 'length' bytes: thin wrapper that
// funnels into setattrx() with only the size field set.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;  // only stx_size is consumed, per CEPH_SETATTR_SIZE
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9655
9656int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9657{
11fdf7f2
TL
9658 std::lock_guard lock(client_lock);
9659 tout(cct) << __func__ << std::endl;
7c673cae
FG
9660 tout(cct) << fd << std::endl;
9661 tout(cct) << length << std::endl;
9662
181888fb
FG
9663 if (unmounting)
9664 return -ENOTCONN;
9665
7c673cae
FG
9666 Fh *f = get_filehandle(fd);
9667 if (!f)
9668 return -EBADF;
9669#if defined(__linux__) && defined(O_PATH)
9670 if (f->flags & O_PATH)
9671 return -EBADF;
9672#endif
9673 struct stat attr;
9674 attr.st_size = length;
9675 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9676}
9677
9678int Client::fsync(int fd, bool syncdataonly)
9679{
11fdf7f2 9680 std::lock_guard lock(client_lock);
7c673cae
FG
9681 tout(cct) << "fsync" << std::endl;
9682 tout(cct) << fd << std::endl;
9683 tout(cct) << syncdataonly << std::endl;
9684
181888fb
FG
9685 if (unmounting)
9686 return -ENOTCONN;
9687
7c673cae
FG
9688 Fh *f = get_filehandle(fd);
9689 if (!f)
9690 return -EBADF;
9691#if defined(__linux__) && defined(O_PATH)
9692 if (f->flags & O_PATH)
9693 return -EBADF;
9694#endif
9695 int r = _fsync(f, syncdataonly);
9696 if (r == 0) {
9697 // The IOs in this fsync were okay, but maybe something happened
9698 // in the background that we shoudl be reporting?
9699 r = f->take_async_err();
1adf2230 9700 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
7c673cae
FG
9701 << ") = 0, async_err = " << r << dendl;
9702 } else {
9703 // Assume that an error we encountered during fsync, even reported
9704 // synchronously, would also have applied the error to the Fh, and we
9705 // should clear it here to avoid returning the same error again on next
9706 // call.
1adf2230 9707 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
7c673cae
FG
9708 << r << dendl;
9709 f->take_async_err();
9710 }
9711 return r;
9712}
9713
/*
 * Flush dirty state for an inode to stable storage.
 *
 * @param in            inode to sync
 * @param syncdataonly  if true, only file data is flushed (fdatasync
 *                      semantics); otherwise dirty caps are flushed and
 *                      unsafe MDS requests are waited on as well
 * @return 0 on success, negative errno on failure
 *
 * Called with client_lock held; drops and re-acquires it while waiting
 * for the object cacher flush to complete.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // flush dirty metadata caps to the MDS synchronously
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // force the MDS to commit its log, then wait for the newest unsafe
    // request on this inode (which implies all earlier ones) to be safe
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9780
9781int Client::_fsync(Fh *f, bool syncdataonly)
9782{
1adf2230 9783 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9784 return _fsync(f->inode.get(), syncdataonly);
9785}
9786
9787int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9788{
11fdf7f2 9789 std::lock_guard lock(client_lock);
7c673cae
FG
9790 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9791 tout(cct) << fd << std::endl;
9792
181888fb
FG
9793 if (unmounting)
9794 return -ENOTCONN;
9795
7c673cae
FG
9796 Fh *f = get_filehandle(fd);
9797 if (!f)
9798 return -EBADF;
9799 int r = _getattr(f->inode, mask, perms);
9800 if (r < 0)
9801 return r;
9802 fill_stat(f->inode, stbuf, NULL);
1adf2230 9803 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
9804 return r;
9805}
9806
9807int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9808 unsigned int want, unsigned int flags)
9809{
11fdf7f2 9810 std::lock_guard lock(client_lock);
7c673cae
FG
9811 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9812 tout(cct) << fd << std::endl;
9813
181888fb
FG
9814 if (unmounting)
9815 return -ENOTCONN;
9816
7c673cae
FG
9817 Fh *f = get_filehandle(fd);
9818 if (!f)
9819 return -EBADF;
9820
9821 unsigned mask = statx_to_mask(flags, want);
9822
9823 int r = 0;
94b18763 9824 if (mask && !f->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
9825 r = _getattr(f->inode, mask, perms);
9826 if (r < 0) {
9827 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9828 return r;
9829 }
9830 }
9831
9832 fill_statx(f->inode, mask, stx);
9833 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9834 return r;
9835}
9836
9837// not written yet, but i want to link!
9838
9839int Client::chdir(const char *relpath, std::string &new_cwd,
9840 const UserPerm& perms)
9841{
11fdf7f2 9842 std::lock_guard lock(client_lock);
7c673cae
FG
9843 tout(cct) << "chdir" << std::endl;
9844 tout(cct) << relpath << std::endl;
181888fb
FG
9845
9846 if (unmounting)
9847 return -ENOTCONN;
9848
7c673cae
FG
9849 filepath path(relpath);
9850 InodeRef in;
9851 int r = path_walk(path, &in, perms);
9852 if (r < 0)
9853 return r;
9854 if (cwd != in)
9855 cwd.swap(in);
9856 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9857
b5b8bbf5 9858 _getcwd(new_cwd, perms);
7c673cae
FG
9859 return 0;
9860}
9861
/*
 * Reconstruct the absolute path of the current working directory by
 * walking parent dentries from cwd up to the mount root.
 *
 * If a parent link is missing from the cache, a LOOKUPNAME request is
 * sent to the MDS and the walk restarts from cwd.  If cwd (or an
 * ancestor) has been unlinked, 'dir' is left unmodified.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over — the lookup may have populated the missing dentry
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9901
b5b8bbf5
FG
9902void Client::getcwd(string& dir, const UserPerm& perms)
9903{
11fdf7f2 9904 std::lock_guard l(client_lock);
181888fb
FG
9905 if (!unmounting)
9906 _getcwd(dir, perms);
b5b8bbf5
FG
9907}
9908
7c673cae
FG
9909int Client::statfs(const char *path, struct statvfs *stbuf,
9910 const UserPerm& perms)
9911{
11fdf7f2
TL
9912 std::lock_guard l(client_lock);
9913 tout(cct) << __func__ << std::endl;
91327a77 9914 unsigned long int total_files_on_fs;
7c673cae 9915
181888fb
FG
9916 if (unmounting)
9917 return -ENOTCONN;
9918
7c673cae
FG
9919 ceph_statfs stats;
9920 C_SaferCond cond;
d2e6a577
FG
9921
9922 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9923 if (data_pools.size() == 1) {
9924 objecter->get_fs_stats(stats, data_pools[0], &cond);
9925 } else {
9926 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9927 }
7c673cae
FG
9928
9929 client_lock.Unlock();
9930 int rval = cond.wait();
91327a77
AA
9931 assert(root);
9932 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9933 client_lock.Lock();
9934
9935 if (rval < 0) {
9936 ldout(cct, 1) << "underlying call to statfs returned error: "
9937 << cpp_strerror(rval)
9938 << dendl;
9939 return rval;
9940 }
9941
9942 memset(stbuf, 0, sizeof(*stbuf));
9943
9944 /*
9945 * we're going to set a block size of 4MB so we can represent larger
9946 * FSes without overflowing. Additionally convert the space
9947 * measurements from KB to bytes while making them in terms of
9948 * blocks. We use 4MB only because it is big enough, and because it
9949 * actually *is* the (ceph) default block size.
9950 */
9951 const int CEPH_BLOCK_SHIFT = 22;
9952 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9953 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9954 stbuf->f_files = total_files_on_fs;
9955 stbuf->f_ffree = 0;
7c673cae
FG
9956 stbuf->f_favail = -1;
9957 stbuf->f_fsid = -1; // ??
9958 stbuf->f_flag = 0; // ??
9959 stbuf->f_namemax = NAME_MAX;
9960
9961 // Usually quota_root will == root_ancestor, but if the mount root has no
9962 // quota but we can see a parent of it that does have a quota, we'll
9963 // respect that one instead.
11fdf7f2 9964 ceph_assert(root != nullptr);
7c673cae
FG
9965 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9966
9967 // get_quota_root should always give us something
9968 // because client quotas are always enabled
11fdf7f2 9969 ceph_assert(quota_root != nullptr);
7c673cae
FG
9970
9971 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9972
9973 // Skip the getattr if any sessions are stale, as we don't want to
9974 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9975 // is unhealthy.
9976 if (!_any_stale_sessions()) {
9977 int r = _getattr(quota_root, 0, perms, true);
9978 if (r != 0) {
9979 // Ignore return value: error getting latest inode metadata is not a good
9980 // reason to break "df".
9981 lderr(cct) << "Error in getattr on quota root 0x"
9982 << std::hex << quota_root->ino << std::dec
9983 << " statfs result may be outdated" << dendl;
9984 }
9985 }
9986
9987 // Special case: if there is a size quota set on the Inode acting
9988 // as the root for this client mount, then report the quota status
9989 // as the filesystem statistics.
9990 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
9991 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
9992 // It is possible for a quota to be exceeded: arithmetic here must
9993 // handle case where used > total.
9994 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
9995
9996 stbuf->f_blocks = total;
9997 stbuf->f_bfree = free;
9998 stbuf->f_bavail = free;
9999 } else {
d2e6a577 10000 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10001 // multiple pools may be used without one filesystem namespace via
10002 // layouts, this is the most correct thing we can do.
10003 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10004 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10005 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10006 }
10007
10008 return rval;
10009}
10010
10011int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10012 struct flock *fl, uint64_t owner, bool removing)
10013{
11fdf7f2 10014 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10015 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10016 << " type " << fl->l_type << " owner " << owner
10017 << " " << fl->l_start << "~" << fl->l_len << dendl;
10018
10019 int lock_cmd;
10020 if (F_RDLCK == fl->l_type)
10021 lock_cmd = CEPH_LOCK_SHARED;
10022 else if (F_WRLCK == fl->l_type)
10023 lock_cmd = CEPH_LOCK_EXCL;
10024 else if (F_UNLCK == fl->l_type)
10025 lock_cmd = CEPH_LOCK_UNLOCK;
10026 else
10027 return -EIO;
10028
10029 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10030 sleep = 0;
10031
10032 /*
10033 * Set the most significant bit, so that MDS knows the 'owner'
10034 * is sufficient to identify the owner of lock. (old code uses
10035 * both 'owner' and 'pid')
10036 */
10037 owner |= (1ULL << 63);
10038
10039 MetaRequest *req = new MetaRequest(op);
10040 filepath path;
10041 in->make_nosnap_relative_path(path);
10042 req->set_filepath(path);
10043 req->set_inode(in);
10044
10045 req->head.args.filelock_change.rule = lock_type;
10046 req->head.args.filelock_change.type = lock_cmd;
10047 req->head.args.filelock_change.owner = owner;
10048 req->head.args.filelock_change.pid = fl->l_pid;
10049 req->head.args.filelock_change.start = fl->l_start;
10050 req->head.args.filelock_change.length = fl->l_len;
10051 req->head.args.filelock_change.wait = sleep;
10052
10053 int ret;
10054 bufferlist bl;
10055
10056 if (sleep && switch_interrupt_cb) {
10057 // enable interrupt
10058 switch_interrupt_cb(callback_handle, req->get());
10059 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10060 // disable interrupt
10061 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10062 if (ret == 0 && req->aborted()) {
10063 // effect of this lock request has been revoked by the 'lock intr' request
10064 ret = req->get_abort_code();
10065 }
7c673cae
FG
10066 put_request(req);
10067 } else {
10068 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10069 }
10070
10071 if (ret == 0) {
10072 if (op == CEPH_MDS_OP_GETFILELOCK) {
10073 ceph_filelock filelock;
11fdf7f2
TL
10074 auto p = bl.cbegin();
10075 decode(filelock, p);
7c673cae
FG
10076
10077 if (CEPH_LOCK_SHARED == filelock.type)
10078 fl->l_type = F_RDLCK;
10079 else if (CEPH_LOCK_EXCL == filelock.type)
10080 fl->l_type = F_WRLCK;
10081 else
10082 fl->l_type = F_UNLCK;
10083
10084 fl->l_whence = SEEK_SET;
10085 fl->l_start = filelock.start;
10086 fl->l_len = filelock.length;
10087 fl->l_pid = filelock.pid;
10088 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10089 ceph_lock_state_t *lock_state;
10090 if (lock_type == CEPH_LOCK_FCNTL) {
10091 if (!in->fcntl_locks)
11fdf7f2
TL
10092 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10093 lock_state = in->fcntl_locks.get();
7c673cae
FG
10094 } else if (lock_type == CEPH_LOCK_FLOCK) {
10095 if (!in->flock_locks)
11fdf7f2
TL
10096 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10097 lock_state = in->flock_locks.get();
7c673cae
FG
10098 } else {
10099 ceph_abort();
10100 return -EINVAL;
10101 }
10102 _update_lock_state(fl, owner, lock_state);
10103
10104 if (!removing) {
10105 if (lock_type == CEPH_LOCK_FCNTL) {
10106 if (!fh->fcntl_locks)
11fdf7f2
TL
10107 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10108 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10109 } else {
10110 if (!fh->flock_locks)
11fdf7f2
TL
10111 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10112 lock_state = fh->flock_locks.get();
7c673cae
FG
10113 }
10114 _update_lock_state(fl, owner, lock_state);
10115 }
10116 } else
10117 ceph_abort();
10118 }
10119 return ret;
10120}
10121
10122int Client::_interrupt_filelock(MetaRequest *req)
10123{
31f18b77
FG
10124 // Set abort code, but do not kick. The abort code prevents the request
10125 // from being re-sent.
10126 req->abort(-EINTR);
10127 if (req->mds < 0)
10128 return 0; // haven't sent the request
10129
7c673cae
FG
10130 Inode *in = req->inode();
10131
10132 int lock_type;
10133 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10134 lock_type = CEPH_LOCK_FLOCK_INTR;
10135 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10136 lock_type = CEPH_LOCK_FCNTL_INTR;
10137 else {
10138 ceph_abort();
10139 return -EINVAL;
10140 }
10141
10142 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10143 filepath path;
10144 in->make_nosnap_relative_path(path);
10145 intr_req->set_filepath(path);
10146 intr_req->set_inode(in);
10147 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10148 intr_req->head.args.filelock_change.rule = lock_type;
10149 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10150
10151 UserPerm perms(req->get_uid(), req->get_gid());
10152 return make_request(intr_req, perms, NULL, NULL, -1);
10153}
10154
10155void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10156{
10157 if (!in->fcntl_locks && !in->flock_locks)
10158 return;
10159
10160 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10161 encode(nr_fcntl_locks, bl);
7c673cae 10162 if (nr_fcntl_locks) {
11fdf7f2 10163 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10164 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10165 p != lock_state->held_locks.end();
10166 ++p)
11fdf7f2 10167 encode(p->second, bl);
7c673cae
FG
10168 }
10169
10170 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10171 encode(nr_flock_locks, bl);
7c673cae 10172 if (nr_flock_locks) {
11fdf7f2 10173 auto &lock_state = in->flock_locks;
7c673cae
FG
10174 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10175 p != lock_state->held_locks.end();
10176 ++p)
11fdf7f2 10177 encode(p->second, bl);
7c673cae
FG
10178 }
10179
11fdf7f2 10180 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10181 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10182}
10183
10184void Client::_release_filelocks(Fh *fh)
10185{
10186 if (!fh->fcntl_locks && !fh->flock_locks)
10187 return;
10188
10189 Inode *in = fh->inode.get();
11fdf7f2 10190 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10191
10192 list<pair<int, ceph_filelock> > to_release;
10193
10194 if (fh->fcntl_locks) {
11fdf7f2 10195 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10196 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10197 p != lock_state->held_locks.end();
10198 ++p)
10199 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10200 lock_state.reset();
7c673cae
FG
10201 }
10202 if (fh->flock_locks) {
11fdf7f2 10203 auto &lock_state = fh->flock_locks;
7c673cae
FG
10204 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10205 p != lock_state->held_locks.end();
10206 ++p)
10207 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10208 lock_state.reset();
7c673cae
FG
10209 }
10210
10211 if (to_release.empty())
10212 return;
10213
11fdf7f2
TL
10214 // mds has already released filelocks if session was closed.
10215 if (in->caps.empty())
10216 return;
10217
7c673cae
FG
10218 struct flock fl;
10219 memset(&fl, 0, sizeof(fl));
10220 fl.l_whence = SEEK_SET;
10221 fl.l_type = F_UNLCK;
10222
10223 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10224 p != to_release.end();
10225 ++p) {
10226 fl.l_start = p->second.start;
10227 fl.l_len = p->second.length;
10228 fl.l_pid = p->second.pid;
10229 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10230 p->second.owner, true);
10231 }
10232}
10233
10234void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10235 ceph_lock_state_t *lock_state)
10236{
10237 int lock_cmd;
10238 if (F_RDLCK == fl->l_type)
10239 lock_cmd = CEPH_LOCK_SHARED;
10240 else if (F_WRLCK == fl->l_type)
10241 lock_cmd = CEPH_LOCK_EXCL;
10242 else
10243 lock_cmd = CEPH_LOCK_UNLOCK;;
10244
10245 ceph_filelock filelock;
10246 filelock.start = fl->l_start;
10247 filelock.length = fl->l_len;
10248 filelock.client = 0;
10249 // see comment in _do_filelock()
10250 filelock.owner = owner | (1ULL << 63);
10251 filelock.pid = fl->l_pid;
10252 filelock.type = lock_cmd;
10253
10254 if (filelock.type == CEPH_LOCK_UNLOCK) {
10255 list<ceph_filelock> activated_locks;
10256 lock_state->remove_lock(filelock, activated_locks);
10257 } else {
10258 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10259 ceph_assert(r);
7c673cae
FG
10260 }
10261}
10262
10263int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10264{
10265 Inode *in = fh->inode.get();
10266 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10267 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10268 return ret;
10269}
10270
10271int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10272{
10273 Inode *in = fh->inode.get();
10274 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10275 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10276 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10277 return ret;
10278}
10279
10280int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10281{
10282 Inode *in = fh->inode.get();
10283 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10284
10285 int sleep = !(cmd & LOCK_NB);
10286 cmd &= ~LOCK_NB;
10287
10288 int type;
10289 switch (cmd) {
10290 case LOCK_SH:
10291 type = F_RDLCK;
10292 break;
10293 case LOCK_EX:
10294 type = F_WRLCK;
10295 break;
10296 case LOCK_UN:
10297 type = F_UNLCK;
10298 break;
10299 default:
10300 return -EINVAL;
10301 }
10302
10303 struct flock fl;
10304 memset(&fl, 0, sizeof(fl));
10305 fl.l_type = type;
10306 fl.l_whence = SEEK_SET;
10307
10308 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10309 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10310 return ret;
10311}
10312
10313int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10314{
10315 /* Since the only thing this does is wrap a call to statfs, and
10316 statfs takes a lock, it doesn't seem we have a need to split it
10317 out. */
10318 return statfs(0, stbuf, perms);
10319}
10320
10321void Client::ll_register_callbacks(struct client_callback_args *args)
10322{
10323 if (!args)
10324 return;
11fdf7f2
TL
10325 std::lock_guard l(client_lock);
10326 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
10327 << " invalidate_ino_cb " << args->ino_cb
10328 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
10329 << " switch_interrupt_cb " << args->switch_intr_cb
10330 << " remount_cb " << args->remount_cb
10331 << dendl;
10332 callback_handle = args->handle;
10333 if (args->ino_cb) {
10334 ino_invalidate_cb = args->ino_cb;
10335 async_ino_invalidator.start();
10336 }
10337 if (args->dentry_cb) {
10338 dentry_invalidate_cb = args->dentry_cb;
10339 async_dentry_invalidator.start();
10340 }
10341 if (args->switch_intr_cb) {
10342 switch_interrupt_cb = args->switch_intr_cb;
10343 interrupt_finisher.start();
10344 }
10345 if (args->remount_cb) {
10346 remount_cb = args->remount_cb;
10347 remount_finisher.start();
10348 }
7c673cae
FG
10349 umask_cb = args->umask_cb;
10350}
10351
10352int Client::test_dentry_handling(bool can_invalidate)
10353{
10354 int r = 0;
10355
10356 can_invalidate_dentries = can_invalidate;
10357
10358 if (can_invalidate_dentries) {
11fdf7f2 10359 ceph_assert(dentry_invalidate_cb);
7c673cae 10360 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10361 r = 0;
11fdf7f2
TL
10362 } else {
10363 ceph_assert(remount_cb);
7c673cae 10364 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10365 r = _do_remount(false);
b32b8144 10366 }
11fdf7f2 10367
7c673cae
FG
10368 return r;
10369}
10370
10371int Client::_sync_fs()
10372{
11fdf7f2 10373 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10374
10375 // flush file data
11fdf7f2
TL
10376 std::unique_ptr<C_SaferCond> cond = nullptr;
10377 if (cct->_conf->client_oc) {
10378 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10379 objectcacher->flush_all(cond.get());
10380 }
7c673cae
FG
10381
10382 // flush caps
10383 flush_caps_sync();
10384 ceph_tid_t flush_tid = last_flush_tid;
10385
10386 // wait for unsafe mds requests
10387 wait_unsafe_requests();
10388
10389 wait_sync_caps(flush_tid);
10390
11fdf7f2 10391 if (nullptr != cond) {
7c673cae 10392 client_lock.Unlock();
11fdf7f2
TL
10393 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10394 cond->wait();
10395 ldout(cct, 15) << __func__ << " flush finished" << dendl;
7c673cae
FG
10396 client_lock.Lock();
10397 }
10398
10399 return 0;
10400}
10401
10402int Client::sync_fs()
10403{
11fdf7f2 10404 std::lock_guard l(client_lock);
181888fb
FG
10405
10406 if (unmounting)
10407 return -ENOTCONN;
10408
7c673cae
FG
10409 return _sync_fs();
10410}
10411
10412int64_t Client::drop_caches()
10413{
11fdf7f2 10414 std::lock_guard l(client_lock);
7c673cae
FG
10415 return objectcacher->release_all();
10416}
10417
11fdf7f2
TL
10418int Client::_lazyio(Fh *fh, int enable)
10419{
10420 Inode *in = fh->inode.get();
10421 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10422
10423 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10424 return 0;
10425
10426 int orig_mode = fh->mode;
10427 if (enable) {
10428 fh->mode |= CEPH_FILE_MODE_LAZY;
10429 in->get_open_ref(fh->mode);
10430 in->put_open_ref(orig_mode);
10431 check_caps(in, CHECK_CAPS_NODELAY);
10432 } else {
10433 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10434 in->get_open_ref(fh->mode);
10435 in->put_open_ref(orig_mode);
10436 check_caps(in, 0);
10437 }
10438
10439 return 0;
10440}
10441
10442int Client::lazyio(int fd, int enable)
10443{
10444 std::lock_guard l(client_lock);
10445 Fh *f = get_filehandle(fd);
10446 if (!f)
10447 return -EBADF;
10448
10449 return _lazyio(f, enable);
10450}
10451
10452int Client::ll_lazyio(Fh *fh, int enable)
10453{
10454 std::lock_guard lock(client_lock);
10455 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10456 tout(cct) << __func__ << std::endl;
10457
10458 return _lazyio(fh, enable);
10459}
7c673cae
FG
10460
10461int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10462{
11fdf7f2 10463 std::lock_guard l(client_lock);
7c673cae
FG
10464 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10465 << ", " << offset << ", " << count << ")" << dendl;
10466
10467 Fh *f = get_filehandle(fd);
10468 if (!f)
10469 return -EBADF;
10470
10471 // for now
10472 _fsync(f, true);
10473
10474 return 0;
10475}
10476
10477int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10478{
11fdf7f2 10479 std::lock_guard l(client_lock);
7c673cae
FG
10480 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10481 << ", " << offset << ", " << count << ")" << dendl;
10482
10483 Fh *f = get_filehandle(fd);
10484 if (!f)
10485 return -EBADF;
10486 Inode *in = f->inode.get();
10487
10488 _fsync(f, true);
10489 if (_release(in))
10490 check_caps(in, 0);
10491 return 0;
10492}
10493
10494
10495// =============================
10496// snaps
10497
10498int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10499{
11fdf7f2 10500 std::lock_guard l(client_lock);
181888fb
FG
10501
10502 if (unmounting)
10503 return -ENOTCONN;
10504
7c673cae
FG
10505 filepath path(relpath);
10506 InodeRef in;
10507 int r = path_walk(path, &in, perm);
10508 if (r < 0)
10509 return r;
10510 if (cct->_conf->client_permissions) {
10511 r = may_create(in.get(), perm);
10512 if (r < 0)
10513 return r;
10514 }
10515 Inode *snapdir = open_snapdir(in.get());
10516 return _mkdir(snapdir, name, 0, perm);
10517}
181888fb 10518
7c673cae
FG
10519int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10520{
11fdf7f2 10521 std::lock_guard l(client_lock);
181888fb
FG
10522
10523 if (unmounting)
10524 return -ENOTCONN;
10525
7c673cae
FG
10526 filepath path(relpath);
10527 InodeRef in;
10528 int r = path_walk(path, &in, perms);
10529 if (r < 0)
10530 return r;
10531 if (cct->_conf->client_permissions) {
10532 r = may_delete(in.get(), NULL, perms);
10533 if (r < 0)
10534 return r;
10535 }
10536 Inode *snapdir = open_snapdir(in.get());
10537 return _rmdir(snapdir, name, perms);
10538}
10539
10540// =============================
10541// expose caps
10542
10543int Client::get_caps_issued(int fd) {
10544
11fdf7f2 10545 std::lock_guard lock(client_lock);
7c673cae 10546
181888fb
FG
10547 if (unmounting)
10548 return -ENOTCONN;
10549
7c673cae
FG
10550 Fh *f = get_filehandle(fd);
10551 if (!f)
10552 return -EBADF;
10553
10554 return f->inode->caps_issued();
10555}
10556
10557int Client::get_caps_issued(const char *path, const UserPerm& perms)
10558{
11fdf7f2 10559 std::lock_guard lock(client_lock);
181888fb
FG
10560
10561 if (unmounting)
10562 return -ENOTCONN;
10563
7c673cae
FG
10564 filepath p(path);
10565 InodeRef in;
10566 int r = path_walk(p, &in, perms, true);
10567 if (r < 0)
10568 return r;
10569 return in->caps_issued();
10570}
10571
10572// =========================================
10573// low level
10574
10575Inode *Client::open_snapdir(Inode *diri)
10576{
10577 Inode *in;
10578 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10579 if (!inode_map.count(vino)) {
10580 in = new Inode(this, vino, &diri->layout);
10581
10582 in->ino = diri->ino;
10583 in->snapid = CEPH_SNAPDIR;
10584 in->mode = diri->mode;
10585 in->uid = diri->uid;
10586 in->gid = diri->gid;
10587 in->mtime = diri->mtime;
10588 in->ctime = diri->ctime;
10589 in->btime = diri->btime;
10590 in->size = diri->size;
10591 in->change_attr = diri->change_attr;
10592
10593 in->dirfragtree.clear();
10594 in->snapdir_parent = diri;
10595 diri->flags |= I_SNAPDIR_OPEN;
10596 inode_map[vino] = in;
10597 if (use_faked_inos())
10598 _assign_faked_ino(in);
10599 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10600 } else {
10601 in = inode_map[vino];
10602 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10603 }
10604 return in;
10605}
10606
10607int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10608 Inode **out, const UserPerm& perms)
10609{
11fdf7f2 10610 std::lock_guard lock(client_lock);
31f18b77 10611 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
10612 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10613 tout(cct) << __func__ << std::endl;
7c673cae
FG
10614 tout(cct) << name << std::endl;
10615
181888fb
FG
10616 if (unmounting)
10617 return -ENOTCONN;
10618
7c673cae 10619 int r = 0;
11fdf7f2
TL
10620 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10621 "fuse_default_permissions");
10622 if (!fuse_default_permissions) {
10623 if (strcmp(name, ".") && strcmp(name, "..")) {
10624 r = may_lookup(parent, perms);
10625 if (r < 0)
10626 return r;
10627 }
7c673cae
FG
10628 }
10629
10630 string dname(name);
10631 InodeRef in;
10632
10633 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10634 if (r < 0) {
10635 attr->st_ino = 0;
10636 goto out;
10637 }
10638
11fdf7f2 10639 ceph_assert(in);
7c673cae
FG
10640 fill_stat(in, attr);
10641 _ll_get(in.get());
10642
10643 out:
11fdf7f2 10644 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10645 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10646 tout(cct) << attr->st_ino << std::endl;
10647 *out = in.get();
10648 return r;
10649}
10650
1adf2230
AA
10651int Client::ll_lookup_inode(
10652 struct inodeno_t ino,
10653 const UserPerm& perms,
10654 Inode **inode)
10655{
81eedcae 10656 ceph_assert(inode != NULL);
11fdf7f2 10657 std::lock_guard lock(client_lock);
1adf2230
AA
10658 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10659
81eedcae
TL
10660 if (unmounting)
10661 return -ENOTCONN;
10662
1adf2230
AA
10663 // Num1: get inode and *inode
10664 int r = _lookup_ino(ino, perms, inode);
81eedcae 10665 if (r)
1adf2230 10666 return r;
81eedcae 10667
11fdf7f2 10668 ceph_assert(*inode != NULL);
1adf2230 10669
81eedcae
TL
10670 if (!(*inode)->dentries.empty()) {
10671 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10672 return 0;
10673 }
10674
10675 if ((*inode)->is_root()) {
10676 ldout(cct, 8) << "ino is root, no parent" << dendl;
10677 return 0;
10678 }
10679
1adf2230
AA
10680 // Num2: Request the parent inode, so that we can look up the name
10681 Inode *parent;
10682 r = _lookup_parent(*inode, perms, &parent);
81eedcae 10683 if (r) {
1adf2230
AA
10684 _ll_forget(*inode, 1);
10685 return r;
1adf2230 10686 }
81eedcae 10687
11fdf7f2 10688 ceph_assert(parent != NULL);
1adf2230
AA
10689
10690 // Num3: Finally, get the name (dentry) of the requested inode
10691 r = _lookup_name(*inode, parent, perms);
10692 if (r) {
10693 // Unexpected error
10694 _ll_forget(parent, 1);
10695 _ll_forget(*inode, 1);
10696 return r;
10697 }
10698
10699 _ll_forget(parent, 1);
10700 return 0;
10701}
10702
7c673cae
FG
10703int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10704 struct ceph_statx *stx, unsigned want, unsigned flags,
10705 const UserPerm& perms)
10706{
11fdf7f2 10707 std::lock_guard lock(client_lock);
31f18b77 10708 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 10709 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
10710 tout(cct) << "ll_lookupx" << std::endl;
10711 tout(cct) << name << std::endl;
10712
181888fb
FG
10713 if (unmounting)
10714 return -ENOTCONN;
10715
7c673cae 10716 int r = 0;
11fdf7f2
TL
10717 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10718 "fuse_default_permissions");
10719 if (!fuse_default_permissions) {
7c673cae
FG
10720 r = may_lookup(parent, perms);
10721 if (r < 0)
10722 return r;
10723 }
10724
10725 string dname(name);
10726 InodeRef in;
10727
10728 unsigned mask = statx_to_mask(flags, want);
10729 r = _lookup(parent, dname, mask, &in, perms);
10730 if (r < 0) {
10731 stx->stx_ino = 0;
10732 stx->stx_mask = 0;
10733 } else {
11fdf7f2 10734 ceph_assert(in);
7c673cae
FG
10735 fill_statx(in, mask, stx);
10736 _ll_get(in.get());
10737 }
10738
11fdf7f2 10739 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10740 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10741 tout(cct) << stx->stx_ino << std::endl;
10742 *out = in.get();
10743 return r;
10744}
10745
10746int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10747 unsigned int want, unsigned int flags, const UserPerm& perms)
10748{
11fdf7f2 10749 std::lock_guard lock(client_lock);
181888fb
FG
10750
10751 if (unmounting)
10752 return -ENOTCONN;
10753
7c673cae
FG
10754 filepath fp(name, 0);
10755 InodeRef in;
10756 int rc;
10757 unsigned mask = statx_to_mask(flags, want);
10758
11fdf7f2
TL
10759 ldout(cct, 3) << __func__ << " " << name << dendl;
10760 tout(cct) << __func__ << std::endl;
7c673cae
FG
10761 tout(cct) << name << std::endl;
10762
10763 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10764 if (rc < 0) {
10765 /* zero out mask, just in case... */
10766 stx->stx_mask = 0;
10767 stx->stx_ino = 0;
10768 *out = NULL;
10769 return rc;
10770 } else {
11fdf7f2 10771 ceph_assert(in);
7c673cae
FG
10772 fill_statx(in, mask, stx);
10773 _ll_get(in.get());
10774 *out = in.get();
10775 return 0;
10776 }
10777}
10778
10779void Client::_ll_get(Inode *in)
10780{
10781 if (in->ll_ref == 0) {
10782 in->get();
11fdf7f2
TL
10783 if (in->is_dir() && !in->dentries.empty()) {
10784 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10785 in->get_first_parent()->get(); // pin dentry
10786 }
11fdf7f2
TL
10787 if (in->snapid != CEPH_NOSNAP)
10788 ll_snap_ref[in->snapid]++;
7c673cae
FG
10789 }
10790 in->ll_get();
11fdf7f2 10791 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
10792}
10793
10794int Client::_ll_put(Inode *in, int num)
10795{
10796 in->ll_put(num);
11fdf7f2 10797 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 10798 if (in->ll_ref == 0) {
11fdf7f2
TL
10799 if (in->is_dir() && !in->dentries.empty()) {
10800 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10801 in->get_first_parent()->put(); // unpin dentry
10802 }
11fdf7f2
TL
10803 if (in->snapid != CEPH_NOSNAP) {
10804 auto p = ll_snap_ref.find(in->snapid);
10805 ceph_assert(p != ll_snap_ref.end());
10806 ceph_assert(p->second > 0);
10807 if (--p->second == 0)
10808 ll_snap_ref.erase(p);
10809 }
7c673cae
FG
10810 put_inode(in);
10811 return 0;
10812 } else {
10813 return in->ll_ref;
10814 }
10815}
10816
10817void Client::_ll_drop_pins()
10818{
11fdf7f2 10819 ldout(cct, 10) << __func__ << dendl;
1adf2230 10820 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
10821 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10822 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10823 it != inode_map.end();
10824 it = next) {
10825 Inode *in = it->second;
10826 next = it;
10827 ++next;
1adf2230
AA
10828 if (in->ll_ref){
10829 to_be_put.insert(in);
7c673cae 10830 _ll_put(in, in->ll_ref);
1adf2230 10831 }
7c673cae
FG
10832 }
10833}
10834
1adf2230 10835bool Client::_ll_forget(Inode *in, int count)
7c673cae 10836{
11fdf7f2 10837 inodeno_t ino = in->ino;
7c673cae 10838
11fdf7f2
TL
10839 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10840 tout(cct) << __func__ << std::endl;
7c673cae
FG
10841 tout(cct) << ino.val << std::endl;
10842 tout(cct) << count << std::endl;
10843
181888fb
FG
10844 // Ignore forget if we're no longer mounted
10845 if (unmounting)
10846 return true;
10847
7c673cae
FG
10848 if (ino == 1) return true; // ignore forget on root.
10849
10850 bool last = false;
10851 if (in->ll_ref < count) {
10852 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10853 << ", which only has ll_ref=" << in->ll_ref << dendl;
10854 _ll_put(in, in->ll_ref);
10855 last = true;
10856 } else {
10857 if (_ll_put(in, count) == 0)
10858 last = true;
10859 }
10860
10861 return last;
10862}
10863
1adf2230
AA
10864bool Client::ll_forget(Inode *in, int count)
10865{
11fdf7f2 10866 std::lock_guard lock(client_lock);
1adf2230
AA
10867 return _ll_forget(in, count);
10868}
10869
7c673cae
FG
10870bool Client::ll_put(Inode *in)
10871{
10872 /* ll_forget already takes the lock */
10873 return ll_forget(in, 1);
10874}
10875
11fdf7f2
TL
10876int Client::ll_get_snap_ref(snapid_t snap)
10877{
10878 std::lock_guard lock(client_lock);
10879 auto p = ll_snap_ref.find(snap);
10880 if (p != ll_snap_ref.end())
10881 return p->second;
10882 return 0;
10883}
10884
7c673cae
FG
10885snapid_t Client::ll_get_snapid(Inode *in)
10886{
11fdf7f2 10887 std::lock_guard lock(client_lock);
7c673cae
FG
10888 return in->snapid;
10889}
10890
10891Inode *Client::ll_get_inode(ino_t ino)
10892{
11fdf7f2 10893 std::lock_guard lock(client_lock);
181888fb
FG
10894
10895 if (unmounting)
10896 return NULL;
10897
7c673cae
FG
10898 vinodeno_t vino = _map_faked_ino(ino);
10899 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10900 if (p == inode_map.end())
10901 return NULL;
10902 Inode *in = p->second;
10903 _ll_get(in);
10904 return in;
10905}
10906
10907Inode *Client::ll_get_inode(vinodeno_t vino)
10908{
11fdf7f2 10909 std::lock_guard lock(client_lock);
181888fb
FG
10910
10911 if (unmounting)
10912 return NULL;
10913
7c673cae
FG
10914 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10915 if (p == inode_map.end())
10916 return NULL;
10917 Inode *in = p->second;
10918 _ll_get(in);
10919 return in;
10920}
10921
10922int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10923{
10924 vinodeno_t vino = _get_vino(in);
10925
11fdf7f2
TL
10926 ldout(cct, 8) << __func__ << " " << vino << dendl;
10927 tout(cct) << __func__ << std::endl;
7c673cae
FG
10928 tout(cct) << vino.ino.val << std::endl;
10929
10930 if (vino.snapid < CEPH_NOSNAP)
10931 return 0;
10932 else
10933 return _getattr(in, caps, perms);
10934}
10935
10936int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10937{
11fdf7f2 10938 std::lock_guard lock(client_lock);
7c673cae 10939
181888fb
FG
10940 if (unmounting)
10941 return -ENOTCONN;
10942
7c673cae
FG
10943 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10944
10945 if (res == 0)
10946 fill_stat(in, attr);
11fdf7f2 10947 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10948 return res;
10949}
10950
10951int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10952 unsigned int flags, const UserPerm& perms)
10953{
11fdf7f2 10954 std::lock_guard lock(client_lock);
7c673cae 10955
181888fb
FG
10956 if (unmounting)
10957 return -ENOTCONN;
10958
7c673cae
FG
10959 int res = 0;
10960 unsigned mask = statx_to_mask(flags, want);
10961
94b18763 10962 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10963 res = _ll_getattr(in, mask, perms);
10964
10965 if (res == 0)
10966 fill_statx(in, mask, stx);
11fdf7f2 10967 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10968 return res;
10969}
10970
/**
 * Common setattr path for the ll_setattr*() entry points.
 *
 * Traces the requested fields, applies the client-side permission check
 * (unless FUSE is doing its own permission handling), strips the
 * MTIME_NOW/ATIME_NOW convenience flags, and forwards to __setattrx().
 * On success *inp references the updated inode.  Caller holds
 * client_lock.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // apply POSIX setattr permission rules client-side when FUSE isn't
  // doing the checking itself
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // drop the *_NOW convenience flags before forwarding (presumably
  // already folded into stx by the caller — TODO confirm)
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11001
11002int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11003 const UserPerm& perms)
11004{
11fdf7f2 11005 std::lock_guard lock(client_lock);
181888fb
FG
11006
11007 if (unmounting)
11008 return -ENOTCONN;
11009
7c673cae
FG
11010 InodeRef target(in);
11011 int res = _ll_setattrx(in, stx, mask, perms, &target);
11012 if (res == 0) {
11fdf7f2 11013 ceph_assert(in == target.get());
7c673cae
FG
11014 fill_statx(in, in->caps_issued(), stx);
11015 }
11016
11fdf7f2 11017 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11018 return res;
11019}
11020
11021int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11022 const UserPerm& perms)
11023{
11024 struct ceph_statx stx;
11025 stat_to_statx(attr, &stx);
11026
11fdf7f2 11027 std::lock_guard lock(client_lock);
181888fb
FG
11028
11029 if (unmounting)
11030 return -ENOTCONN;
11031
7c673cae
FG
11032 InodeRef target(in);
11033 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11034 if (res == 0) {
11fdf7f2 11035 ceph_assert(in == target.get());
7c673cae
FG
11036 fill_stat(in, attr);
11037 }
11038
11fdf7f2 11039 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11040 return res;
11041}
11042
11043
11044// ----------
11045// xattrs
11046
11047int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11048 const UserPerm& perms)
11049{
11fdf7f2 11050 std::lock_guard lock(client_lock);
181888fb
FG
11051
11052 if (unmounting)
11053 return -ENOTCONN;
11054
7c673cae
FG
11055 InodeRef in;
11056 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11057 if (r < 0)
11058 return r;
11059 return _getxattr(in, name, value, size, perms);
11060}
11061
11062int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11063 const UserPerm& perms)
11064{
11fdf7f2 11065 std::lock_guard lock(client_lock);
181888fb
FG
11066
11067 if (unmounting)
11068 return -ENOTCONN;
11069
7c673cae
FG
11070 InodeRef in;
11071 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11072 if (r < 0)
11073 return r;
11074 return _getxattr(in, name, value, size, perms);
11075}
11076
11077int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11078 const UserPerm& perms)
11079{
11fdf7f2 11080 std::lock_guard lock(client_lock);
181888fb
FG
11081
11082 if (unmounting)
11083 return -ENOTCONN;
11084
7c673cae
FG
11085 Fh *f = get_filehandle(fd);
11086 if (!f)
11087 return -EBADF;
11088 return _getxattr(f->inode, name, value, size, perms);
11089}
11090
11091int Client::listxattr(const char *path, char *list, size_t size,
11092 const UserPerm& perms)
11093{
11fdf7f2 11094 std::lock_guard lock(client_lock);
181888fb
FG
11095
11096 if (unmounting)
11097 return -ENOTCONN;
11098
7c673cae
FG
11099 InodeRef in;
11100 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11101 if (r < 0)
11102 return r;
11103 return Client::_listxattr(in.get(), list, size, perms);
11104}
11105
11106int Client::llistxattr(const char *path, char *list, size_t size,
11107 const UserPerm& perms)
11108{
11fdf7f2 11109 std::lock_guard lock(client_lock);
181888fb
FG
11110
11111 if (unmounting)
11112 return -ENOTCONN;
11113
7c673cae
FG
11114 InodeRef in;
11115 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11116 if (r < 0)
11117 return r;
11118 return Client::_listxattr(in.get(), list, size, perms);
11119}
11120
11121int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11122{
11fdf7f2 11123 std::lock_guard lock(client_lock);
181888fb
FG
11124
11125 if (unmounting)
11126 return -ENOTCONN;
11127
7c673cae
FG
11128 Fh *f = get_filehandle(fd);
11129 if (!f)
11130 return -EBADF;
11131 return Client::_listxattr(f->inode.get(), list, size, perms);
11132}
11133
11134int Client::removexattr(const char *path, const char *name,
11135 const UserPerm& perms)
11136{
11fdf7f2 11137 std::lock_guard lock(client_lock);
181888fb
FG
11138
11139 if (unmounting)
11140 return -ENOTCONN;
11141
7c673cae
FG
11142 InodeRef in;
11143 int r = Client::path_walk(path, &in, perms, true);
11144 if (r < 0)
11145 return r;
11146 return _removexattr(in, name, perms);
11147}
11148
11149int Client::lremovexattr(const char *path, const char *name,
11150 const UserPerm& perms)
11151{
11fdf7f2 11152 std::lock_guard lock(client_lock);
181888fb
FG
11153
11154 if (unmounting)
11155 return -ENOTCONN;
11156
7c673cae
FG
11157 InodeRef in;
11158 int r = Client::path_walk(path, &in, perms, false);
11159 if (r < 0)
11160 return r;
11161 return _removexattr(in, name, perms);
11162}
11163
11164int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11165{
11fdf7f2 11166 std::lock_guard lock(client_lock);
181888fb
FG
11167
11168 if (unmounting)
11169 return -ENOTCONN;
11170
7c673cae
FG
11171 Fh *f = get_filehandle(fd);
11172 if (!f)
11173 return -EBADF;
11174 return _removexattr(f->inode, name, perms);
11175}
11176
11177int Client::setxattr(const char *path, const char *name, const void *value,
11178 size_t size, int flags, const UserPerm& perms)
11179{
11180 _setxattr_maybe_wait_for_osdmap(name, value, size);
11181
11fdf7f2 11182 std::lock_guard lock(client_lock);
181888fb
FG
11183
11184 if (unmounting)
11185 return -ENOTCONN;
11186
7c673cae
FG
11187 InodeRef in;
11188 int r = Client::path_walk(path, &in, perms, true);
11189 if (r < 0)
11190 return r;
11191 return _setxattr(in, name, value, size, flags, perms);
11192}
11193
11194int Client::lsetxattr(const char *path, const char *name, const void *value,
11195 size_t size, int flags, const UserPerm& perms)
11196{
11197 _setxattr_maybe_wait_for_osdmap(name, value, size);
11198
11fdf7f2 11199 std::lock_guard lock(client_lock);
181888fb
FG
11200
11201 if (unmounting)
11202 return -ENOTCONN;
11203
7c673cae
FG
11204 InodeRef in;
11205 int r = Client::path_walk(path, &in, perms, false);
11206 if (r < 0)
11207 return r;
11208 return _setxattr(in, name, value, size, flags, perms);
11209}
11210
11211int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11212 int flags, const UserPerm& perms)
11213{
11214 _setxattr_maybe_wait_for_osdmap(name, value, size);
11215
11fdf7f2 11216 std::lock_guard lock(client_lock);
181888fb
FG
11217
11218 if (unmounting)
11219 return -ENOTCONN;
11220
7c673cae
FG
11221 Fh *f = get_filehandle(fd);
11222 if (!f)
11223 return -EBADF;
11224 return _setxattr(f->inode, name, value, size, flags, perms);
11225}
11226
11227int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11228 const UserPerm& perms)
11229{
11230 int r;
11231
11232 const VXattr *vxattr = _match_vxattr(in, name);
11233 if (vxattr) {
11234 r = -ENODATA;
11235
11236 // Do a force getattr to get the latest quota before returning
11237 // a value to userspace.
28e407b8
AA
11238 int flags = 0;
11239 if (vxattr->flags & VXATTR_RSTAT) {
11240 flags |= CEPH_STAT_RSTAT;
11241 }
11242 r = _getattr(in, flags, perms, true);
7c673cae
FG
11243 if (r != 0) {
11244 // Error from getattr!
11245 return r;
11246 }
11247
11248 // call pointer-to-member function
11249 char buf[256];
11250 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11251 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11252 } else {
11253 r = -ENODATA;
11254 }
11255
11256 if (size != 0) {
11257 if (r > (int)size) {
11258 r = -ERANGE;
11259 } else if (r > 0) {
11260 memcpy(value, buf, r);
11261 }
11262 }
11263 goto out;
11264 }
11265
11266 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11267 r = -EOPNOTSUPP;
11268 goto out;
11269 }
11270
11271 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11272 if (r == 0) {
11273 string n(name);
11274 r = -ENODATA;
11275 if (in->xattrs.count(n)) {
11276 r = in->xattrs[n].length();
11277 if (r > 0 && size != 0) {
11278 if (size >= (unsigned)r)
11279 memcpy(value, in->xattrs[n].c_str(), r);
11280 else
11281 r = -ERANGE;
11282 }
11283 }
11284 }
11285 out:
1adf2230 11286 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11287 return r;
11288}
11289
11290int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11291 const UserPerm& perms)
11292{
11293 if (cct->_conf->client_permissions) {
11294 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11295 if (r < 0)
11296 return r;
11297 }
11298 return _getxattr(in.get(), name, value, size, perms);
11299}
11300
11301int Client::ll_getxattr(Inode *in, const char *name, void *value,
11302 size_t size, const UserPerm& perms)
11303{
11fdf7f2 11304 std::lock_guard lock(client_lock);
7c673cae 11305
181888fb
FG
11306 if (unmounting)
11307 return -ENOTCONN;
11308
7c673cae
FG
11309 vinodeno_t vino = _get_vino(in);
11310
11fdf7f2
TL
11311 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11312 tout(cct) << __func__ << std::endl;
7c673cae
FG
11313 tout(cct) << vino.ino.val << std::endl;
11314 tout(cct) << name << std::endl;
11315
11fdf7f2
TL
11316 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11317 "fuse_default_permissions");
11318 if (!fuse_default_permissions) {
7c673cae
FG
11319 int r = xattr_permission(in, name, MAY_READ, perms);
11320 if (r < 0)
11321 return r;
11322 }
11323
11324 return _getxattr(in, name, value, size, perms);
11325}
11326
/**
 * Core listxattr: write the NUL-separated xattr names (regular xattrs
 * first, then the visible virtual "ceph.*" xattrs that exist for this
 * inode) into `name`.
 *
 * size == 0 is probe mode: only the total byte count is returned.
 * Returns the total length written/needed, or -ERANGE when the buffer
 * is too small, or a getattr error.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // refresh the xattr map only if we've never fetched it
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  // regular xattrs; r accumulates total bytes, `name`/`size` track the
  // write cursor and remaining space
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // name + trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }

  // then the non-hidden virtual xattrs that exist for this inode
  const VXattr *vxattr;
  for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
    if (vxattr->hidden)
      continue;
    // call pointer-to-member function
    if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
      continue;

    size_t this_len = vxattr->name.length() + 1;
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, vxattr->name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  // NOTE: `size` here is the space remaining, not the caller's value
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11379
11380int Client::ll_listxattr(Inode *in, char *names, size_t size,
11381 const UserPerm& perms)
11382{
11fdf7f2 11383 std::lock_guard lock(client_lock);
7c673cae 11384
181888fb
FG
11385 if (unmounting)
11386 return -ENOTCONN;
11387
7c673cae
FG
11388 vinodeno_t vino = _get_vino(in);
11389
11fdf7f2
TL
11390 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11391 tout(cct) << __func__ << std::endl;
7c673cae
FG
11392 tout(cct) << vino.ino.val << std::endl;
11393 tout(cct) << size << std::endl;
11394
11395 return _listxattr(in, names, size, perms);
11396}
11397
/**
 * Build and send a SETXATTR MetaRequest to the MDS.
 *
 * A null `value` turns the operation into a removal (CEPH_XATTR_REMOVE);
 * XATTR_CREATE / XATTR_REPLACE map onto the corresponding CEPH_XATTR_*
 * flags.  The value bytes travel as the request payload.
 */
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // ship the value as the request payload (empty append is fine when
  // value is null, guarded by the assert)
  bufferlist bl;
  assert (value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  // make_request consumes the request reference
  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11430
/**
 * Client-side setxattr policy layer.
 *
 * Rejects snapshot inodes (read-only), restricts the accepted name
 * spaces, translates POSIX ACL xattrs into mode changes / validated
 * payloads, refuses read-only vxattrs, then forwards to _do_setxattr().
 * Setting a ceph.quota.* vxattr additionally verifies afterwards that
 * the MDS created a snaprealm rooted at the quota inode.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only accepted when POSIX ACLs are enabled
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// an access ACL equivalent to the file mode is stored as a
	// plain chmod; the xattr payload is dropped (value = NULL)
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// default ACLs only make sense on directories
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // setting a quota requires a post-check of the snaprealm below
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11505
11506int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11507 size_t size, int flags, const UserPerm& perms)
11508{
11509 if (cct->_conf->client_permissions) {
11510 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11511 if (r < 0)
11512 return r;
11513 }
11514 return _setxattr(in.get(), name, value, size, flags, perms);
11515}
11516
11517int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11518{
11519 string tmp;
11520 if (name == "layout") {
11521 string::iterator begin = value.begin();
11522 string::iterator end = value.end();
11523 keys_and_values<string::iterator> p; // create instance of parser
11524 std::map<string, string> m; // map to receive results
11525 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11526 return -EINVAL;
11527 }
11528 if (begin != end)
11529 return -EINVAL;
11530 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11531 if (q->first == "pool") {
11532 tmp = q->second;
11533 break;
11534 }
11535 }
11536 } else if (name == "layout.pool") {
11537 tmp = value;
11538 }
11539
11540 if (tmp.length()) {
11541 int64_t pool;
11542 try {
11543 pool = boost::lexical_cast<unsigned>(tmp);
11544 if (!osdmap->have_pg_pool(pool))
11545 return -ENOENT;
11546 } catch (boost::bad_lexical_cast const&) {
11547 pool = osdmap->lookup_pg_pool_name(tmp);
11548 if (pool < 0) {
11549 return -ENOENT;
11550 }
11551 }
11552 }
11553
11554 return 0;
11555}
11556
11557void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11558{
11559 // For setting pool of layout, MetaRequest need osdmap epoch.
11560 // There is a race which create a new data pool but client and mds both don't have.
11561 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11562 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11563 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11564 string rest(strstr(name, "layout"));
11565 string v((const char*)value, size);
11566 int r = objecter->with_osdmap([&](const OSDMap& o) {
11567 return _setxattr_check_data_pool(rest, v, &o);
11568 });
11569
11570 if (r == -ENOENT) {
11571 C_SaferCond ctx;
11572 objecter->wait_for_latest_osdmap(&ctx);
11573 ctx.wait();
11574 }
11575 }
11576}
11577
11578int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11579 size_t size, int flags, const UserPerm& perms)
11580{
11581 _setxattr_maybe_wait_for_osdmap(name, value, size);
11582
11fdf7f2 11583 std::lock_guard lock(client_lock);
7c673cae 11584
181888fb
FG
11585 if (unmounting)
11586 return -ENOTCONN;
11587
7c673cae
FG
11588 vinodeno_t vino = _get_vino(in);
11589
11fdf7f2
TL
11590 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11591 tout(cct) << __func__ << std::endl;
7c673cae
FG
11592 tout(cct) << vino.ino.val << std::endl;
11593 tout(cct) << name << std::endl;
11594
11fdf7f2
TL
11595 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11596 "fuse_default_permissions");
11597 if (!fuse_default_permissions) {
7c673cae
FG
11598 int r = xattr_permission(in, name, MAY_WRITE, perms);
11599 if (r < 0)
11600 return r;
11601 }
11602 return _setxattr(in, name, value, size, flags, perms);
11603}
11604
/**
 * Core removexattr: validate the name space, refuse read-only vxattrs,
 * and send an RMXATTR MetaRequest to the MDS.  Snapshot inodes are
 * read-only (-EROFS).
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  // make_request consumes the request reference
  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11636
11637int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11638{
11639 if (cct->_conf->client_permissions) {
11640 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11641 if (r < 0)
11642 return r;
11643 }
11644 return _removexattr(in.get(), name, perms);
11645}
11646
11647int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11648{
11fdf7f2 11649 std::lock_guard lock(client_lock);
7c673cae 11650
181888fb
FG
11651 if (unmounting)
11652 return -ENOTCONN;
11653
7c673cae
FG
11654 vinodeno_t vino = _get_vino(in);
11655
11656 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11657 tout(cct) << "ll_removexattr" << std::endl;
11658 tout(cct) << vino.ino.val << std::endl;
11659 tout(cct) << name << std::endl;
11660
11fdf7f2
TL
11661 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11662 "fuse_default_permissions");
11663 if (!fuse_default_permissions) {
7c673cae
FG
11664 int r = xattr_permission(in, name, MAY_WRITE, perms);
11665 if (r < 0)
11666 return r;
11667 }
11668
11669 return _removexattr(in, name, perms);
11670}
11671
/// "ceph.quota" exists only when a quota is set on this inode AND the
/// inode is the root of its own snaprealm.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
	 in->snaprealm && in->snaprealm->ino == in->ino;
}
/// Render "max_bytes=... max_files=...".  All these callbacks have
/// snprintf semantics: they return the would-be length, truncating the
/// output at `size`.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11692
/// "ceph.*.layout" exists only when the inode's layout differs from
/// the default-constructed layout.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
11697size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11698{
11699 int r = snprintf(val, size,
11fdf7f2 11700 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11701 (unsigned long long)in->layout.stripe_unit,
11702 (unsigned long long)in->layout.stripe_count,
11703 (unsigned long long)in->layout.object_size);
11704 objecter->with_osdmap([&](const OSDMap& o) {
11705 if (o.have_pg_pool(in->layout.pool_id))
11706 r += snprintf(val + r, size - r, "%s",
11707 o.get_pool_name(in->layout.pool_id).c_str());
11708 else
11709 r += snprintf(val + r, size - r, "%" PRIu64,
11710 (uint64_t)in->layout.pool_id);
11711 });
11712 if (in->layout.pool_ns.length())
11713 r += snprintf(val + r, size - r, " pool_namespace=%s",
11714 in->layout.pool_ns.c_str());
11715 return r;
11716}
/// Individual layout-field callbacks (snprintf semantics: return the
/// would-be length, truncating at `size`).
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
/// Pool rendered as its name when present in the osdmap, else as its
/// numeric id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
		   in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
/// Directory statistics callbacks.  dirstat covers immediate children;
/// rstat is the recursive subtree accounting.  All have snprintf
/// semantics (return the would-be length, truncating at `size`).
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
/// Recursive ctime rendered as "<sec>.<nsec 9 digits>".
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
		  (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
/// "ceph.dir.pin" exists only once an export pin has been set;
/// dir_pin == -ENODATA is the "unset" sentinel.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
7c673cae 11786
81eedcae
TL
/// "ceph.snap.btime" exists only when the snapshot birth time is set.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

/// Snapshot birth time rendered as "<sec>.<nsec 9 digits>".
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
		  (long long unsigned)in->snap_btime.sec(),
		  (long unsigned)in->snap_btime.nsec());
}
11798
7c673cae
FG
/*
 * Helpers for building the virtual-xattr tables below.  The entries
 * use GNU-style designated initializers (field: value) matching the
 * VXattr struct: name, getxattr_cb, readonly, hidden, exists_cb, flags.
 */
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

/* Visible, read-only vxattr with no existence callback. */
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
  flags: 0, \
}
/* As XATTR_NAME_CEPH, but with explicit flags (e.g. VXATTR_RSTAT). */
#define XATTR_NAME_CEPH2(_type, _name, _flags) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
  flags: _flags, \
}
/* Hidden, writable per-field layout vxattr, present only when the
 * inode has a non-default layout. */
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
  flags: 0, \
}
/* Hidden, writable per-field quota vxattr, present only when a quota
 * is configured on the inode. */
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
  flags: 0, \
}
11838
/// Virtual xattrs exposed on directory inodes.  Hidden entries are
/// readable/settable by exact name but omitted from listxattr.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // recursive stats force a CEPH_STAT_RSTAT getattr before reading
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11889
/// Virtual xattrs exposed on regular-file inodes.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11914
11915const Client::VXattr *Client::_get_vxattrs(Inode *in)
11916{
11917 if (in->is_dir())
11918 return _dir_vxattrs;
11919 else if (in->is_file())
11920 return _file_vxattrs;
11921 return NULL;
11922}
11923
11924const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11925{
11926 if (strncmp(name, "ceph.", 5) == 0) {
11927 const VXattr *vxattr = _get_vxattrs(in);
11928 if (vxattr) {
11929 while (!vxattr->name.empty()) {
11930 if (vxattr->name == name)
11931 return vxattr;
11932 vxattr++;
11933 }
11934 }
11935 }
11936 return NULL;
11937}
11938
7c673cae
FG
/**
 * Low-level readlink: copy the symlink target of `in` into buf.
 * Returns the target length or a negative error.
 */
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // keep this inode's dentries warm in the LRU
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11960
/**
 * Create a filesystem node named `name` under `dir` via an MDS MKNOD
 * request.
 *
 * Fails with -ENAMETOOLONG, -EROFS (snapshot directory) or -EDQUOT
 * (file quota exceeded).  On success *inp references the new inode.
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // default ACLs on the parent may adjust the mode and add xattr payload
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached, so we still own the request
  put_request(req);
  return res;
}
12014
12015int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12016 dev_t rdev, struct stat *attr, Inode **out,
12017 const UserPerm& perms)
12018{
11fdf7f2 12019 std::lock_guard lock(client_lock);
7c673cae 12020
181888fb
FG
12021 if (unmounting)
12022 return -ENOTCONN;
12023
7c673cae
FG
12024 vinodeno_t vparent = _get_vino(parent);
12025
12026 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12027 tout(cct) << "ll_mknod" << std::endl;
12028 tout(cct) << vparent.ino.val << std::endl;
12029 tout(cct) << name << std::endl;
12030 tout(cct) << mode << std::endl;
12031 tout(cct) << rdev << std::endl;
12032
11fdf7f2
TL
12033 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12034 "fuse_default_permissions");
12035 if (!fuse_default_permissions) {
7c673cae
FG
12036 int r = may_create(parent, perms);
12037 if (r < 0)
12038 return r;
12039 }
12040
12041 InodeRef in;
12042 int r = _mknod(parent, name, mode, rdev, perms, &in);
12043 if (r == 0) {
12044 fill_stat(in, attr);
12045 _ll_get(in.get());
12046 }
12047 tout(cct) << attr->st_ino << std::endl;
12048 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12049 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12050 *out = in.get();
12051 return r;
12052}
12053
12054int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12055 dev_t rdev, Inode **out,
12056 struct ceph_statx *stx, unsigned want, unsigned flags,
12057 const UserPerm& perms)
12058{
12059 unsigned caps = statx_to_mask(flags, want);
11fdf7f2 12060 std::lock_guard lock(client_lock);
7c673cae 12061
181888fb
FG
12062 if (unmounting)
12063 return -ENOTCONN;
12064
7c673cae
FG
12065 vinodeno_t vparent = _get_vino(parent);
12066
12067 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12068 tout(cct) << "ll_mknodx" << std::endl;
12069 tout(cct) << vparent.ino.val << std::endl;
12070 tout(cct) << name << std::endl;
12071 tout(cct) << mode << std::endl;
12072 tout(cct) << rdev << std::endl;
12073
11fdf7f2
TL
12074 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12075 "fuse_default_permissions");
12076 if (!fuse_default_permissions) {
7c673cae
FG
12077 int r = may_create(parent, perms);
12078 if (r < 0)
12079 return r;
12080 }
12081
12082 InodeRef in;
12083 int r = _mknod(parent, name, mode, rdev, perms, &in);
12084 if (r == 0) {
12085 fill_statx(in, caps, stx);
12086 _ll_get(in.get());
12087 }
12088 tout(cct) << stx->stx_ino << std::endl;
12089 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12090 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12091 *out = in.get();
12092 return r;
12093}
12094
/**
 * Create and (optionally) open a regular file via CEPH_MDS_OP_CREATE.
 *
 * @param dir         parent directory inode
 * @param name        new entry name (rejected if longer than NAME_MAX)
 * @param flags       POSIX open flags; normalized to wire flags below
 * @param mode        requested mode; S_IFREG is OR'd in, ACLs may adjust it
 * @param inp         on success, set to the created/opened inode
 * @param fhp         if non-NULL, an open Fh is created and returned here
 * @param stripe_unit/stripe_count/object_size  file layout hints (0 = default)
 * @param data_pool   optional data pool name; resolved to a pool id
 * @param created     set by make_request() to whether the file was created
 * @param perms       credentials for permission/quota checks
 * @return 0 on success, negative errno on failure
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are read-only
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized (wire-format) flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the optional data pool name against the current OSDMap.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE; // pool id doesn't fit the 32-bit wire field. bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Default POSIX ACLs may modify 'mode' and supply initial xattrs.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    // Request already consumed by make_request(); just log and return.
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Errors before make_request(): we still own the request.
  put_request(req);
  return res;
}
12189
12190
/**
 * Create a directory (or a snapshot, when @dir is the .snap dir) via
 * CEPH_MDS_OP_MKDIR / CEPH_MDS_OP_MKSNAP.
 *
 * @param dir  parent directory (may be a CEPH_SNAPDIR for mksnap)
 * @param name new entry name (rejected if longer than NAME_MAX)
 * @param mode requested mode; S_IFDIR is OR'd in, ACLs may adjust it
 * @param perm credentials for the request and quota check
 * @param inp  on success, set to the newly created inode
 * @return 0 on success, negative errno on failure
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable only in the live tree or in the .snap dir (for mksnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Default POSIX ACLs may modify 'mode' and supply initial xattrs.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12246
12247int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12248 struct stat *attr, Inode **out, const UserPerm& perm)
12249{
11fdf7f2 12250 std::lock_guard lock(client_lock);
7c673cae 12251
181888fb
FG
12252 if (unmounting)
12253 return -ENOTCONN;
12254
7c673cae
FG
12255 vinodeno_t vparent = _get_vino(parent);
12256
12257 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12258 tout(cct) << "ll_mkdir" << std::endl;
12259 tout(cct) << vparent.ino.val << std::endl;
12260 tout(cct) << name << std::endl;
12261 tout(cct) << mode << std::endl;
12262
11fdf7f2
TL
12263 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12264 "fuse_default_permissions");
12265 if (!fuse_default_permissions) {
7c673cae
FG
12266 int r = may_create(parent, perm);
12267 if (r < 0)
12268 return r;
12269 }
12270
12271 InodeRef in;
12272 int r = _mkdir(parent, name, mode, perm, &in);
12273 if (r == 0) {
12274 fill_stat(in, attr);
12275 _ll_get(in.get());
12276 }
12277 tout(cct) << attr->st_ino << std::endl;
12278 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12279 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12280 *out = in.get();
12281 return r;
12282}
12283
12284int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12285 struct ceph_statx *stx, unsigned want, unsigned flags,
12286 const UserPerm& perms)
12287{
11fdf7f2 12288 std::lock_guard lock(client_lock);
7c673cae 12289
181888fb
FG
12290 if (unmounting)
12291 return -ENOTCONN;
12292
7c673cae
FG
12293 vinodeno_t vparent = _get_vino(parent);
12294
12295 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12296 tout(cct) << "ll_mkdirx" << std::endl;
12297 tout(cct) << vparent.ino.val << std::endl;
12298 tout(cct) << name << std::endl;
12299 tout(cct) << mode << std::endl;
12300
11fdf7f2
TL
12301 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12302 "fuse_default_permissions");
12303 if (!fuse_default_permissions) {
7c673cae
FG
12304 int r = may_create(parent, perms);
12305 if (r < 0)
12306 return r;
12307 }
12308
12309 InodeRef in;
12310 int r = _mkdir(parent, name, mode, perms, &in);
12311 if (r == 0) {
12312 fill_statx(in, statx_to_mask(flags, want), stx);
12313 _ll_get(in.get());
12314 } else {
12315 stx->stx_ino = 0;
12316 stx->stx_mask = 0;
12317 }
12318 tout(cct) << stx->stx_ino << std::endl;
12319 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12320 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12321 *out = in.get();
12322 return r;
12323}
12324
/**
 * Create a symbolic link via CEPH_MDS_OP_SYMLINK.
 *
 * @param dir    parent directory inode
 * @param name   new entry name (rejected if longer than NAME_MAX)
 * @param target link target string, sent as string2 of the request
 * @param perms  credentials for the request and quota check
 * @param inp    on success, set to the newly created inode
 * @return 0 on success, negative errno on failure
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // the link target travels as string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12370
12371int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12372 struct stat *attr, Inode **out, const UserPerm& perms)
12373{
11fdf7f2 12374 std::lock_guard lock(client_lock);
7c673cae 12375
181888fb
FG
12376 if (unmounting)
12377 return -ENOTCONN;
12378
7c673cae
FG
12379 vinodeno_t vparent = _get_vino(parent);
12380
12381 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12382 << dendl;
12383 tout(cct) << "ll_symlink" << std::endl;
12384 tout(cct) << vparent.ino.val << std::endl;
12385 tout(cct) << name << std::endl;
12386 tout(cct) << value << std::endl;
12387
11fdf7f2
TL
12388 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12389 "fuse_default_permissions");
12390 if (!fuse_default_permissions) {
7c673cae
FG
12391 int r = may_create(parent, perms);
12392 if (r < 0)
12393 return r;
12394 }
12395
12396 InodeRef in;
12397 int r = _symlink(parent, name, value, perms, &in);
12398 if (r == 0) {
12399 fill_stat(in, attr);
12400 _ll_get(in.get());
12401 }
12402 tout(cct) << attr->st_ino << std::endl;
12403 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12404 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12405 *out = in.get();
12406 return r;
12407}
12408
12409int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12410 Inode **out, struct ceph_statx *stx, unsigned want,
12411 unsigned flags, const UserPerm& perms)
12412{
11fdf7f2 12413 std::lock_guard lock(client_lock);
7c673cae 12414
181888fb
FG
12415 if (unmounting)
12416 return -ENOTCONN;
12417
7c673cae
FG
12418 vinodeno_t vparent = _get_vino(parent);
12419
12420 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12421 << dendl;
12422 tout(cct) << "ll_symlinkx" << std::endl;
12423 tout(cct) << vparent.ino.val << std::endl;
12424 tout(cct) << name << std::endl;
12425 tout(cct) << value << std::endl;
12426
11fdf7f2
TL
12427 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12428 "fuse_default_permissions");
12429 if (!fuse_default_permissions) {
7c673cae
FG
12430 int r = may_create(parent, perms);
12431 if (r < 0)
12432 return r;
12433 }
12434
12435 InodeRef in;
12436 int r = _symlink(parent, name, value, perms, &in);
12437 if (r == 0) {
12438 fill_statx(in, statx_to_mask(flags, want), stx);
12439 _ll_get(in.get());
12440 }
12441 tout(cct) << stx->stx_ino << std::endl;
12442 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12443 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12444 *out = in.get();
12445 return r;
12446}
12447
/**
 * Remove a (non-directory) entry via CEPH_MDS_OP_UNLINK.
 *
 * Looks up the target inode first so delegations can be broken and the
 * MDS can be told which link caps to drop.
 *
 * @param dir  parent directory inode
 * @param name entry to remove
 * @param perm credentials for the lookup and the request
 * @return 0 on success, negative errno on failure
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Declared up front because the gotos below would otherwise jump
  // over their initialization.
  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  // Revoke any outstanding delegations on the victim before unlinking.
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12497
12498int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12499{
11fdf7f2 12500 std::lock_guard lock(client_lock);
7c673cae 12501
181888fb
FG
12502 if (unmounting)
12503 return -ENOTCONN;
12504
7c673cae
FG
12505 vinodeno_t vino = _get_vino(in);
12506
12507 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12508 tout(cct) << "ll_unlink" << std::endl;
12509 tout(cct) << vino.ino.val << std::endl;
12510 tout(cct) << name << std::endl;
12511
11fdf7f2
TL
12512 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12513 "fuse_default_permissions");
12514 if (!fuse_default_permissions) {
7c673cae
FG
12515 int r = may_delete(in, name, perm);
12516 if (r < 0)
12517 return r;
12518 }
12519 return _unlink(in, name, perm);
12520}
12521
/**
 * Remove a directory (CEPH_MDS_OP_RMDIR) or a snapshot
 * (CEPH_MDS_OP_RMSNAP when @dir is the .snap directory).
 *
 * @param dir   parent directory (live dir or CEPH_SNAPDIR)
 * @param name  entry / snapshot name to remove
 * @param perms credentials for the lookup and the request
 * @return 0 on success, negative errno on failure
 */
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Only the live tree and the .snap dir (for rmsnap) are writable.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  // For rmdir the request owns the dentry; for rmsnap we pin it
  // manually so we can invalidate it below.
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  // NOTE(review): in the rmsnap case, if this lookup fails we goto fail
  // without a matching de->put() for the de->get() above — looks like a
  // dentry reference leak; confirm against later Ceph versions.
  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate it here.
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12574
12575int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12576{
11fdf7f2 12577 std::lock_guard lock(client_lock);
7c673cae 12578
181888fb
FG
12579 if (unmounting)
12580 return -ENOTCONN;
12581
7c673cae
FG
12582 vinodeno_t vino = _get_vino(in);
12583
12584 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12585 tout(cct) << "ll_rmdir" << std::endl;
12586 tout(cct) << vino.ino.val << std::endl;
12587 tout(cct) << name << std::endl;
12588
11fdf7f2
TL
12589 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12590 "fuse_default_permissions");
12591 if (!fuse_default_permissions) {
7c673cae
FG
12592 int r = may_delete(in, name, perms);
12593 if (r < 0)
12594 return r;
12595 }
12596
12597 return _rmdir(in, name, perms);
12598}
12599
/**
 * Rename an entry (CEPH_MDS_OP_RENAME), or rename a snapshot
 * (CEPH_MDS_OP_RENAMESNAP when both directories are the same .snap dir).
 *
 * Cross-snapshot and cross-quota-root renames are rejected with -EXDEV;
 * any other rename out of a snapshot is -EROFS.
 *
 * @param fromdir/fromname  source directory and entry
 * @param todir/toname      destination directory and entry
 * @param perm              credentials for lookups and the request
 * @return 0 on success, negative errno on failure
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Renaming inside one .snap dir renames the snapshot itself;
    // anything else under a snapshot is read-only.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // Disallow renames that would cross a quota root boundary.
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    // Revoke delegations on the inode being renamed.
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; only 0 and -ENOENT are OK.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	// Destination exists and will be clobbered: break its
	// delegations and drop its link caps too.
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12706
12707int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12708 const char *newname, const UserPerm& perm)
12709{
11fdf7f2 12710 std::lock_guard lock(client_lock);
7c673cae 12711
181888fb
FG
12712 if (unmounting)
12713 return -ENOTCONN;
12714
7c673cae
FG
12715 vinodeno_t vparent = _get_vino(parent);
12716 vinodeno_t vnewparent = _get_vino(newparent);
12717
12718 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12719 << vnewparent << " " << newname << dendl;
12720 tout(cct) << "ll_rename" << std::endl;
12721 tout(cct) << vparent.ino.val << std::endl;
12722 tout(cct) << name << std::endl;
12723 tout(cct) << vnewparent.ino.val << std::endl;
12724 tout(cct) << newname << std::endl;
12725
11fdf7f2
TL
12726 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12727 "fuse_default_permissions");
12728 if (!fuse_default_permissions) {
7c673cae
FG
12729 int r = may_delete(parent, name, perm);
12730 if (r < 0)
12731 return r;
12732 r = may_delete(newparent, newname, perm);
12733 if (r < 0 && r != -ENOENT)
12734 return r;
12735 }
12736
12737 return _rename(parent, name, newparent, newname, perm);
12738}
12739
/**
 * Create a hard link to @in named @newname in @dir via CEPH_MDS_OP_LINK.
 *
 * @param in      existing inode to link to
 * @param dir     directory that receives the new name
 * @param newname new entry name (rejected if longer than NAME_MAX)
 * @param perm    credentials for the request and quota check
 * @param inp     on success, set to the (linked) inode
 * @return 0 on success, negative errno on failure
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the source inode and the target directory must be live.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Revoke outstanding delegations on the inode being linked.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request; on early error drop it here.
  put_request(req);
  return res;
}
12784
12785int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12786 const UserPerm& perm)
12787{
11fdf7f2 12788 std::lock_guard lock(client_lock);
7c673cae 12789
181888fb
FG
12790 if (unmounting)
12791 return -ENOTCONN;
12792
7c673cae
FG
12793 vinodeno_t vino = _get_vino(in);
12794 vinodeno_t vnewparent = _get_vino(newparent);
12795
31f18b77 12796 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12797 newname << dendl;
12798 tout(cct) << "ll_link" << std::endl;
12799 tout(cct) << vino.ino.val << std::endl;
12800 tout(cct) << vnewparent << std::endl;
12801 tout(cct) << newname << std::endl;
12802
7c673cae
FG
12803 InodeRef target;
12804
11fdf7f2
TL
12805 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12806 "fuse_default_permissions");
12807 if (!fuse_default_permissions) {
7c673cae
FG
12808 if (S_ISDIR(in->mode))
12809 return -EPERM;
12810
11fdf7f2 12811 int r = may_hardlink(in, perm);
7c673cae
FG
12812 if (r < 0)
12813 return r;
12814
12815 r = may_create(newparent, perm);
12816 if (r < 0)
12817 return r;
12818 }
12819
12820 return _link(in, newparent, newname, perm, &target);
12821}
12822
12823int Client::ll_num_osds(void)
12824{
11fdf7f2 12825 std::lock_guard lock(client_lock);
7c673cae
FG
12826 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12827}
12828
12829int Client::ll_osdaddr(int osd, uint32_t *addr)
12830{
11fdf7f2 12831 std::lock_guard lock(client_lock);
181888fb 12832
7c673cae
FG
12833 entity_addr_t g;
12834 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12835 if (!o.exists(osd))
12836 return false;
11fdf7f2 12837 g = o.get_addrs(osd).front();
7c673cae
FG
12838 return true;
12839 });
12840 if (!exists)
12841 return -1;
12842 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12843 *addr = ntohl(nb_addr);
12844 return 0;
12845}
181888fb 12846
7c673cae
FG
/// Return the stripe unit (bytes) of @in's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->layout.stripe_unit;
}
12852
/// Return the snapshot sequence number of @in's snap realm.
/// NOTE(review): dereferences in->snaprealm unconditionally — assumes the
/// caller only passes inodes attached to a realm; confirm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
12858
/// Copy @in's file layout into @layout. Always succeeds (returns 0).
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  std::lock_guard lock(client_lock);
  *layout = in->layout;
  return 0;
}
12865
/// Convenience overload: fetch the layout of the inode behind @fh.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12870
12871/* Currently we cannot take advantage of redundancy in reads, since we
12872 would have to go through all possible placement groups (a
12873 potentially quite large number determined by a hash), and use CRUSH
12874 to calculate the appropriate set of OSDs for each placement group,
12875 then index into that. An array with one entry per OSD is much more
12876 tractable and works for demonstration purposes. */
12877
12878int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12879 file_layout_t* layout)
12880{
11fdf7f2 12881 std::lock_guard lock(client_lock);
181888fb 12882
28e407b8 12883 inodeno_t ino = in->ino;
7c673cae
FG
12884 uint32_t object_size = layout->object_size;
12885 uint32_t su = layout->stripe_unit;
12886 uint32_t stripe_count = layout->stripe_count;
12887 uint64_t stripes_per_object = object_size / su;
11fdf7f2 12888 uint64_t stripeno = 0, stripepos = 0;
7c673cae 12889
11fdf7f2
TL
12890 if(stripe_count) {
12891 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12892 stripepos = blockno % stripe_count; // which object in the object set (X)
12893 }
7c673cae
FG
12894 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12895 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12896
12897 object_t oid = file_object_t(ino, objectno);
12898 return objecter->with_osdmap([&](const OSDMap& o) {
12899 ceph_object_layout olayout =
12900 o.file_to_object_layout(oid, *layout);
12901 pg_t pg = (pg_t)olayout.ol_pgid;
12902 vector<int> osds;
12903 int primary;
12904 o.pg_to_acting_osds(pg, &osds, &primary);
12905 return primary;
12906 });
12907}
12908
12909/* Return the offset of the block, internal to the object */
12910
/// Return the byte offset of file block @blockno within its RADOS object,
/// derived from the inode's layout (stripe unit and object size).
/// NOTE(review): divides by stripes_per_object without a guard — a layout
/// with stripe_unit == 0 or object_size < stripe_unit would divide by
/// zero; confirm callers only pass validated layouts.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  // Offset inside the object: which stripe of the object, times the unit.
  return (blockno % stripes_per_object) * su;
}
12921
12922int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12923 const UserPerm& perms)
12924{
11fdf7f2 12925 std::lock_guard lock(client_lock);
7c673cae 12926
181888fb
FG
12927 if (unmounting)
12928 return -ENOTCONN;
12929
7c673cae
FG
12930 vinodeno_t vino = _get_vino(in);
12931
12932 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12933 tout(cct) << "ll_opendir" << std::endl;
12934 tout(cct) << vino.ino.val << std::endl;
12935
11fdf7f2
TL
12936 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12937 "fuse_default_permissions");
12938 if (!fuse_default_permissions) {
7c673cae
FG
12939 int r = may_open(in, flags, perms);
12940 if (r < 0)
12941 return r;
12942 }
12943
12944 int r = _opendir(in, dirpp, perms);
12945 tout(cct) << (unsigned long)*dirpp << std::endl;
12946
12947 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12948 << dendl;
12949 return r;
12950}
12951
/// Release a directory handle obtained from ll_opendir().
/// Always returns 0 once mounted; -ENOTCONN while unmounting.
int Client::ll_releasedir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  // Logging happens before the unmounting check so the trace records
  // the call even when it is rejected.
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  _closedir(dirp);
  return 0;
}
12965
/// Flush metadata for the directory behind @dirp to the MDS.
/// Delegates to _fsync() with syncdataonly=false.
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(dirp->inode.get(), false);
}
12978
/**
 * Low-level open entry point for an existing inode.
 *
 * O_CREAT is not supported here (asserted); creation goes through
 * _ll_create()/ll_create* instead. On success (and when @fhp is
 * non-NULL) the new Fh is tracked in ll_unclosed_fh_set.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  // Do our own permission check unless FUSE already enforced one.
  // On failure we still fall through to 'out' so the result is logged.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    // Remember the handle so a final unmount can clean it up.
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13016
// Shared implementation behind ll_create/ll_createx: look the name up under
// 'parent', create it if absent and O_CREAT is set, then open it.  On
// success *in holds a ref to the inode and *fhp (if opened) the handle.
// Caller must hold client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name is an error per POSIX.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Check create permission ourselves unless FUSE already did.
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // The file already existed: re-check open permission, then open it
    // if _create/_lookup did not already hand us a handle.
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track every handle handed out, even on partially-failed paths.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13102
13103int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13104 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13105 const UserPerm& perms)
13106{
11fdf7f2 13107 std::lock_guard lock(client_lock);
7c673cae
FG
13108 InodeRef in;
13109
181888fb
FG
13110 if (unmounting)
13111 return -ENOTCONN;
13112
7c673cae
FG
13113 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13114 fhp, perms);
13115 if (r >= 0) {
11fdf7f2 13116 ceph_assert(in);
7c673cae
FG
13117
13118 // passing an Inode in outp requires an additional ref
13119 if (outp) {
13120 _ll_get(in.get());
13121 *outp = in.get();
13122 }
13123 fill_stat(in, attr);
13124 } else {
13125 attr->st_ino = 0;
13126 }
13127
13128 return r;
13129}
13130
13131int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13132 int oflags, Inode **outp, Fh **fhp,
13133 struct ceph_statx *stx, unsigned want, unsigned lflags,
13134 const UserPerm& perms)
13135{
13136 unsigned caps = statx_to_mask(lflags, want);
11fdf7f2 13137 std::lock_guard lock(client_lock);
7c673cae
FG
13138 InodeRef in;
13139
181888fb
FG
13140 if (unmounting)
13141 return -ENOTCONN;
7c673cae
FG
13142
13143 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13144 if (r >= 0) {
11fdf7f2 13145 ceph_assert(in);
7c673cae
FG
13146
13147 // passing an Inode in outp requires an additional ref
13148 if (outp) {
13149 _ll_get(in.get());
13150 *outp = in.get();
13151 }
13152 fill_statx(in, caps, stx);
13153 } else {
13154 stx->stx_ino = 0;
13155 stx->stx_mask = 0;
13156 }
13157
13158 return r;
13159}
13160
13161loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13162{
11fdf7f2 13163 std::lock_guard lock(client_lock);
7c673cae
FG
13164 tout(cct) << "ll_lseek" << std::endl;
13165 tout(cct) << offset << std::endl;
13166 tout(cct) << whence << std::endl;
13167
181888fb
FG
13168 if (unmounting)
13169 return -ENOTCONN;
13170
7c673cae
FG
13171 return _lseek(fh, offset, whence);
13172}
13173
// Read up to 'len' bytes at 'off' from an open handle into 'bl'.
// Returns the number of bytes read, or a negative errno.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes read larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  return _read(fh, off, len, bl);
}
13190
// Read a raw RADOS object (one block of a file) directly from the OSDs,
// bypassing the page cache.  'buf' must have room for 'length' bytes.
// Returns the number of bytes read, or a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock while blocking on the OSD round-trip so other
  // client operations can make progress.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13227
13228/* It appears that the OSD doesn't return success unless the entire
13229 buffer was written, return the write length on success. */
13230
13231int Client::ll_write_block(Inode *in, uint64_t blockid,
13232 char* buf, uint64_t offset,
13233 uint64_t length, file_layout_t* layout,
13234 uint64_t snapseq, uint32_t sync)
13235{
7c673cae 13236 vinodeno_t vino = ll_get_vino(in);
7c673cae 13237 int r = 0;
11fdf7f2
TL
13238 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13239
7c673cae
FG
13240 if (length == 0) {
13241 return -EINVAL;
13242 }
13243 if (true || sync) {
13244 /* if write is stable, the epilogue is waiting on
13245 * flock */
11fdf7f2 13246 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
13247 }
13248 object_t oid = file_object_t(vino.ino, blockid);
13249 SnapContext fakesnap;
11fdf7f2
TL
13250 ceph::bufferlist bl;
13251 if (length > 0) {
13252 bl.push_back(buffer::copy(buf, length));
13253 }
7c673cae
FG
13254
13255 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13256 << dendl;
13257
13258 fakesnap.seq = snapseq;
13259
13260 /* lock just in time */
13261 client_lock.Lock();
181888fb
FG
13262 if (unmounting) {
13263 client_lock.Unlock();
181888fb
FG
13264 return -ENOTCONN;
13265 }
7c673cae
FG
13266
13267 objecter->write(oid,
13268 object_locator_t(layout->pool_id),
13269 offset,
13270 length,
13271 fakesnap,
13272 bl,
13273 ceph::real_clock::now(),
13274 0,
11fdf7f2 13275 onsafe.get());
7c673cae
FG
13276
13277 client_lock.Unlock();
11fdf7f2
TL
13278 if (nullptr != onsafe) {
13279 r = onsafe->wait();
7c673cae
FG
13280 }
13281
13282 if (r < 0) {
13283 return r;
13284 } else {
13285 return length;
13286 }
13287}
13288
// Commit previously written blocks in [offset, offset+length).  The
// barrier-based implementation is disabled (commented out below); this
// currently always reports success.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13314
13315int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13316{
11fdf7f2 13317 std::lock_guard lock(client_lock);
7c673cae
FG
13318 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13319 "~" << len << dendl;
13320 tout(cct) << "ll_write" << std::endl;
13321 tout(cct) << (unsigned long)fh << std::endl;
13322 tout(cct) << off << std::endl;
13323 tout(cct) << len << std::endl;
13324
181888fb
FG
13325 if (unmounting)
13326 return -ENOTCONN;
13327
11fdf7f2
TL
13328 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13329 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13330 int r = _write(fh, off, len, data, NULL, 0);
13331 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13332 << dendl;
13333 return r;
13334}
13335
11fdf7f2
TL
13336int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13337{
13338 std::lock_guard lock(client_lock);
13339 if (unmounting)
13340 return -ENOTCONN;
13341 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13342}
13343
13344int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13345{
13346 std::lock_guard lock(client_lock);
13347 if (unmounting)
13348 return -ENOTCONN;
13349 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13350}
13351
7c673cae
FG
13352int Client::ll_flush(Fh *fh)
13353{
11fdf7f2 13354 std::lock_guard lock(client_lock);
7c673cae
FG
13355 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13356 tout(cct) << "ll_flush" << std::endl;
13357 tout(cct) << (unsigned long)fh << std::endl;
13358
181888fb
FG
13359 if (unmounting)
13360 return -ENOTCONN;
13361
7c673cae
FG
13362 return _flush(fh);
13363}
13364
// fsync an open handle; if syncdataonly, metadata is not forced out.
// Returns 0 or a negative errno.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH so it is
    // reported to the caller exactly once.
    fh->take_async_err();
  }
  return r;
}
13382
28e407b8
AA
13383int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13384{
11fdf7f2 13385 std::lock_guard lock(client_lock);
28e407b8
AA
13386 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13387 tout(cct) << "ll_sync_inode" << std::endl;
13388 tout(cct) << (unsigned long)in << std::endl;
13389
13390 if (unmounting)
13391 return -ENOTCONN;
13392
13393 return _fsync(in, syncdataonly);
13394}
13395
7c673cae
FG
13396#ifdef FALLOC_FL_PUNCH_HOLE
13397
// fallocate(2)-style space manipulation.  Only FALLOC_FL_KEEP_SIZE and
// FALLOC_FL_PUNCH_HOLE (which must be combined with KEEP_SIZE) are
// supported.  Caller must hold client_lock; it is dropped around the
// blocking OSD waits below.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating writes are refused on a full pool; punching holes frees
  // space so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota check only applies when the file may actually grow.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by
      // rebuilding the inline buffer with a zeroed middle section.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise push any inline data out to RADOS first, then zero
      // the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop the lock while waiting for the zero op to commit.
      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate: extend the file size if the range goes past EOF.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // Wait for the inline-data migration started above to complete.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13515#else
13516
// Fallback when the platform lacks FALLOC_FL_PUNCH_HOLE: fallocate is
// simply unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13521
13522#endif
13523
13524
11fdf7f2 13525int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 13526{
11fdf7f2
TL
13527 std::lock_guard lock(client_lock);
13528 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13529 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
7c673cae
FG
13530 tout(cct) << (unsigned long)fh << std::endl;
13531
181888fb
FG
13532 if (unmounting)
13533 return -ENOTCONN;
13534
7c673cae
FG
13535 return _fallocate(fh, mode, offset, length);
13536}
13537
13538int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13539{
11fdf7f2
TL
13540 std::lock_guard lock(client_lock);
13541 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13542
181888fb
FG
13543 if (unmounting)
13544 return -ENOTCONN;
13545
7c673cae
FG
13546 Fh *fh = get_filehandle(fd);
13547 if (!fh)
13548 return -EBADF;
13549#if defined(__linux__) && defined(O_PATH)
13550 if (fh->flags & O_PATH)
13551 return -EBADF;
13552#endif
13553 return _fallocate(fh, mode, offset, length);
13554}
13555
13556int Client::ll_release(Fh *fh)
13557{
11fdf7f2 13558 std::lock_guard lock(client_lock);
91327a77
AA
13559
13560 if (unmounting)
13561 return -ENOTCONN;
13562
11fdf7f2 13563 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 13564 dendl;
11fdf7f2 13565 tout(cct) << __func__ << " (fh)" << std::endl;
7c673cae
FG
13566 tout(cct) << (unsigned long)fh << std::endl;
13567
13568 if (ll_unclosed_fh_set.count(fh))
13569 ll_unclosed_fh_set.erase(fh);
13570 return _release_fh(fh);
13571}
13572
13573int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13574{
11fdf7f2 13575 std::lock_guard lock(client_lock);
7c673cae
FG
13576
13577 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13578 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13579
181888fb
FG
13580 if (unmounting)
13581 return -ENOTCONN;
13582
7c673cae
FG
13583 return _getlk(fh, fl, owner);
13584}
13585
13586int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13587{
11fdf7f2 13588 std::lock_guard lock(client_lock);
7c673cae 13589
11fdf7f2
TL
13590 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13591 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13592
181888fb
FG
13593 if (unmounting)
13594 return -ENOTCONN;
13595
7c673cae
FG
13596 return _setlk(fh, fl, owner, sleep);
13597}
13598
13599int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13600{
11fdf7f2 13601 std::lock_guard lock(client_lock);
7c673cae 13602
11fdf7f2
TL
13603 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13604 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13605
181888fb
FG
13606 if (unmounting)
13607 return -ENOTCONN;
13608
7c673cae
FG
13609 return _flock(fh, cmd, owner);
13610}
13611
b32b8144
FG
// Set the delegation-return timeout.  Returns -EINVAL if the requested
// timeout is not strictly shorter than the MDS session autoclose time.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13626
// Acquire or drop a delegation on the inode behind 'fh'.
// cmd CEPH_DELEGATION_NONE drops it; any other value is forwarded to
// Inode::set_deleg with the callback to invoke on recall.
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg allocates; surface allocation failure as -ENOMEM
      // rather than letting the exception escape.
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13653
7c673cae
FG
// Finisher context that interrupts an in-flight SETFILELOCK request.
// Holds a ref on the request for its own lifetime (taken in the ctor,
// dropped via put_request in finish()).
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    // Runs on the finisher thread, so take the client lock ourselves.
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13669
13670void Client::ll_interrupt(void *d)
13671{
13672 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
13673 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13674 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
13675 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13676}
13677
13678// =========================================
13679// layout
13680
13681// expose file layouts
13682
13683int Client::describe_layout(const char *relpath, file_layout_t *lp,
13684 const UserPerm& perms)
13685{
11fdf7f2 13686 std::lock_guard lock(client_lock);
7c673cae 13687
181888fb
FG
13688 if (unmounting)
13689 return -ENOTCONN;
13690
7c673cae
FG
13691 filepath path(relpath);
13692 InodeRef in;
13693 int r = path_walk(path, &in, perms);
13694 if (r < 0)
13695 return r;
13696
13697 *lp = in->layout;
13698
11fdf7f2 13699 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
13700 return 0;
13701}
13702
13703int Client::fdescribe_layout(int fd, file_layout_t *lp)
13704{
11fdf7f2 13705 std::lock_guard lock(client_lock);
7c673cae 13706
181888fb
FG
13707 if (unmounting)
13708 return -ENOTCONN;
13709
7c673cae
FG
13710 Fh *f = get_filehandle(fd);
13711 if (!f)
13712 return -EBADF;
13713 Inode *in = f->inode.get();
13714
13715 *lp = in->layout;
13716
11fdf7f2 13717 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
13718 return 0;
13719}
13720
d2e6a577
FG
// Return the id of the filesystem's default data pool.
// NOTE: returns -ENOTCONN while unmounting, so callers must treat
// negative values as errors, not pool ids.
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
13731
13732// expose osdmap
13733
13734int64_t Client::get_pool_id(const char *pool_name)
13735{
11fdf7f2 13736 std::lock_guard lock(client_lock);
181888fb
FG
13737
13738 if (unmounting)
13739 return -ENOTCONN;
13740
7c673cae
FG
13741 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13742 pool_name);
13743}
13744
13745string Client::get_pool_name(int64_t pool)
13746{
11fdf7f2 13747 std::lock_guard lock(client_lock);
181888fb
FG
13748
13749 if (unmounting)
13750 return string();
13751
7c673cae
FG
13752 return objecter->with_osdmap([pool](const OSDMap& o) {
13753 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13754 });
13755}
13756
13757int Client::get_pool_replication(int64_t pool)
13758{
11fdf7f2 13759 std::lock_guard lock(client_lock);
181888fb
FG
13760
13761 if (unmounting)
13762 return -ENOTCONN;
13763
7c673cae
FG
13764 return objecter->with_osdmap([pool](const OSDMap& o) {
13765 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13766 });
13767}
13768
// Find the acting OSDs for the extent of the file at offset 'off', and
// optionally (*len) the number of bytes remaining in that stripe unit.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range at 'off' to exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13814
13815int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13816{
11fdf7f2 13817 std::lock_guard lock(client_lock);
181888fb
FG
13818
13819 if (unmounting)
13820 return -ENOTCONN;
13821
7c673cae
FG
13822 if (id < 0)
13823 return -EINVAL;
13824 return objecter->with_osdmap([&](const OSDMap& o) {
13825 return o.crush->get_full_location_ordered(id, path);
13826 });
13827}
13828
// Collect the addresses of the acting OSDs serving the object that holds
// file offset 'offset' of the file behind 'fd'.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (1-byte range maps to exactly one extent)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
13862
13863int Client::get_osd_addr(int osd, entity_addr_t& addr)
13864{
11fdf7f2 13865 std::lock_guard lock(client_lock);
181888fb
FG
13866
13867 if (unmounting)
13868 return -ENOTCONN;
13869
7c673cae
FG
13870 return objecter->with_osdmap([&](const OSDMap& o) {
13871 if (!o.exists(osd))
13872 return -ENOENT;
13873
11fdf7f2 13874 addr = o.get_addrs(osd).front();
7c673cae
FG
13875 return 0;
13876 });
13877}
13878
13879int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13880 loff_t length, loff_t offset)
13881{
11fdf7f2 13882 std::lock_guard lock(client_lock);
7c673cae 13883
181888fb
FG
13884 if (unmounting)
13885 return -ENOTCONN;
13886
7c673cae
FG
13887 Fh *f = get_filehandle(fd);
13888 if (!f)
13889 return -EBADF;
13890 Inode *in = f->inode.get();
13891
13892 // map to a list of extents
13893 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13894
11fdf7f2 13895 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
13896 return 0;
13897}
13898
13899
b32b8144 13900/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Re-scan only when the OSD map epoch has changed; otherwise reuse
  // the cached answer in local_osd.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13916
13917
13918
13919
13920
13921
13922// ===============================
13923
// Messenger callback: a connection was established.  Log only.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
13928
// Messenger callback: connection reset by peer.  Returning false means
// we do not take over handling; reconnection logic runs elsewhere.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
13934
// Messenger callback: the remote end reset the session state.  For MDS
// connections, look up the matching session and react according to its
// state machine (close, retry open, or mark/close an open session).
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Re-open: carry the waiters over to the fresh session.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13994
// Messenger callback: connection attempt was refused.  Log only;
// returning false leaves handling to the default reconnect path.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14000
// Messenger callback: supply an authorizer for an outgoing connection.
// Monitors authenticate differently, so no authorizer is built for them.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
14008
// Walk up the snap-realm chain from 'in' and return the nearest ancestor
// inode with quota enabled; falls back to root_ancestor when none is
// found or a realm's inode is not in our cache.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14030
14031/**
14032 * Traverse quota ancestors of the Inode, return true
14033 * if any of them passes the passed function
14034 */
14035bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14036 std::function<bool (const Inode &in)> test)
14037{
14038 while (true) {
11fdf7f2 14039 ceph_assert(in != NULL);
7c673cae
FG
14040 if (test(*in)) {
14041 return true;
14042 }
14043
14044 if (in == root_ancestor) {
14045 // We're done traversing, drop out
14046 return false;
14047 } else {
14048 // Continue up the tree
14049 in = get_quota_root(in, perms);
14050 }
14051 }
14052
14053 return false;
14054}
14055
14056bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14057{
14058 return check_quota_condition(in, perms,
14059 [](const Inode &in) {
14060 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14061 });
14062}
14063
14064bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14065 const UserPerm& perms)
7c673cae
FG
14066{
14067 return check_quota_condition(in, perms,
11fdf7f2 14068 [&new_bytes](const Inode &in) {
7c673cae
FG
14069 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14070 > in.quota.max_bytes;
14071 });
14072}
14073
11fdf7f2 14074bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 14075{
11fdf7f2
TL
14076 return check_quota_condition(in, perms,
14077 [](const Inode &in) {
14078 if (in.quota.max_bytes) {
14079 if (in.rstat.rbytes >= in.quota.max_bytes) {
14080 return true;
14081 }
14082
14083 ceph_assert(in.size >= in.reported_size);
14084 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14085 const uint64_t size = in.size - in.reported_size;
14086 return (space >> 4) < size;
14087 } else {
14088 return false;
14089 }
14090 });
7c673cae
FG
14091}
14092
// States/permission bits cached per (pool id, namespace) in pool_perms;
// maintained by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // probe showed the client may read from the pool
  POOL_WRITE = 8,     // probe showed the client may write to the pool
};
14099
/*
 * Verify that our OSD capabilities permit the access in @need
 * (CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR) to the data pool backing @in.
 *
 * The first caller for a given (pool, namespace) probes the pool by issuing
 * a read (stat) op and an exclusive-create write op against the inode's
 * first object; the outcome is cached in pool_perms so later calls are
 * answered from the cache. Concurrent callers wait on
 * waiting_for_pool_perm while a probe is in flight.
 *
 * Returns 0 if allowed (or checking is disabled), -EPERM if the needed
 * access is denied, -EIO if the probe failed with an unexpected error.
 * NOTE(review): caller is expected to hold client_lock — it is dropped and
 * re-taken around the probe waits below.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in-flight so other threads block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a plain stat of the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create (EEXIST is fine — see below).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both ops complete on the OSDs.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // ENOENT still proves read access (object simply doesn't exist yet).
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // EEXIST still proves write access (exclusive create raced the object).
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the result and wake any waiters blocked on the probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14202
14203int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14204{
14205 if (acl_type == POSIX_ACL) {
14206 if (in->xattrs.count(ACL_EA_ACCESS)) {
14207 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14208
14209 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14210 }
14211 }
14212 return -EAGAIN;
14213}
14214
14215int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14216{
14217 if (acl_type == NO_ACL)
14218 return 0;
14219
14220 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14221 if (r < 0)
14222 goto out;
14223
14224 if (acl_type == POSIX_ACL) {
14225 if (in->xattrs.count(ACL_EA_ACCESS)) {
14226 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14227 bufferptr acl(access_acl.c_str(), access_acl.length());
14228 r = posix_acl_access_chmod(acl, mode);
14229 if (r < 0)
14230 goto out;
14231 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14232 } else {
14233 r = 0;
14234 }
14235 }
14236out:
14237 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14238 return r;
14239}
14240
/*
 * Compute the ACL xattrs a new inode created under @dir should carry,
 * derived from the directory's default ACL, and adjust *mode accordingly
 * (ACL inheritance or umask). The xattrs are encoded into @xattrs_bl.
 *
 * Returns the number of xattrs encoded (0 if none), or a negative error.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs or honor the umask
  if (S_ISLNK(*mode))
    return 0;

  // refresh the directory's xattrs if we have never fetched them
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // inherit from a private copy of the directory's default ACL
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// inherited ACL is non-trivial; if it cannot be expressed purely
	// as mode bits, store it as the new inode's access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // new directories also propagate the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      // r becomes the xattr count we report to the caller
      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: apply the process umask via callback, if provided
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14288
14289void Client::set_filer_flags(int flags)
14290{
11fdf7f2
TL
14291 std::lock_guard l(client_lock);
14292 ceph_assert(flags == 0 ||
7c673cae
FG
14293 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14294 objecter->add_global_op_flags(flags);
14295}
14296
14297void Client::clear_filer_flags(int flags)
14298{
11fdf7f2
TL
14299 std::lock_guard l(client_lock);
14300 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
14301 objecter->clear_global_op_flag(flags);
14302}
14303
11fdf7f2
TL
14304// called before mount
14305void Client::set_uuid(const std::string& uuid)
14306{
14307 std::lock_guard l(client_lock);
14308 assert(initialized);
14309 assert(!uuid.empty());
14310
14311 metadata["uuid"] = uuid;
14312 _close_sessions();
14313}
14314
14315// called before mount. 0 means infinite
14316void Client::set_session_timeout(unsigned timeout)
14317{
14318 std::lock_guard l(client_lock);
14319 assert(initialized);
14320
14321 metadata["timeout"] = stringify(timeout);
14322}
14323
// called before mount
//
// Reclaim state (caps, etc.) left on the MDSs by a previous client instance
// identified by @uuid. Iterates over every in-map MDS rank, opening a
// session where needed and sending MClientReclaim until each rank resolves
// to success or failure. With CEPH_RECLAIM_RESET the old session state is
// discarded rather than adopted.
//
// Returns 0 on success; -ENOTCONN/-EINVAL/-EPERM/-EOPNOTSUPP/-ENOENT or
// -ENOTRECOVERABLE (or the MDS-reported reclaim_errno) on failure.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // refuse to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a real mdsmap before we can walk the ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // NOTE: mds only advances once that rank's reclaim state is resolved;
  // the wait_on_list() calls below drop and re-take client_lock.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // (re)send the reclaim request and wait for the reply handler
      // (handle_client_reclaim_reply) to update reclaim_state
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // reclaim_errno was filled in by handle_client_reclaim_reply()
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember the uuid until finish_reclaim() adopts it as our own
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14422
14423void Client::finish_reclaim()
14424{
14425 auto it = metadata.find("reclaiming_uuid");
14426 if (it == metadata.end()) {
14427 for (auto &p : mds_sessions)
14428 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14429 return;
14430 }
14431
14432 for (auto &p : mds_sessions) {
14433 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14434 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14435 p.second.con->send_message2(std::move(m));
14436 }
14437
14438 metadata["uuid"] = it->second;
14439 metadata.erase(it);
14440}
14441
14442void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14443{
14444 mds_rank_t from = mds_rank_t(reply->get_source().num());
14445 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14446
14447 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14448 if (!session) {
14449 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14450 return;
14451 }
14452
14453 if (reply->get_result() >= 0) {
14454 session->reclaim_state = MetaSession::RECLAIM_OK;
14455 if (reply->get_epoch() > reclaim_osd_epoch)
14456 reclaim_osd_epoch = reply->get_epoch();
14457 if (!reply->get_addrs().empty())
14458 reclaim_target_addrs = reply->get_addrs();
14459 } else {
14460 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14461 reclaim_errno = reply->get_result();
14462 }
14463
14464 signal_cond_list(waiting_for_reclaim);
14465}
14466
7c673cae
FG
14467/**
14468 * This is included in cap release messages, to cause
14469 * the MDS to wait until this OSD map epoch. It is necessary
14470 * in corner cases where we cancel RADOS ops, so that
14471 * nobody else tries to do IO to the same objects in
14472 * the same epoch as the cancelled ops.
14473 */
14474void Client::set_cap_epoch_barrier(epoch_t e)
14475{
14476 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14477 cap_epoch_barrier = e;
14478}
14479
14480const char** Client::get_tracked_conf_keys() const
14481{
14482 static const char* keys[] = {
14483 "client_cache_size",
14484 "client_cache_mid",
14485 "client_acl_type",
b32b8144
FG
14486 "client_deleg_timeout",
14487 "client_deleg_break_on_open",
7c673cae
FG
14488 NULL
14489 };
14490 return keys;
14491}
14492
11fdf7f2 14493void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
14494 const std::set <std::string> &changed)
14495{
11fdf7f2 14496 std::lock_guard lock(client_lock);
7c673cae 14497
181888fb 14498 if (changed.count("client_cache_mid")) {
7c673cae
FG
14499 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14500 }
14501 if (changed.count("client_acl_type")) {
14502 acl_type = NO_ACL;
14503 if (cct->_conf->client_acl_type == "posix_acl")
14504 acl_type = POSIX_ACL;
14505 }
14506}
14507
7c673cae
FG
// Reference-count hook used by intrusive_ptr<Inode>: take a reference.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14512
// Reference-count hook used by intrusive_ptr<Inode>: release a reference
// via the owning client's put_inode().
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14517
14518mds_rank_t Client::_get_random_up_mds() const
14519{
11fdf7f2 14520 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
14521
14522 std::set<mds_rank_t> up;
14523 mdsmap->get_up_mds_set(up);
14524
14525 if (up.empty())
14526 return MDS_RANK_NONE;
14527 std::set<mds_rank_t>::const_iterator p = up.begin();
14528 for (int n = rand() % up.size(); n; n--)
14529 ++p;
14530 return *p;
14531}
14532
14533
// A standalone client owns its own Objecter (created here; freed in the
// destructor) instead of sharing one with an embedding process.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14540
StandaloneClient::~StandaloneClient()
{
  // the objecter was allocated in our constructor, so we free it here
  delete objecter;
  objecter = nullptr;
}
14546
// Bring up a standalone client: start the timer, object cacher and
// objecter, register dispatchers, and initialize the mon client; finish
// with the generic Client init. Returns 0 on success or the monclient
// init error (after unwinding the partially-started services).
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (lock is released before the shutdown calls below)
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
14577
void StandaloneClient::shutdown()
{
  // generic client teardown first, then the services we started in init()
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}