]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
Add patch for failing prerm scripts
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
73#include "common/Mutex.h"
74#include "common/perf_counters.h"
75#include "common/admin_socket.h"
76#include "common/errno.h"
77#include "include/str_list.h"
78
79#define dout_subsys ceph_subsys_client
80
81#include "include/lru.h"
82#include "include/compat.h"
83#include "include/stringify.h"
84
85#include "Client.h"
86#include "Inode.h"
87#include "Dentry.h"
b32b8144 88#include "Delegation.h"
7c673cae
FG
89#include "Dir.h"
90#include "ClientSnapRealm.h"
91#include "Fh.h"
92#include "MetaSession.h"
93#include "MetaRequest.h"
94#include "ObjecterWriteback.h"
95#include "posix_acl.h"
96
11fdf7f2 97#include "include/ceph_assert.h"
7c673cae
FG
98#include "include/stat.h"
99
100#include "include/cephfs/ceph_statx.h"
101
102#if HAVE_GETGROUPLIST
103#include <grp.h>
104#include <pwd.h>
105#include <unistd.h>
106#endif
107
108#undef dout_prefix
109#define dout_prefix *_dout << "client." << whoami << " "
110
111#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112
113// FreeBSD fails to define this
114#ifndef O_DSYNC
115#define O_DSYNC 0x0
116#endif
117// Darwin fails to define this
118#ifndef O_RSYNC
119#define O_RSYNC 0x0
120#endif
121
122#ifndef O_DIRECT
123#define O_DIRECT 0x0
124#endif
125
126#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129{
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132}
133
134
135// -------------
136
137Client::CommandHook::CommandHook(Client *client) :
138 m_client(client)
139{
140}
141
11fdf7f2
TL
142bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
7c673cae 145{
11fdf7f2 146 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae
FG
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
11fdf7f2 150 m_client->dump_mds_requests(f.get());
7c673cae 151 else if (command == "mds_sessions")
11fdf7f2 152 m_client->dump_mds_sessions(f.get());
7c673cae 153 else if (command == "dump_cache")
11fdf7f2 154 m_client->dump_cache(f.get());
7c673cae
FG
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
11fdf7f2 158 m_client->dump_status(f.get());
7c673cae 159 else
11fdf7f2 160 ceph_abort_msg("bad command registered");
7c673cae
FG
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
7c673cae
FG
164 return true;
165}
166
167
168// -------------
169
170dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
171 : inode(in), offset(0), next_offset(2),
172 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
173 perms(perms)
174 { }
175
176void Client::_reset_faked_inos()
177{
178 ino_t start = 1024;
179 free_faked_inos.clear();
180 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
181 last_used_faked_ino = 0;
11fdf7f2 182 last_used_faked_root = 0;
7c673cae
FG
183 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
184}
185
186void Client::_assign_faked_ino(Inode *in)
187{
11fdf7f2
TL
188 if (0 == last_used_faked_ino)
189 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
190 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
191 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 192 last_used_faked_ino = 2048;
7c673cae
FG
193 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
194 }
11fdf7f2 195 ceph_assert(it != free_faked_inos.end());
7c673cae 196 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 197 ceph_assert(it.get_len() > 0);
7c673cae
FG
198 last_used_faked_ino = it.get_start();
199 } else {
200 ++last_used_faked_ino;
11fdf7f2 201 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
202 }
203 in->faked_ino = last_used_faked_ino;
204 free_faked_inos.erase(in->faked_ino);
205 faked_ino_map[in->faked_ino] = in->vino();
206}
207
11fdf7f2
TL
208/*
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
214*/
215void Client::_assign_faked_root(Inode *in)
216{
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232}
233
7c673cae
FG
234void Client::_release_faked_ino(Inode *in)
235{
236 free_faked_inos.insert(in->faked_ino);
237 faked_ino_map.erase(in->faked_ino);
238}
239
240vinodeno_t Client::_map_faked_ino(ino_t ino)
241{
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
250 return vino;
251}
252
253vinodeno_t Client::map_faked_ino(ino_t ino)
254{
11fdf7f2 255 std::lock_guard lock(client_lock);
7c673cae
FG
256 return _map_faked_ino(ino);
257}
258
259// cons/des
260
261Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
262 : Dispatcher(m->cct),
7c673cae 263 timer(m->cct, client_lock),
11fdf7f2
TL
264 client_lock("Client::client_lock"),
265 messenger(m),
266 monclient(mc),
267 objecter(objecter_),
268 whoami(mc->get_global_id()),
7c673cae
FG
269 async_ino_invalidator(m->cct),
270 async_dentry_invalidator(m->cct),
271 interrupt_finisher(m->cct),
272 remount_finisher(m->cct),
273 objecter_finisher(m->cct),
11fdf7f2
TL
274 m_command_hook(this),
275 fscid(0)
7c673cae
FG
276{
277 _reset_faked_inos();
7c673cae 278
7c673cae
FG
279 user_id = cct->_conf->client_mount_uid;
280 group_id = cct->_conf->client_mount_gid;
281
7c673cae
FG
282 if (cct->_conf->client_acl_type == "posix_acl")
283 acl_type = POSIX_ACL;
284
7c673cae
FG
285 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
286
287 // file handles
288 free_fd_set.insert(10, 1<<30);
289
290 mdsmap.reset(new MDSMap);
291
292 // osd interfaces
293 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
294 &client_lock));
295 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
296 client_flush_set_callback, // all commit callback
297 (void*)this,
298 cct->_conf->client_oc_size,
299 cct->_conf->client_oc_max_objects,
300 cct->_conf->client_oc_max_dirty,
301 cct->_conf->client_oc_target_dirty,
302 cct->_conf->client_oc_max_dirty_age,
303 true));
304 objecter_finisher.start();
305 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 306 objecter->enable_blacklist_events();
7c673cae
FG
307}
308
309
310Client::~Client()
311{
11fdf7f2 312 ceph_assert(!client_lock.is_locked());
7c673cae 313
31f18b77
FG
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 client_lock.Lock();
7c673cae 318 tear_down_cache();
31f18b77 319 client_lock.Unlock();
7c673cae
FG
320}
321
322void Client::tear_down_cache()
323{
324 // fd's
325 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
326 it != fd_map.end();
327 ++it) {
328 Fh *fh = it->second;
11fdf7f2 329 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
330 _release_fh(fh);
331 }
332 fd_map.clear();
333
334 while (!opened_dirs.empty()) {
335 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 336 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
337 _closedir(dirp);
338 }
339
340 // caps!
341 // *** FIXME ***
342
343 // empty lru
7c673cae 344 trim_cache();
11fdf7f2 345 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
346
347 // close root ino
11fdf7f2 348 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
349 if (root && inode_map.size() == 1 + root_parents.size()) {
350 delete root;
351 root = 0;
352 root_ancestor = 0;
353 while (!root_parents.empty())
354 root_parents.erase(root_parents.begin());
355 inode_map.clear();
356 _reset_faked_inos();
357 }
358
11fdf7f2 359 ceph_assert(inode_map.empty());
7c673cae
FG
360}
361
362inodeno_t Client::get_root_ino()
363{
11fdf7f2 364 std::lock_guard l(client_lock);
7c673cae
FG
365 if (use_faked_inos())
366 return root->faked_ino;
367 else
368 return root->ino;
369}
370
371Inode *Client::get_root()
372{
11fdf7f2 373 std::lock_guard l(client_lock);
7c673cae
FG
374 root->ll_get();
375 return root;
376}
377
378
379// debug crapola
380
381void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
382{
383 filepath path;
384 in->make_long_path(path);
385 ldout(cct, 1) << "dump_inode: "
386 << (disconnected ? "DISCONNECTED ":"")
387 << "inode " << in->ino
388 << " " << path
389 << " ref " << in->get_num_ref()
390 << *in << dendl;
391
392 if (f) {
393 f->open_object_section("inode");
394 f->dump_stream("path") << path;
395 if (disconnected)
396 f->dump_int("disconnected", 1);
397 in->dump(f);
398 f->close_section();
399 }
400
401 did.insert(in);
402 if (in->dir) {
403 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
404 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
405 it != in->dir->dentries.end();
406 ++it) {
407 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
408 if (f) {
409 f->open_object_section("dentry");
410 it->second->dump(f);
411 f->close_section();
412 }
413 if (it->second->inode)
414 dump_inode(f, it->second->inode.get(), did, false);
415 }
416 }
417}
418
419void Client::dump_cache(Formatter *f)
420{
421 set<Inode*> did;
422
11fdf7f2 423 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
424
425 if (f)
426 f->open_array_section("cache");
427
428 if (root)
429 dump_inode(f, root, did, true);
430
431 // make a second pass to catch anything disconnected
432 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
433 it != inode_map.end();
434 ++it) {
435 if (did.count(it->second))
436 continue;
437 dump_inode(f, it->second, did, true);
438 }
439
440 if (f)
441 f->close_section();
442}
443
444void Client::dump_status(Formatter *f)
445{
11fdf7f2 446 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
447
448 ldout(cct, 1) << __func__ << dendl;
449
450 const epoch_t osd_epoch
451 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
452
453 if (f) {
454 f->open_object_section("metadata");
455 for (const auto& kv : metadata)
456 f->dump_string(kv.first.c_str(), kv.second);
457 f->close_section();
458
459 f->dump_int("dentry_count", lru.lru_get_size());
460 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
461 f->dump_int("id", get_nodeid().v);
11fdf7f2 462 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 463 f->dump_object("inst", inst);
11fdf7f2
TL
464 f->dump_object("addr", inst.addr);
465 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
466 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
467 f->dump_int("inode_count", inode_map.size());
468 f->dump_int("mds_epoch", mdsmap->get_epoch());
469 f->dump_int("osd_epoch", osd_epoch);
470 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f64942e4 471 f->dump_bool("blacklisted", blacklisted);
7c673cae
FG
472 }
473}
474
475int Client::init()
476{
477 timer.init();
478 objectcacher->start();
479
480 client_lock.Lock();
11fdf7f2 481 ceph_assert(!initialized);
7c673cae
FG
482
483 messenger->add_dispatcher_tail(this);
484 client_lock.Unlock();
485
486 _finish_init();
487 return 0;
488}
489
490void Client::_finish_init()
491{
492 client_lock.Lock();
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
11fdf7f2
TL
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
7c673cae
FG
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502
503 client_lock.Unlock();
504
11fdf7f2 505 cct->_conf.add_observer(this);
7c673cae
FG
506
507 AdminSocket* admin_socket = cct->get_admin_socket();
508 int ret = admin_socket->register_command("mds_requests",
509 "mds_requests",
510 &m_command_hook,
511 "show in-progress mds requests");
512 if (ret < 0) {
513 lderr(cct) << "error registering admin socket command: "
514 << cpp_strerror(-ret) << dendl;
515 }
516 ret = admin_socket->register_command("mds_sessions",
517 "mds_sessions",
518 &m_command_hook,
519 "show mds session state");
520 if (ret < 0) {
521 lderr(cct) << "error registering admin socket command: "
522 << cpp_strerror(-ret) << dendl;
523 }
524 ret = admin_socket->register_command("dump_cache",
525 "dump_cache",
526 &m_command_hook,
527 "show in-memory metadata cache contents");
528 if (ret < 0) {
529 lderr(cct) << "error registering admin socket command: "
530 << cpp_strerror(-ret) << dendl;
531 }
532 ret = admin_socket->register_command("kick_stale_sessions",
533 "kick_stale_sessions",
534 &m_command_hook,
535 "kick sessions that were remote reset");
536 if (ret < 0) {
537 lderr(cct) << "error registering admin socket command: "
538 << cpp_strerror(-ret) << dendl;
539 }
540 ret = admin_socket->register_command("status",
541 "status",
542 &m_command_hook,
543 "show overall client status");
544 if (ret < 0) {
545 lderr(cct) << "error registering admin socket command: "
546 << cpp_strerror(-ret) << dendl;
547 }
548
549 client_lock.Lock();
550 initialized = true;
551 client_lock.Unlock();
552}
553
554void Client::shutdown()
555{
11fdf7f2 556 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
557
558 // If we were not mounted, but were being used for sending
559 // MDS commands, we may have sessions that need closing.
560 client_lock.Lock();
561 _close_sessions();
562 client_lock.Unlock();
563
11fdf7f2 564 cct->_conf.remove_observer(this);
7c673cae 565
11fdf7f2 566 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
567
568 if (ino_invalidate_cb) {
569 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
570 async_ino_invalidator.wait_for_empty();
571 async_ino_invalidator.stop();
572 }
573
574 if (dentry_invalidate_cb) {
575 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
576 async_dentry_invalidator.wait_for_empty();
577 async_dentry_invalidator.stop();
578 }
579
580 if (switch_interrupt_cb) {
581 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
582 interrupt_finisher.wait_for_empty();
583 interrupt_finisher.stop();
584 }
585
586 if (remount_cb) {
587 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
588 remount_finisher.wait_for_empty();
589 remount_finisher.stop();
590 }
591
592 objectcacher->stop(); // outside of client_lock! this does a join.
593
594 client_lock.Lock();
11fdf7f2 595 ceph_assert(initialized);
7c673cae
FG
596 initialized = false;
597 timer.shutdown();
598 client_lock.Unlock();
599
600 objecter_finisher.wait_for_empty();
601 objecter_finisher.stop();
602
603 if (logger) {
604 cct->get_perfcounters_collection()->remove(logger.get());
605 logger.reset();
606 }
607}
608
609
610// ===================
611// metadata cache stuff
612
613void Client::trim_cache(bool trim_kernel_dcache)
614{
181888fb
FG
615 uint64_t max = cct->_conf->client_cache_size;
616 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
617 unsigned last = 0;
618 while (lru.lru_get_size() != last) {
619 last = lru.lru_get_size();
620
181888fb 621 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
622
623 // trim!
31f18b77 624 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
625 if (!dn)
626 break; // done
627
628 trim_dentry(dn);
629 }
630
181888fb 631 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
632 _invalidate_kernel_dcache();
633
634 // hose root?
635 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
636 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
637 delete root;
638 root = 0;
639 root_ancestor = 0;
640 while (!root_parents.empty())
641 root_parents.erase(root_parents.begin());
642 inode_map.clear();
643 _reset_faked_inos();
644 }
645}
646
647void Client::trim_cache_for_reconnect(MetaSession *s)
648{
649 mds_rank_t mds = s->mds_num;
11fdf7f2 650 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
651
652 int trimmed = 0;
653 list<Dentry*> skipped;
654 while (lru.lru_get_size() > 0) {
655 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
656 if (!dn)
657 break;
658
659 if ((dn->inode && dn->inode->caps.count(mds)) ||
660 dn->dir->parent_inode->caps.count(mds)) {
661 trim_dentry(dn);
662 trimmed++;
663 } else
664 skipped.push_back(dn);
665 }
666
667 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
668 lru.lru_insert_mid(*p);
669
11fdf7f2 670 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
671 << " trimmed " << trimmed << " dentries" << dendl;
672
673 if (s->caps.size() > 0)
674 _invalidate_kernel_dcache();
675}
676
677void Client::trim_dentry(Dentry *dn)
678{
679 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
680 << " in dir "
681 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
682 << dendl;
683 if (dn->inode) {
684 Inode *diri = dn->dir->parent_inode;
685 diri->dir_release_count++;
686 clear_dir_complete_and_ordered(diri, true);
687 }
688 unlink(dn, false, false); // drop dir, drop dentry
689}
690
691
1adf2230
AA
692void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
693 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 694{
7c673cae
FG
695 uint64_t prior_size = in->size;
696
7c673cae
FG
697 if (truncate_seq > in->truncate_seq ||
698 (truncate_seq == in->truncate_seq && size > in->size)) {
699 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
700 in->size = size;
701 in->reported_size = size;
702 if (truncate_seq != in->truncate_seq) {
703 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
704 << truncate_seq << dendl;
705 in->truncate_seq = truncate_seq;
706 in->oset.truncate_seq = truncate_seq;
707
708 // truncate cached file data
709 if (prior_size > size) {
710 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
711 }
712 }
713
714 // truncate inline data
715 if (in->inline_version < CEPH_INLINE_NONE) {
716 uint32_t len = in->inline_data.length();
717 if (size < len)
718 in->inline_data.splice(size, len - size);
719 }
720 }
721 if (truncate_seq >= in->truncate_seq &&
722 in->truncate_size != truncate_size) {
723 if (in->is_file()) {
724 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
725 << truncate_size << dendl;
726 in->truncate_size = truncate_size;
727 in->oset.truncate_size = truncate_size;
728 } else {
729 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
730 }
731 }
1adf2230
AA
732}
733
734void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
735 utime_t ctime, utime_t mtime, utime_t atime)
736{
737 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
738 << " ctime " << ctime << " mtime " << mtime << dendl;
739
740 if (time_warp_seq > in->time_warp_seq)
741 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
742 << " is higher than local time_warp_seq "
743 << in->time_warp_seq << dendl;
744
745 int warn = false;
7c673cae
FG
746 // be careful with size, mtime, atime
747 if (issued & (CEPH_CAP_FILE_EXCL|
748 CEPH_CAP_FILE_WR|
749 CEPH_CAP_FILE_BUFFER|
750 CEPH_CAP_AUTH_EXCL|
751 CEPH_CAP_XATTR_EXCL)) {
752 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
753 if (ctime > in->ctime)
754 in->ctime = ctime;
755 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
756 //the mds updated times, so take those!
757 in->mtime = mtime;
758 in->atime = atime;
759 in->time_warp_seq = time_warp_seq;
760 } else if (time_warp_seq == in->time_warp_seq) {
761 //take max times
762 if (mtime > in->mtime)
763 in->mtime = mtime;
764 if (atime > in->atime)
765 in->atime = atime;
766 } else if (issued & CEPH_CAP_FILE_EXCL) {
767 //ignore mds values as we have a higher seq
768 } else warn = true;
769 } else {
770 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
771 if (time_warp_seq >= in->time_warp_seq) {
772 in->ctime = ctime;
773 in->mtime = mtime;
774 in->atime = atime;
775 in->time_warp_seq = time_warp_seq;
776 } else warn = true;
777 }
778 if (warn) {
779 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
780 << time_warp_seq << " is lower than local time_warp_seq "
781 << in->time_warp_seq
782 << dendl;
783 }
784}
785
786void Client::_fragmap_remove_non_leaves(Inode *in)
787{
788 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
789 if (!in->dirfragtree.is_leaf(p->first))
790 in->fragmap.erase(p++);
791 else
792 ++p;
793}
794
795void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
796{
797 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
798 if (p->second == mds)
799 in->fragmap.erase(p++);
800 else
801 ++p;
802}
803
804Inode * Client::add_update_inode(InodeStat *st, utime_t from,
805 MetaSession *session,
806 const UserPerm& request_perms)
807{
808 Inode *in;
809 bool was_new = false;
810 if (inode_map.count(st->vino)) {
811 in = inode_map[st->vino];
11fdf7f2 812 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
813 } else {
814 in = new Inode(this, st->vino, &st->layout);
815 inode_map[st->vino] = in;
816
817 if (use_faked_inos())
818 _assign_faked_ino(in);
819
820 if (!root) {
821 root = in;
11fdf7f2
TL
822 if (use_faked_inos())
823 _assign_faked_root(root);
7c673cae
FG
824 root_ancestor = in;
825 cwd = root;
826 } else if (!mounted) {
827 root_parents[root_ancestor] = in;
828 root_ancestor = in;
829 }
830
831 // immutable bits
832 in->ino = st->vino.ino;
833 in->snapid = st->vino.snapid;
834 in->mode = st->mode & S_IFMT;
835 was_new = true;
836 }
837
838 in->rdev = st->rdev;
839 if (in->is_symlink())
840 in->symlink = st->symlink;
841
7c673cae 842 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
843 bool new_version = false;
844 if (in->version == 0 ||
845 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
846 (in->version & ~1) < st->version))
847 new_version = true;
7c673cae 848
1adf2230
AA
849 int issued;
850 in->caps_issued(&issued);
851 issued |= in->caps_dirty();
852 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 853
1adf2230
AA
854 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
855 !(issued & CEPH_CAP_AUTH_EXCL)) {
856 in->mode = st->mode;
857 in->uid = st->uid;
858 in->gid = st->gid;
859 in->btime = st->btime;
81eedcae 860 in->snap_btime = st->snap_btime;
1adf2230 861 }
7c673cae 862
1adf2230
AA
863 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
864 !(issued & CEPH_CAP_LINK_EXCL)) {
865 in->nlink = st->nlink;
866 }
7c673cae 867
1adf2230
AA
868 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
869 update_inode_file_time(in, issued, st->time_warp_seq,
870 st->ctime, st->mtime, st->atime);
871 }
7c673cae 872
1adf2230
AA
873 if (new_version ||
874 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 875 in->layout = st->layout;
1adf2230
AA
876 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
877 }
7c673cae 878
1adf2230
AA
879 if (in->is_dir()) {
880 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
881 in->dirstat = st->dirstat;
882 }
883 // dir_layout/rstat/quota are not tracked by capability, update them only if
884 // the inode stat is from auth mds
885 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
886 in->dir_layout = st->dir_layout;
887 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
888 in->rstat = st->rstat;
889 in->quota = st->quota;
11fdf7f2 890 in->dir_pin = st->dir_pin;
1adf2230
AA
891 }
892 // move me if/when version reflects fragtree changes.
893 if (in->dirfragtree != st->dirfragtree) {
894 in->dirfragtree = st->dirfragtree;
895 _fragmap_remove_non_leaves(in);
7c673cae 896 }
7c673cae
FG
897 }
898
899 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
900 st->xattrbl.length() &&
901 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
902 auto p = st->xattrbl.cbegin();
903 decode(in->xattrs, p);
7c673cae
FG
904 in->xattr_version = st->xattr_version;
905 }
906
1adf2230
AA
907 if (st->inline_version > in->inline_version) {
908 in->inline_data = st->inline_data;
909 in->inline_version = st->inline_version;
7c673cae
FG
910 }
911
1adf2230
AA
912 /* always take a newer change attr */
913 if (st->change_attr > in->change_attr)
914 in->change_attr = st->change_attr;
915
916 if (st->version > in->version)
917 in->version = st->version;
918
919 if (was_new)
920 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
921
922 if (!st->cap.caps)
923 return in; // as with readdir returning indoes in different snaprealms (no caps!)
924
7c673cae 925 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
926 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
927 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
928 st->cap.flags, request_perms);
28e407b8 929 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 930 in->max_size = st->max_size;
28e407b8
AA
931 in->rstat = st->rstat;
932 }
7c673cae 933
1adf2230
AA
934 // setting I_COMPLETE needs to happen after adding the cap
935 if (in->is_dir() &&
936 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
937 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
938 in->dirstat.nfiles == 0 &&
939 in->dirstat.nsubdirs == 0) {
940 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
941 in->flags |= I_COMPLETE | I_DIR_ORDERED;
942 if (in->dir) {
943 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
944 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
945 in->dir->readdir_cache.clear();
946 for (const auto& p : in->dir->dentries) {
947 unlink(p.second, true, true); // keep dir, keep dentry
948 }
949 if (in->dir->dentries.empty())
950 close_dir(in->dir);
7c673cae 951 }
7c673cae 952 }
1adf2230
AA
953 } else {
954 in->snap_caps |= st->cap.caps;
7c673cae
FG
955 }
956
957 return in;
958}
959
960
961/*
962 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
963 */
964Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
965 Inode *in, utime_t from, MetaSession *session,
966 Dentry *old_dentry)
967{
968 Dentry *dn = NULL;
969 if (dir->dentries.count(dname))
970 dn = dir->dentries[dname];
971
11fdf7f2 972 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
973 << " in dir " << dir->parent_inode->vino() << " dn " << dn
974 << dendl;
975
976 if (dn && dn->inode) {
977 if (dn->inode->vino() == in->vino()) {
978 touch_dn(dn);
979 ldout(cct, 12) << " had dentry " << dname
980 << " with correct vino " << dn->inode->vino()
981 << dendl;
982 } else {
983 ldout(cct, 12) << " had dentry " << dname
984 << " with WRONG vino " << dn->inode->vino()
985 << dendl;
986 unlink(dn, true, true); // keep dir, keep dentry
987 }
988 }
989
990 if (!dn || !dn->inode) {
991 InodeRef tmp_ref(in);
992 if (old_dentry) {
993 if (old_dentry->dir != dir) {
994 Inode *old_diri = old_dentry->dir->parent_inode;
995 old_diri->dir_ordered_count++;
996 clear_dir_complete_and_ordered(old_diri, false);
997 }
998 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
999 }
1000 Inode *diri = dir->parent_inode;
1001 diri->dir_ordered_count++;
1002 clear_dir_complete_and_ordered(diri, false);
1003 dn = link(dir, dname, in, dn);
1004 }
1005
1006 update_dentry_lease(dn, dlease, from, session);
1007 return dn;
1008}
1009
/*
 * Apply a dentry lease from an MDS reply to a cached dentry.
 *
 * Only extends the lease (never shortens it), and only when the reply's
 * lease mask actually grants a dentry lease (CEPH_LOCK_DN).  The shared
 * generation of the parent directory is always recorded so the dentry can
 * be validated against future cap changes.
 */
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  // lease expiry = send time + duration from the MDS
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      // new lease extends beyond the current one; adopt it
      ldout(cct, 10) << "got dentry lease on " << dn->name
	       << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1029
1030
/*
 * update MDS location cache for a single inode
 *
 * Records which MDS is authoritative for the given dirfrag (or erases the
 * mapping if the reply carries no auth), forces the fragtree to be a leaf
 * at that frag, and notes whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth means "unknown"; drop any stale mapping
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    // the MDS told us about this exact frag, so it must be a leaf;
    // collapse the local fragtree and drop mappings for non-leaf frags
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1065
1066void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1067{
1068 if (diri->flags & I_COMPLETE) {
1069 if (complete) {
1070 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1071 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1072 } else {
1073 if (diri->flags & I_DIR_ORDERED) {
1074 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1075 diri->flags &= ~I_DIR_ORDERED;
1076 }
1077 }
1078 if (diri->dir)
1079 diri->dir->readdir_cache.clear();
1080 }
1081}
1082
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra bufferlist (DirStat, then numdn dentries, each
 * as name + LeaseStat + InodeStat), links every entry into the dcache,
 * fills the dir_result_t buffer used by readdir, and opportunistically
 * maintains the shared readdir cache when this listing started from the
 * beginning of the directory.  Decode order here mirrors the MDS encode
 * order exactly and must not be changed.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // With REPLY_ENCODING the payload is self-describing; otherwise fall back
  // to the connection's negotiated feature bits to pick decoders.
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0 and 1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      // the MDS redirected us to a different frag than the one we asked for
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // only a listing that starts at the very beginning of the directory can
    // be used to (re)build the shared readdir cache
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // per-entry wire order: name, dentry lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;  // offsets restart within each hash bucket
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1244
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A trace optionally contains a dentry record (parent dir InodeStat,
 * DirStat, dentry name, LeaseStat) and a target InodeStat, in that wire
 * order.  This updates the inode/dentry caches accordingly, handles
 * traceless replies by invalidating affected dentries, and returns the
 * target inode (or NULL when there is nothing to return, e.g. a traceless
 * or already-unsafe reply).  Decode order must not be changed.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the unsafe reply already carried the trace; a later safe reply is empty
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we can no longer trust cached state for the dentry
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // REPLY_ENCODING means a self-describing payload; otherwise use the
  // connection's negotiated feature bits to choose the decoders
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs, the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // null dentry (e.g. negative lookup); drop any stale link
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1420
1421// -------
1422
/*
 * Pick the MDS rank a request should be sent to.
 *
 * Preference order: explicit resend_mds override, dirfrag hash mapping
 * (when the request names a dentry and we know the frag's auth MDS), the
 * inode's auth cap, any cap at all, and finally a random up MDS.  For
 * snapped inodes the walk is redirected to the nearest non-snap ancestor.
 *
 * @param req        the request being routed
 * @param phash_diri [out, optional] set to the directory inode whose
 *                   fragmap supplied the answer, when hashing was used
 * @return the chosen MDS rank (never MDS_RANK_NONE on return)
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    // a forward/retry already told us exactly where to go
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to find the responsible dirfrag
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: route by its name hash within the parent dir
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes have no caps/auth of their own; walk up to the
      // nearest live (non-snap) ancestor and route based on that
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1525
1526
1527void Client::connect_mds_targets(mds_rank_t mds)
1528{
11fdf7f2
TL
1529 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1530 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1531 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1532 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1533 q != info.export_targets.end();
1534 ++q) {
1535 if (mds_sessions.count(*q) == 0 &&
1536 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1537 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1538 << " export target mds." << *q << dendl;
1539 _open_mds_session(*q);
1540 }
1541 }
1542}
1543
/*
 * Dump client identity and all MDS sessions to a Formatter (admin socket
 * "mds_sessions" command output).
 */
void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}
1560void Client::dump_mds_requests(Formatter *f)
1561{
1562 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1563 p != mds_requests.end();
1564 ++p) {
1565 f->open_object_section("request");
1566 p->second->dump(f);
1567 f->close_section();
1568 }
1569}
1570
/*
 * After a reply that may have created an inode, make sure *ptarget points
 * at the resulting inode.
 *
 * If the reply carried a trace, request->target is used directly.  For a
 * traceless reply we fall back to: the created ino (if present in the
 * extra bufferlist and already in our inode_map), then a lookup by name,
 * then a forced getattr on the request inode.  If the create reported one
 * ino but the follow-up lookup found another, -EINTR is returned.
 *
 * @param r        result code from the reply (passed through/overridden)
 * @param request  the completed request
 * @param reply    the MDS reply (extra_bl may hold the created ino)
 * @param ptarget  [out] resolved target inode
 * @param pcreated [out, optional] whether this request did the create
 * @param perms    credentials for any fallback lookup/getattr
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1643
1644
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * Called with client_lock held (caller_cond waits release it while
 * blocked).  The retry loop re-chooses the target MDS after every wakeup
 * until a reply arrives or the request is aborted.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests are excluded from oldest_tid tracking because they
  // can legitimately stay outstanding indefinitely
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists; drop the stale fragmap entry
	// (or just retry with a random mds) and re-route
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // only possible via abort; clean up the registered request
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1808
1809void Client::unregister_request(MetaRequest *req)
1810{
1811 mds_requests.erase(req->tid);
1812 if (req->tid == oldest_tid) {
1813 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1814 while (true) {
1815 if (p == mds_requests.end()) {
1816 oldest_tid = 0;
1817 break;
1818 }
1819 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1820 oldest_tid = p->first;
1821 break;
1822 }
1823 ++p;
1824 }
1825 }
1826 put_request(req);
1827}
1828
/*
 * Drop a reference on a MetaRequest; on the last reference, delete it and
 * try to trim the "other" inode for successful namespace-removal ops
 * (rmdir/rename/rmsnap), since it may no longer be reachable.
 */
void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    // only consider trimming when the op actually succeeded
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
	(op == CEPH_MDS_OP_RMDIR ||
	 op == CEPH_MDS_OP_RENAME ||
	 op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}
1847
1848int Client::encode_inode_release(Inode *in, MetaRequest *req,
1849 mds_rank_t mds, int drop,
1850 int unless, int force)
1851{
11fdf7f2 1852 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
7c673cae
FG
1853 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1854 << ", have:" << ", force:" << force << ")" << dendl;
1855 int released = 0;
11fdf7f2
TL
1856 auto it = in->caps.find(mds);
1857 if (it != in->caps.end()) {
1858 Cap &cap = it->second;
7c673cae 1859 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1860 if ((drop & cap.issued) &&
1861 !(unless & cap.issued)) {
1862 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
1863 cap.issued &= ~drop;
1864 cap.implemented &= ~drop;
7c673cae 1865 released = 1;
11fdf7f2 1866 ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
7c673cae
FG
1867 } else {
1868 released = force;
1869 }
1870 if (released) {
1871 ceph_mds_request_release rel;
1872 rel.ino = in->ino;
11fdf7f2
TL
1873 rel.cap_id = cap.cap_id;
1874 rel.seq = cap.seq;
1875 rel.issue_seq = cap.issue_seq;
1876 rel.mseq = cap.mseq;
1877 rel.caps = cap.implemented;
1878 rel.wanted = cap.wanted;
7c673cae
FG
1879 rel.dname_len = 0;
1880 rel.dname_seq = 0;
1881 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1882 }
1883 }
11fdf7f2 1884 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
1885 << released << dendl;
1886 return released;
1887}
1888
/*
 * Encode a dentry-lease release for a request.
 *
 * First releases the parent directory inode's caps (forced, so a release
 * record is always appended); if our dentry lease came from the same MDS,
 * the release record just appended is extended with the dentry name and
 * lease seq so the MDS revokes the lease too.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // piggy-back the dentry info onto the inode release appended above
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
1908
1909
/*
 * Encode all cap/dentry releases a request has been configured to carry.
 *
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
	   << req << ", mds: " << mds << ")" << dendl;
  // inode caps: primary, old (rename source parent), and "other" inode
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  // dentry leases: primary and old (rename source) dentry
  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
	   << req << ", mds " << mds <<dendl;
}
1946
1947bool Client::have_open_session(mds_rank_t mds)
1948{
11fdf7f2
TL
1949 const auto &it = mds_sessions.find(mds);
1950 return it != mds_sessions.end() &&
1951 (it->second.state == MetaSession::STATE_OPEN ||
1952 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1953}
1954
1955MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1956{
11fdf7f2
TL
1957 const auto &it = mds_sessions.find(mds);
1958 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1959 return NULL;
11fdf7f2
TL
1960 } else {
1961 return &it->second;
1962 }
7c673cae
FG
1963}
1964
1965MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1966{
11fdf7f2
TL
1967 auto it = mds_sessions.find(mds);
1968 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1969}
1970
/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname. Call this once at initialization.
 *
 * Fills in hostname, pid, entity id, mount root, ceph version/sha1, and
 * any user-supplied "k=v" pairs from the client_metadata config option
 * (which may override the built-in fields).
 */
void Client::populate_metadata(const std::string &mount_root)
{
  // Hostname
  struct utsname u;
  int r = uname(&u);
  if (r >= 0) {
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
  } else {
    // non-fatal: the MDS simply won't see a hostname for this client
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  }

  metadata["pid"] = stringify(getpid());

  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();

  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  }

  // Ceph version
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();

  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    // NOTE(review): `eqpos == i.size()` can never be true (find() returns an
    // index < size() or npos); presumably `i.size() - 1` (reject empty
    // values) was intended — confirm before changing, as fixing it would
    // start rejecting "key=" pairs.
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
      continue;
    }
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
  }
}
2014
2015/**
2016 * Optionally add or override client metadata fields.
2017 */
2018void Client::update_metadata(std::string const &k, std::string const &v)
2019{
11fdf7f2
TL
2020 std::lock_guard l(client_lock);
2021 ceph_assert(initialized);
7c673cae 2022
11fdf7f2
TL
2023 auto it = metadata.find(k);
2024 if (it != metadata.end()) {
7c673cae 2025 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2026 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2027 }
2028
2029 metadata[k] = v;
2030}
2031
/**
 * Create a session object for the given MDS rank and ask the MDS to open it.
 *
 * The session entry must not already exist (asserted).  If this MDS instance
 * previously REJECTed us at the same address, the open request is skipped and
 * the (unopened) session is returned as-is; callers then see it never reach
 * STATE_OPEN.  A reject from an *older* instance at a different address is
 * forgotten and the open is retried.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // Construct the MetaSession in place, already bound to a fresh connection.
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  // Send the open request along with our metadata and supported features.
  auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2062
2063void Client::_close_mds_session(MetaSession *s)
2064{
11fdf7f2 2065 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2066 s->state = MetaSession::STATE_CLOSING;
11fdf7f2 2067 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2068}
2069
/**
 * Tear down a session that is now closed: drop the connection, release all
 * caps held through it, fail/kick its pending requests, wake all waiters,
 * and remove it from mds_sessions.
 *
 * NOTE: the final erase destroys *s — callers must not touch the session
 * pointer after this returns.
 */
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
}
2081
/**
 * Handle a session-control message from an MDS.
 *
 * Dispatches on the session op: OPEN (feature check, then mark the session
 * open and renew caps), CLOSE, RENEWCAPS ack, STALE, RECALL_STATE,
 * FLUSHMSG, FORCE_RO and REJECT.  Messages arriving on a connection with no
 * matching session are dropped.
 */
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse the session if the MDS lacks features we require, and
      // remember the rejection so we don't keep retrying this instance.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	rejected_by_mds[session->mds_num] = session->addrs;
	_close_mds_session(session);
	_closed_mds_session(session);
	break;
      }
      // NOTE(review): m is a const ref here, so this std::move degrades to a
      // copy of supported_features — harmless, but not an actual move.
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.Signal();     // an unmount is waiting for session state changes
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only accept the ack for the renew request we most recently sent.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // Expire the cap TTL immediately (now - 1) and try to renew.
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // The inner 'm' intentionally shadows the message: it is the pending
    // cap-release batch, consumed (moved out) if present.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      // Remember the rejecting instance so _open_mds_session() won't retry it.
      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2175
2176bool Client::_any_stale_sessions() const
2177{
11fdf7f2 2178 ceph_assert(client_lock.is_locked_by_me());
7c673cae 2179
11fdf7f2
TL
2180 for (const auto &p : mds_sessions) {
2181 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2182 return true;
2183 }
2184 }
2185
2186 return false;
2187}
2188
/**
 * Close every session currently in STATE_STALE so it can be re-established.
 */
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    // Advance before possibly closing: _closed_mds_session() erases the
    // current entry from mds_sessions, which would invalidate 'it'.
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2200
/**
 * Build and transmit the wire message for a MetaRequest on the given session.
 *
 * Replayed (already-unsafe) requests are marked as such and carry the target
 * ino; fresh requests get cap releases encoded (optionally dropped when a
 * cap reconnect hasn't been sent yet, per @p drop_cap_releases).  Records the
 * send timestamp on first send and tracks the request on the session.
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // This is a replay of an op the MDS already acked unsafely.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may change the file layout; pin the osdmap epoch we saw.
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // First send of this request: remember when it went out (for timeouts/stats).
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Record the cap migration seq we held when sending, used later to decide
  // whether an ESTALE reply is worth retrying.
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2249
/**
 * Build an MClientRequest wire message from a MetaRequest.
 *
 * Copies the protocol head, lazily derives the filepath from the request's
 * inode/dentry when none was set, attaches payload data and the caller's
 * gid list, and bumps the request's retry counter.
 */
MClientRequest::ref Client::build_client_request(MetaRequest *request)
{
  auto req = MClientRequest::create(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// Dentry without an inode: path of the parent dir plus the entry name.
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // Post-increment: send the current attempt number, then count this attempt
  // (the MDS uses it to recognize retransmits).
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2287
2288
2289
/**
 * Handle a "request forwarded" notice from an MDS.
 *
 * The MDS did not process the request itself; re-target the pending request
 * at the destination rank and wake the caller thread so it resends.  Notices
 * for unknown sessions or tids are ignored.
 */
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
	   << " fwd " << fwd->get_num_fwd()
	   << " to mds." << fwd->get_dest_mds()
	   << ", resending to " << fwd->get_dest_mds()
	   << dendl;

  // Detach from the old session and mark the request unsent; the caller
  // thread (woken below) picks resend_mds as the new target.
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();
}
2324
2325bool Client::is_dir_operation(MetaRequest *req)
2326{
2327 int op = req->get_op();
2328 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2329 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2330 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2331 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2332 return true;
2333 return false;
2334}
2335
11fdf7f2 2336void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2337{
2338 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2339 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2340 if (!session) {
7c673cae
FG
2341 return;
2342 }
2343
2344 ceph_tid_t tid = reply->get_tid();
2345 bool is_safe = reply->is_safe();
2346
2347 if (mds_requests.count(tid) == 0) {
11fdf7f2 2348 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2349 << " safe is:" << is_safe << dendl;
7c673cae
FG
2350 return;
2351 }
2352 MetaRequest *request = mds_requests.at(tid);
2353
11fdf7f2 2354 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2355 << " tid " << tid << dendl;
2356
2357 if (request->got_unsafe && !is_safe) {
2358 //duplicate response
2359 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2360 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2361 return;
2362 }
2363
2364 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2365 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2366 << " from mds." << request->mds << dendl;
2367 request->send_to_auth = true;
2368 request->resend_mds = choose_target_mds(request);
2369 Inode *in = request->inode();
11fdf7f2 2370 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2371 if (request->resend_mds >= 0 &&
2372 request->resend_mds == request->mds &&
2373 (in == NULL ||
11fdf7f2
TL
2374 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2375 request->sent_on_mseq == it->second.mseq)) {
2376 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae
FG
2377 } else {
2378 request->caller_cond->Signal();
7c673cae
FG
2379 return;
2380 }
7c673cae
FG
2381 }
2382
11fdf7f2 2383 ceph_assert(!request->reply);
7c673cae
FG
2384 request->reply = reply;
2385 insert_trace(request, session);
2386
2387 // Handle unsafe reply
2388 if (!is_safe) {
2389 request->got_unsafe = true;
2390 session->unsafe_requests.push_back(&request->unsafe_item);
2391 if (is_dir_operation(request)) {
2392 Inode *dir = request->inode();
11fdf7f2 2393 ceph_assert(dir);
7c673cae
FG
2394 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2395 }
2396 if (request->target) {
2397 InodeRef &in = request->target;
2398 in->unsafe_ops.push_back(&request->unsafe_target_item);
2399 }
2400 }
2401
2402 // Only signal the caller once (on the first reply):
2403 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2404 if (!is_safe || !request->got_unsafe) {
2405 Cond cond;
2406 request->dispatch_cond = &cond;
2407
2408 // wake up waiter
11fdf7f2 2409 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
7c673cae
FG
2410 request->caller_cond->Signal();
2411
2412 // wake for kick back
2413 while (request->dispatch_cond) {
11fdf7f2 2414 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
7c673cae
FG
2415 cond.Wait(client_lock);
2416 }
2417 }
2418
2419 if (is_safe) {
2420 // the filesystem change is committed to disk
2421 // we're done, clean up
2422 if (request->got_unsafe) {
2423 request->unsafe_item.remove_myself();
2424 request->unsafe_dir_item.remove_myself();
2425 request->unsafe_target_item.remove_myself();
2426 signal_cond_list(request->waitfor_safe);
2427 }
2428 request->item.remove_myself();
2429 unregister_request(request);
2430 }
2431 if (unmounting)
2432 mount_cond.Signal();
2433}
2434
2435void Client::_handle_full_flag(int64_t pool)
2436{
2437 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2438 << "on " << pool << dendl;
2439 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2440 // to do this rather than blocking, because otherwise when we fill up we
2441 // potentially lock caps forever on files with dirty pages, and we need
2442 // to be able to release those caps to the MDS so that it can delete files
2443 // and free up space.
2444 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2445
2446 // For all inodes with layouts in this pool and a pending flush write op
2447 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2448 // from ObjectCacher so that it doesn't re-issue the write in response to
2449 // the ENOSPC error.
2450 // Fortunately since we're cancelling everything in a given pool, we don't
2451 // need to know which ops belong to which ObjectSet, we can just blow all
2452 // the un-flushed cached data away and mark any dirty inodes' async_err
2453 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2454 // affecting this pool, and all the objectsets we're purging were also
2455 // in this pool.
2456 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2457 i != inode_map.end(); ++i)
2458 {
2459 Inode *inode = i->second;
2460 if (inode->oset.dirty_or_tx
2461 && (pool == -1 || inode->layout.pool_id == pool)) {
2462 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2463 << " has dirty objects, purging and setting ENOSPC" << dendl;
2464 objectcacher->purge_set(&inode->oset);
2465 inode->set_async_err(-ENOSPC);
2466 }
2467 }
2468
2469 if (cancelled_epoch != (epoch_t)-1) {
2470 set_cap_epoch_barrier(cancelled_epoch);
2471 }
2472}
2473
/**
 * Handle a new OSDMap.
 *
 * Three concerns: (1) detect whether this client has just been blacklisted
 * (failing all MDS sessions and cancelling OSD writes if so) or has stopped
 * being blacklisted; (2) keep subscribing to osdmaps while blacklisted;
 * (3) react to cluster-wide or per-pool FULL flags via _handle_full_flag().
 */
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
    });
  if (!blacklisted) {
    // Check each of our own addresses against the new blacklist entries.
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
	});
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    // All MDS sessions are doomed too: fail pending metadata ops promptly.
    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: -1 means "all pools".
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2557
2558
2559// ------------------------
2560// incoming messages
2561
2562
11fdf7f2 2563bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2564{
11fdf7f2 2565 std::lock_guard l(client_lock);
7c673cae
FG
2566 if (!initialized) {
2567 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2568 return true;
2569 }
2570
2571 switch (m->get_type()) {
2572 // mounting and mds sessions
2573 case CEPH_MSG_MDS_MAP:
11fdf7f2 2574 handle_mds_map(MMDSMap::msgref_cast(m));
7c673cae
FG
2575 break;
2576 case CEPH_MSG_FS_MAP:
11fdf7f2 2577 handle_fs_map(MFSMap::msgref_cast(m));
7c673cae
FG
2578 break;
2579 case CEPH_MSG_FS_MAP_USER:
11fdf7f2 2580 handle_fs_map_user(MFSMapUser::msgref_cast(m));
7c673cae
FG
2581 break;
2582 case CEPH_MSG_CLIENT_SESSION:
11fdf7f2 2583 handle_client_session(MClientSession::msgref_cast(m));
7c673cae
FG
2584 break;
2585
2586 case CEPH_MSG_OSD_MAP:
11fdf7f2 2587 handle_osd_map(MOSDMap::msgref_cast(m));
7c673cae
FG
2588 break;
2589
2590 // requests
2591 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
11fdf7f2 2592 handle_client_request_forward(MClientRequestForward::msgref_cast(m));
7c673cae
FG
2593 break;
2594 case CEPH_MSG_CLIENT_REPLY:
11fdf7f2
TL
2595 handle_client_reply(MClientReply::msgref_cast(m));
2596 break;
2597
2598 // reclaim reply
2599 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2600 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
7c673cae
FG
2601 break;
2602
2603 case CEPH_MSG_CLIENT_SNAP:
11fdf7f2 2604 handle_snap(MClientSnap::msgref_cast(m));
7c673cae
FG
2605 break;
2606 case CEPH_MSG_CLIENT_CAPS:
11fdf7f2 2607 handle_caps(MClientCaps::msgref_cast(m));
7c673cae
FG
2608 break;
2609 case CEPH_MSG_CLIENT_LEASE:
11fdf7f2 2610 handle_lease(MClientLease::msgref_cast(m));
7c673cae
FG
2611 break;
2612 case MSG_COMMAND_REPLY:
2613 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
11fdf7f2 2614 handle_command_reply(MCommandReply::msgref_cast(m));
7c673cae
FG
2615 } else {
2616 return false;
2617 }
2618 break;
2619 case CEPH_MSG_CLIENT_QUOTA:
11fdf7f2 2620 handle_quota(MClientQuota::msgref_cast(m));
7c673cae
FG
2621 break;
2622
2623 default:
2624 return false;
2625 }
2626
2627 // unmounting?
2628 if (unmounting) {
2629 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2630 << "+" << inode_map.size() << dendl;
2631 long unsigned size = lru.lru_get_size() + inode_map.size();
2632 trim_cache();
2633 if (size < lru.lru_get_size() + inode_map.size()) {
2634 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2635 mount_cond.Signal();
2636 } else {
2637 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2638 << "+" << inode_map.size() << dendl;
2639 }
2640 }
2641
2642 return true;
2643}
2644
11fdf7f2 2645void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2646{
2647 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2648
2649 signal_cond_list(waiting_for_fsmap);
2650
2651 monclient->sub_got("fsmap", fsmap->get_epoch());
2652}
2653
11fdf7f2 2654void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2655{
2656 fsmap_user.reset(new FSMapUser);
2657 *fsmap_user = m->get_fsmap();
7c673cae
FG
2658
2659 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2660 signal_cond_list(waiting_for_fsmap);
2661}
2662
/**
 * Handle a new MDSMap.
 *
 * Ignores stale epochs, swaps in the new map, cancels admin commands aimed
 * at MDS GIDs that vanished or went laggy, then walks every session and
 * reacts to its rank's state transition (down, address/incarnation change,
 * reconnect, active, null).  Finally wakes mdsmap waiters and acks the
 * subscription epoch.
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // Keep the previous map so per-rank state transitions can be computed.
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // Erase after the scan: erasing while iterating commands would invalidate
  // the iterator above.
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // Advance before handling: _closed_mds_session() erases the entry.
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // The rank moved to a new address; a new incarnation means the old
      // daemon is gone entirely, so treat its previous state as NULL.
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
		      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      // The MDS is already past reconnect; if we never saw the reconnect
      // window, our session is unrecoverable — close it.
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // Rank removed by shrinking max_mds: the session is gone for good.
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2764
/**
 * Send our cap/snaprealm state to an MDS that entered reconnect.
 *
 * Trims unneeded caps first (shorter MDS rejoin), resets per-session state,
 * replays unsafe requests, then describes every cap we hold from this rank.
 * When the MDS supports CEPHFS_FEATURE_MULTI_RECONNECT the payload may be
 * split across several messages to bound message size.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Drop any queued (now stale) cap-release batch.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // If the message is getting huge and the MDS can take a split
      // reconnect, flush what we have and start a fresh message.
      if (allow_multi &&
	  m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // The new MDS instance starts sequence numbering from scratch.
      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	cap.gen = session->cap_gen;
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2857
2858
/**
 * Resend new (never previously attempted) requests targeted at this
 * session's MDS, and wake callers of aborted requests so they can bail out.
 * Requests that already got an unsafe reply are left for replay via
 * resend_unsafe_requests().
 */
void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      if (req->caller_cond) {
	// Wake the waiting caller; it observes the aborted state and returns.
	req->kick = true;
	req->caller_cond->Signal();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}
2882
2883void Client::resend_unsafe_requests(MetaSession *session)
2884{
2885 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2886 !iter.end();
2887 ++iter)
2888 send_request(*iter, session);
2889
2890 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2891 // process completed requests in clientreplay stage.
2892 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2893 p != mds_requests.end();
2894 ++p) {
2895 MetaRequest *req = p->second;
2896 if (req->got_unsafe)
2897 continue;
31f18b77
FG
2898 if (req->aborted())
2899 continue;
7c673cae
FG
2900 if (req->retry_attempt == 0)
2901 continue; // old requests only
2902 if (req->mds == session->mds_num)
2903 send_request(req, session, true);
2904 }
2905}
2906
// Block until all currently-unsafe requests have been committed by the
// MDSes. Only the last unsafe request per session is waited on; the
// preceding ones are presumably committed in order by the MDS — the
// per-session unsafe_requests list is ordered.
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (const auto &p : mds_sessions) {
    const MetaSession &s = p.second;
    if (!s.unsafe_requests.empty()) {
      MetaRequest *req = s.unsafe_requests.back();
      // take a ref so the request outlives the wait below
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    // still on the unsafe list? wait until it is marked safe
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2928
2929void Client::kick_requests_closed(MetaSession *session)
2930{
11fdf7f2 2931 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2932 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2933 p != mds_requests.end(); ) {
2934 MetaRequest *req = p->second;
2935 ++p;
2936 if (req->mds == session->mds_num) {
2937 if (req->caller_cond) {
2938 req->kick = true;
2939 req->caller_cond->Signal();
2940 }
2941 req->item.remove_myself();
2942 if (req->got_unsafe) {
11fdf7f2 2943 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 2944 req->unsafe_item.remove_myself();
eafe8130
TL
2945 if (is_dir_operation(req)) {
2946 Inode *dir = req->inode();
2947 assert(dir);
2948 dir->set_async_err(-EIO);
2949 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2950 << dir->ino << " " << req->get_tid() << dendl;
2951 req->unsafe_dir_item.remove_myself();
2952 }
2953 if (req->target) {
2954 InodeRef &in = req->target;
2955 in->set_async_err(-EIO);
2956 lderr(cct) << "kick_requests_closed drop req of inode : "
2957 << in->ino << " " << req->get_tid() << dendl;
2958 req->unsafe_target_item.remove_myself();
2959 }
7c673cae
FG
2960 signal_cond_list(req->waitfor_safe);
2961 unregister_request(req);
2962 }
2963 }
2964 }
11fdf7f2
TL
2965 ceph_assert(session->requests.empty());
2966 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
2967}
2968
2969
2970
2971
2972/************
2973 * leases
2974 */
2975
2976void Client::got_mds_push(MetaSession *s)
2977{
2978 s->seq++;
2979 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2980 if (s->state == MetaSession::STATE_CLOSING) {
11fdf7f2 2981 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2982 }
2983}
2984
// Handle a dentry-lease revoke from an MDS: invalidate the local lease
// (if we still hold the dentry) and always reply with a LEASE_RELEASE.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // revoke is the only lease action the MDS sends to clients here
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // no matching session (e.g. stale connection); ignore
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1; // lease no longer valid
  }

 revoke:
  // acknowledge the revoke even if we no longer hold the inode/dentry
  {
    auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3025
// Drop n references on an inode. When the last reference goes away,
// release caps and cached objects, unhook it from the client's maps,
// and delete the in-memory Inode.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // any dirty data should have been flushed before the last ref dropped
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // tearing down the root: clear the cached ancestry as well
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3051
// Tear down an (empty) Dir object and drop the pins it held on its
// parent inode's dentry and on the inode itself.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
3066
 /**
  * Link an inode into a directory under the given name, creating a new
  * Dentry if needed.
  *
  * Don't call this with in==NULL, use get_or_create for that
  * leave dn set to default NULL unless you're trying to add
  * a new inode to a pre-created Dentry
  */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry: it must not already point at an inode
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent directory's contents changed
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3107
// Detach a dentry from its inode and (optionally) remove the dentry
// from its directory. keepdir keeps the now-empty Dir alive; keepdentry
// keeps the dentry itself (as a null dentry with its lease cleared).
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a ref so the inode survives until we're done logging/detaching
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the dentry but invalidate its lease
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // the dir may now be empty and unneeded
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3137
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // keeps the inode alive until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // must run under the client lock; we touch inode state
    ceph_assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // record the error so a later fsync/close can report it
      inode->set_async_err(r);
    }
  }
};
3159
3160
3161/****
3162 * caps
3163 */
3164
3165void Client::get_cap_ref(Inode *in, int cap)
3166{
3167 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3168 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3169 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3170 in->get();
3171 }
3172 if ((cap & CEPH_CAP_FILE_CACHE) &&
3173 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3174 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3175 in->get();
3176 }
3177 in->get_cap_ref(cap);
3178}
3179
// Drop a reference on the given cap bits. When the last ref on a bit is
// dropped, finish any pending cap_snap work, wake waiters, re-check caps
// for bits no longer issued, and unpin the inode for BUFFER/CACHE refs.
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits whose last ref just dropped and which the MDS no longer issues
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// all buffered data is now flushed; clear per-snap dirty markers
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);  // matching unpin(s) from get_cap_ref()
  }
}
3213
// Acquire cap references for an I/O: block until the needed caps are
// issued (and not being revoked, for the wanted bits). On success,
// *phave holds the granted bits and a cap ref is taken. Returns 0, or a
// negative errno (-EBADF if the file mode no longer wants the caps,
// -EROFS for writes on a read-only session, etc.).
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // open file modes no longer cover what this op needs
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // write past max_size (or far past current size): ask the MDS to
      // grow max_size before we can proceed
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// a pending cap_snap must be settled before new writes
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		       << " need " << ccap_string(need) << " want " << ccap_string(want)
		       << " revoking " << ccap_string(revoking)
		       << dendl;
	// only take the refs if none of the *wanted* bits are mid-revoke
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // a previous cap release raced with us; make sure the MDS knows
      // what we want before blocking
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3306
3307int Client::get_caps_used(Inode *in)
3308{
3309 unsigned used = in->caps_used();
3310 if (!(used & CEPH_CAP_FILE_CACHE) &&
3311 !objectcacher->set_is_empty(&in->oset))
3312 used |= CEPH_CAP_FILE_CACHE;
3313 return used;
3314}
3315
3316void Client::cap_delay_requeue(Inode *in)
3317{
11fdf7f2 3318 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3319 in->hold_caps_until = ceph_clock_now();
3320 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3321 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3322}
3323
// Build and send a CEPH_CAP_OP_UPDATE message for one cap: report what
// we use/want/flush, drop bits not in 'retain', and piggyback inode
// metadata (size, times, xattrs, dirty state) for the MDS.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      int flags, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = MClientCaps::create(op,
			       in->ino,
			       0,
			       cap->cap_id, cap->seq,
			       cap->implemented,
			       want,
			       flush,
			       cap->mseq,
                               cap_epoch_barrier);
  // attribute dirty state to the original dirtier, not the flusher
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS if a cap_snap is still waiting to be flushed
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS manages max_size
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3436
31f18b77
FG
3437static bool is_max_size_approaching(Inode *in)
3438{
3439 /* mds will adjust max size according to the reported size */
3440 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3441 return false;
3442 if (in->size >= in->max_size)
3443 return true;
3444 /* half of previous max_size increment has been used */
3445 if (in->max_size > in->reported_size &&
3446 (in->size << 1) >= in->max_size + in->reported_size)
3447 return true;
3448 return false;
3449}
7c673cae 3450
11fdf7f2
TL
3451static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3452{
3453 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3454 return used;
3455 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3456 return used;
3457
3458 if (issued & CEPH_CAP_FILE_LAZYIO) {
3459 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3460 used &= ~CEPH_CAP_FILE_CACHE;
3461 used |= CEPH_CAP_FILE_LAZYIO;
3462 }
3463 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3464 used &= ~CEPH_CAP_FILE_BUFFER;
3465 used |= CEPH_CAP_FILE_LAZYIO;
3466 }
3467 } else {
3468 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3469 used &= ~CEPH_CAP_FILE_CACHE;
3470 used |= CEPH_CAP_FILE_LAZYIO;
3471 }
3472 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3473 used &= ~CEPH_CAP_FILE_BUFFER;
3474 used |= CEPH_CAP_FILE_LAZYIO;
3475 }
3476 }
3477 return used;
3478}
3479
7c673cae
FG
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // decide which cap bits are worth keeping around
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to satisfy a CACHE/LAZYIO revoke by invalidating our cache now
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    // usage covered by the auth cap doesn't count against replica caps
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // during unmount, release idle caps promptly
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-drive any stalled flushes before sending the update
      if (in->flags & I_KICK_FLUSH) {
	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		       << " to mds." << mds << dendl;
	kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.flush_tid == 0)
	flush_snaps(in);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
	     flushing, flush_tid);
  }
}
3625
3626
// Capture the inode's dirty state for the (now old) snap context so it
// can be flushed to the MDS as a cap_snap. No-op if there is nothing
// dirty, or if a cap_snap is already pending writes.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the pending cap_snap will capture the current state when its
    // writes complete
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    // snapshot the metadata that must be flushed with this snap
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // a write is in flight; finish the snap when the WR ref drops
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3668
// Finalize a cap_snap: record the inode's size/times, then flush it to
// the MDS now unless buffered data still has to be written back first.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; _flushed_cap_snap() will flush the snap
    // once they are written back
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3699
3700void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3701{
11fdf7f2 3702 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
7c673cae
FG
3703 in->cap_snaps.at(seq).dirty_data = 0;
3704 flush_snaps(in);
3705}
3706
eafe8130
TL
3707void Client::send_flush_snap(Inode *in, MetaSession *session,
3708 snapid_t follows, CapSnap& capsnap)
3709{
3710 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP,
3711 in->ino, in->snaprealm->ino, 0,
3712 in->auth_cap->mseq, cap_epoch_barrier);
3713 m->caller_uid = capsnap.cap_dirtier_uid;
3714 m->caller_gid = capsnap.cap_dirtier_gid;
3715
3716 m->set_client_tid(capsnap.flush_tid);
3717 m->head.snap_follows = follows;
3718
3719 m->head.caps = capsnap.issued;
3720 m->head.dirty = capsnap.dirty;
3721
3722 m->head.uid = capsnap.uid;
3723 m->head.gid = capsnap.gid;
3724 m->head.mode = capsnap.mode;
3725 m->btime = capsnap.btime;
3726
3727 m->size = capsnap.size;
3728
3729 m->head.xattr_version = capsnap.xattr_version;
3730 encode(capsnap.xattrs, m->xattrbl);
3731
3732 m->ctime = capsnap.ctime;
3733 m->btime = capsnap.btime;
3734 m->mtime = capsnap.mtime;
3735 m->atime = capsnap.atime;
3736 m->time_warp_seq = capsnap.time_warp_seq;
3737 m->change_attr = capsnap.change_attr;
3738
3739 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3740 m->inline_version = in->inline_version;
3741 m->inline_data = in->inline_data;
3742 }
3743
3744 ceph_assert(!session->flushing_caps_tids.empty());
3745 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3746
3747 session->con->send_message2(std::move(m));
3748}
3749
// Flush any cap_snaps that have not been sent yet to the auth MDS,
// assigning flush tids and registering them on the session. Stops at the
// first snap that still has dirty data or in-flight writes, since snaps
// must be flushed in order.
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // this snap (and therefore all later ones) is not ready yet
    if (capsnap.dirty_data || capsnap.writing)
      break;

    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3784
7c673cae
FG
// Park the calling thread on a waiter list until signal_cond_list()
// wakes it. client_lock is released while waiting and re-taken before
// returning (Cond::Wait semantics).
void Client::wait_on_list(list<Cond*>& ls)
{
  Cond cond;
  ls.push_back(&cond);
  cond.Wait(client_lock);
  ls.remove(&cond);
}
3792
3793void Client::signal_cond_list(list<Cond*>& ls)
3794{
3795 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3796 (*it)->Signal();
3797}
3798
// Block until signal_context_list() completes the C_Cond we enqueue
// here. client_lock is released while waiting.
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value itself is unused here
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3808
3809void Client::signal_context_list(list<Context*>& ls)
3810{
3811 while (!ls.empty()) {
3812 ls.front()->complete(0);
3813 ls.pop_front();
3814 }
3815}
3816
// Wake all cap waiters on a session. During reconnect, also reset
// max_size bookkeeping (it must be re-requested from the new MDS);
// otherwise, downgrade caps the MDS failed to re-issue after a stale
// session.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3836
3837
// flush dirty data (from objectcache)

// Completion that forwards a cache-invalidation range to the client's
// registered ino_invalidate_cb, outside the client lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // capture the externally-visible ino now; the Inode may be gone by
    // the time finish() runs
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3859
// Invoke the registered cache-invalidation callback for a range of an
// inode; skipped entirely once unmount has begun.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3867
// Queue an asynchronous invalidation of [off, off+len) for this inode,
// if an invalidation callback is registered.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3874
// Drop the entire userspace object-cacher set for 'in', then schedule the
// external invalidation upcall with (0, 0) for the whole file.
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set only drops clean buffers; anything left is a bug worth logging
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
3888
// Range variant: discard cached buffers (including in-flight writeback)
// covering [off, off+len) of 'in', then schedule the upcall for the range.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    // map the file range onto object extents before telling the cacher
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3902
3903bool Client::_release(Inode *in)
3904{
3905 ldout(cct, 20) << "_release " << *in << dendl;
3906 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3907 _invalidate_inode_cache(in);
3908 return true;
3909 }
3910 return false;
3911}
3912
// Start writeback of all dirty data for 'in'. Returns true when the flush
// was resolved immediately (nothing dirty, or pool full); in that case
// 'onfinish' has already been completed. Otherwise writeback is in flight
// and the return value comes from ObjectCacher::flush_set (presumably
// false while a flush is pending — confirm against ObjectCacher).
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    // Pool is full: writeback would stall indefinitely, so drop the dirty
    // data and report ENOSPC instead.
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3934
// Synchronously flush dirty buffers in [offset, offset+size) for 'in'.
// Must be entered with client_lock held; the lock is dropped while
// waiting for the writeback to complete and re-taken before returning.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3953
// ObjectCacher callback: an inode's whole ObjectSet finished writeback.
// Recovers the owning Inode from the set's parent pointer and releases
// the cap refs held during writeback via _flushed().
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  // std::lock_guard l(client_lock);
  ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3962
// All dirty data for 'in' has reached the OSDs; release the CACHE|BUFFER
// cap references that were pinned while buffers were outstanding.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3969
3970
3971
3972// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3973void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3974{
3975 unsigned had = in->caps_issued();
3976
3977 if ((issued & CEPH_CAP_FILE_CACHE) &&
3978 !(had & CEPH_CAP_FILE_CACHE))
3979 in->cache_gen++;
3980
3981 if ((issued & CEPH_CAP_FILE_SHARED) &&
3982 !(had & CEPH_CAP_FILE_SHARED)) {
3983 in->shared_gen++;
3984
3985 if (in->is_dir())
3986 clear_dir_complete_and_ordered(in, true);
3987 }
3988}
3989
// Install or refresh the cap for 'in' issued by 'mds_session'.
// Keeps the inode's snaprealm membership in sync (first cap opens the
// realm; an auth cap can move the inode to a different realm), reconciles
// out-of-order messages around auth-MDS migration, updates auth_cap
// selection, and finally applies the new issued/wanted/seq state.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: join the snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode belongs to a different realm: move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  // emplace: either creates a fresh Cap for this mds or finds the existing one
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // pre-existing cap; if it is stale (older session gen), reduce to PIN
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth_cap only if the new auth has a newer migration seq
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // newer migration seq replaces 'wanted'; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
           << " from mds." << mds
           << " on " << *in
           << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // anyone waiting for these newly granted bits can proceed
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4084
// Remove 'cap' from its inode and session. When 'queue_release' is set,
// a cap-release is queued so the MDS learns we dropped it. If this was
// the last cap on the inode, the inode also leaves its snap realm.
// Note: 'cap' is invalid after the erase below.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // losing the auth cap: detach from the session's flushing list too
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys the Cap object 'cap' points at
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4120
// Drop every cap held on 'in', queueing a release to each MDS.
// remove_cap() erases from in->caps, so re-fetch begin() each pass.
void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(&in->caps.begin()->second, true);
}
4126
// Tear down all caps belonging to session 's' (session closed/evicted).
// Dirty or flushing state that can no longer reach the MDS is discarded
// (and logged), waiters are woken, and anyone blocked in wait_sync_caps
// is signalled once the session's flush tids are cleared.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // hold the inode across remove_cap
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    // remember that we lost caps we had (or wanted), for later re-request
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // the dirty/flushing data is lost with the session; reset bookkeeping
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the reference the dirty state held on the inode
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4159
91327a77 4160int Client::_do_remount(bool retry_on_error)
b32b8144 4161{
11fdf7f2 4162 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4163
b32b8144
FG
4164 errno = 0;
4165 int r = remount_cb(callback_handle);
91327a77
AA
4166 if (r == 0) {
4167 retries_on_invalidate = 0;
4168 } else {
b32b8144
FG
4169 int e = errno;
4170 client_t whoami = get_nodeid();
4171 if (r == -1) {
4172 lderr(cct) <<
4173 "failed to remount (to trim kernel dentries): "
4174 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4175 } else {
4176 lderr(cct) <<
4177 "failed to remount (to trim kernel dentries): "
4178 "return code = " << r << dendl;
4179 }
91327a77 4180 bool should_abort =
11fdf7f2
TL
4181 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4182 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4183 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4184 if (should_abort && !unmounting) {
4185 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4186 ceph_abort();
4187 }
4188 }
4189 return r;
4190}
4191
7c673cae
FG
// Finisher context that triggers a remount; used to force the kernel to
// trim its dentry cache when per-dentry invalidation is unavailable.
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    // retry_on_error=true: tolerate transient remount failures
    client->_do_remount(true);
  }
};
4202
// Ask the kernel to forget cached dentries: either invalidate root's
// children one-by-one via the dentry callback, or (when dentries cannot
// be invalidated individually) fall back to queueing a remount.
void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
           p != root->dir->dentries.end();
           ++p) {
        // only invalidate dentries that actually point at an inode
        if (p->second->inode)
          _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4222
91327a77
AA
// If every dentry under directory 'in' is negative (points at no inode),
// unlink the expireable ones and close the Dir once it is empty.
// Recurses into the snapdir if one is open for this inode.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance first: unlink() below erases the current entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false); // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4247
// Trim the session's cap count down toward 'max' (MDS asked us to shrink).
// Disposable non-auth caps are released outright; otherwise we try to make
// the inode's caps releasable by expiring its dentries. Dentry trimming is
// deferred into 'to_trim' so that cap/iterator state is not invalidated
// mid-traversal. If we still exceed 'max' afterwards, ask the kernel to
// drop its dcache so more inodes become unreferenced.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // keep the inode alive while we work on it

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // stays true iff every dentry was expireable
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before the dentry can be queued/unlinked
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4315
4316void Client::force_session_readonly(MetaSession *s)
4317{
4318 s->readonly = true;
4319 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4320 auto &in = (*p)->inode;
4321 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4322 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4323 }
4324}
4325
7c673cae
FG
// Transition 'in's dirty caps into the flushing state under a fresh flush
// tid (returned via *ptid), registering the tid with the auth session so
// wait_sync_caps() can track it. Returns the cap bits now being flushed.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);  // caller must only invoke with dirty caps pending

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4353
4354void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4355{
4356 for (auto &p : in->cap_snaps) {
4357 CapSnap &capsnap = p.second;
4358 if (capsnap.flush_tid > 0) {
4359 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4360 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4361 }
4362 }
4363 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4364 it != in->flushing_cap_tids.end();
4365 ++it) {
4366 old_s->flushing_caps_tids.erase(it->first);
4367 new_s->flushing_caps_tids.insert(it->first);
4368 }
4369 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4370}
4371
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list ...
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    // if this is the very last inode overall, make the flush synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4404
7c673cae
FG
// Block until every cap flush on 'in' with tid <= 'want' has been acked
// by the MDS. Waits on the inode's cap waitlist; woken by flush acks.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // oldest outstanding flush is already newer than 'want'
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4418
// Block until no session has an outstanding cap-flush tid <= 'want'.
// Waiting drops client_lock (sessions/tids may change), so after each
// wakeup we restart the scan from the beginning.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
           << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
        continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;  // state may have changed while unlocked
    }
  }
}
4437
// Re-send all pending cap flushes (and interleaved capsnap flushes) for
// 'in' to its auth session, e.g. after reconnect. Entries in
// flushing_cap_tids with a zero value denote capsnap flushes; they appear
// in the same tid order as in->cap_snaps.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest snap-flush tid; ordinary flushes older than it must
  // carry FLAG_PENDING_CAPSNAP so the MDS keeps ordering correct
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // capsnap flush: consume the matching cap_snaps entry in order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4471
7c673cae
FG
4472void Client::kick_flushing_caps(MetaSession *session)
4473{
4474 mds_rank_t mds = session->mds_num;
11fdf7f2 4475 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4476
4477 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4478 Inode *in = *p;
eafe8130
TL
4479 if (in->flags & I_KICK_FLUSH) {
4480 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4481 kick_flushing_caps(in, session);
4482 }
7c673cae 4483 }
7c673cae
FG
4484}
4485
// Called before sending the reconnect message. Inodes whose flushing caps
// were all still issued are merely flagged (I_KICK_FLUSH) for the normal
// kick later; inodes whose flushing caps were revoked get their flushes
// re-sent NOW, with sequence numbers reset to match the upcoming
// reconnect message.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4513
7c673cae
FG
4514void SnapRealm::build_snap_context()
4515{
4516 set<snapid_t> snaps;
4517 snapid_t max_seq = seq;
4518
4519 // start with prior_parents?
4520 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4521 snaps.insert(prior_parent_snaps[i]);
4522
4523 // current parent's snaps
4524 if (pparent) {
4525 const SnapContext& psnapc = pparent->get_snap_context();
4526 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4527 if (psnapc.snaps[i] >= parent_since)
4528 snaps.insert(psnapc.snaps[i]);
4529 if (psnapc.seq > max_seq)
4530 max_seq = psnapc.seq;
4531 }
4532
4533 // my snaps
4534 for (unsigned i=0; i<my_snaps.size(); i++)
4535 snaps.insert(my_snaps[i]);
4536
4537 // ok!
4538 cached_snap_context.seq = max_seq;
4539 cached_snap_context.snaps.resize(0);
4540 cached_snap_context.snaps.reserve(snaps.size());
4541 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4542 cached_snap_context.snaps.push_back(*p);
4543}
4544
4545void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4546{
4547 list<SnapRealm*> q;
4548 q.push_back(realm);
4549
4550 while (!q.empty()) {
4551 realm = q.front();
4552 q.pop_front();
4553
11fdf7f2 4554 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4555 realm->invalidate_cache();
4556
4557 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4558 p != realm->pchildren.end();
4559 ++p)
4560 q.push_back(*p);
4561 }
4562}
4563
4564SnapRealm *Client::get_snap_realm(inodeno_t r)
4565{
4566 SnapRealm *realm = snap_realms[r];
4567 if (!realm)
4568 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4569 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4570 realm->nref++;
4571 return realm;
4572}
4573
4574SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4575{
4576 if (snap_realms.count(r) == 0) {
11fdf7f2 4577 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4578 return NULL;
4579 }
4580 SnapRealm *realm = snap_realms[r];
11fdf7f2 4581 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4582 realm->nref++;
4583 return realm;
4584}
4585
// Drop one reference on 'realm'. On the last ref it is unlinked from the
// global map, the ref it held on its parent is released (which may
// cascade up the tree), and the realm is destroyed.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4599
4600bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4601{
4602 if (realm->parent != parent) {
11fdf7f2 4603 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4604 << " " << realm->parent << " -> " << parent << dendl;
4605 realm->parent = parent;
4606 if (realm->pparent) {
4607 realm->pparent->pchildren.erase(realm);
4608 put_snap_realm(realm->pparent);
4609 }
4610 realm->pparent = get_snap_realm(parent);
4611 realm->pparent->pchildren.insert(realm);
4612 return true;
4613 }
4614 return false;
4615}
4616
4617static bool has_new_snaps(const SnapContext& old_snapc,
4618 const SnapContext& new_snapc)
4619{
4620 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4621}
4622
4623
11fdf7f2 4624void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4625{
4626 SnapRealm *first_realm = NULL;
11fdf7f2 4627 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4628
4629 map<SnapRealm*, SnapContext> dirty_realms;
4630
11fdf7f2 4631 auto p = bl.cbegin();
7c673cae
FG
4632 while (!p.end()) {
4633 SnapRealmInfo info;
11fdf7f2 4634 decode(info, p);
7c673cae
FG
4635 SnapRealm *realm = get_snap_realm(info.ino());
4636
4637 bool invalidate = false;
4638
4639 if (info.seq() > realm->seq) {
11fdf7f2 4640 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4641 << dendl;
4642
4643 if (flush) {
4644 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4645 // flush me + children
4646 list<SnapRealm*> q;
4647 q.push_back(realm);
4648 while (!q.empty()) {
4649 SnapRealm *realm = q.front();
4650 q.pop_front();
4651
4652 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4653 p != realm->pchildren.end();
4654 ++p)
4655 q.push_back(*p);
4656
4657 if (dirty_realms.count(realm) == 0) {
4658 realm->nref++;
4659 dirty_realms[realm] = realm->get_snap_context();
4660 }
4661 }
4662 }
4663
4664 // update
4665 realm->seq = info.seq();
4666 realm->created = info.created();
4667 realm->parent_since = info.parent_since();
4668 realm->prior_parent_snaps = info.prior_parent_snaps;
4669 realm->my_snaps = info.my_snaps;
4670 invalidate = true;
4671 }
4672
4673 // _always_ verify parent
4674 if (adjust_realm_parent(realm, info.parent()))
4675 invalidate = true;
4676
4677 if (invalidate) {
4678 invalidate_snaprealm_and_children(realm);
11fdf7f2 4679 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4680 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4681 } else {
11fdf7f2 4682 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4683 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4684 }
4685
4686 if (!first_realm)
4687 first_realm = realm;
4688 else
4689 put_snap_realm(realm);
4690 }
4691
4692 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4693 q != dirty_realms.end();
4694 ++q) {
4695 SnapRealm *realm = q->first;
4696 // if there are new snaps ?
4697 if (has_new_snaps(q->second, realm->get_snap_context())) {
4698 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4699 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4700 while (!r.end()) {
4701 Inode *in = *r;
4702 ++r;
4703 queue_cap_snap(in, q->second);
4704 }
4705 } else {
4706 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4707 }
4708 put_snap_realm(realm);
4709 }
4710
4711 if (realm_ret)
4712 *realm_ret = first_realm;
4713 else
4714 put_snap_realm(first_realm);
4715}
4716
// Handle an MClientSnap notification from an MDS. For a SPLIT op, the
// inodes and child realms listed in the message are moved into the newly
// created realm; affected inodes remember their old snap context so cap
// snaps can be queued if the new realm introduces snaps they missed.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {  // message from a session we no longer track; ignore
    return;
  }

  got_mds_push(session);

  map<Inode*, SnapContext> to_move;  // inode -> its pre-split snap context
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // don't move an inode that already migrated to a newer realm
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; don't flush old contexts on DESTROY
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-home the moved inodes into the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4787
11fdf7f2 4788void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4789{
4790 mds_rank_t mds = mds_rank_t(m->get_source().num());
4791 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4792 if (!session) {
7c673cae
FG
4793 return;
4794 }
4795
4796 got_mds_push(session);
4797
11fdf7f2 4798 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4799
4800 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4801 if (inode_map.count(vino)) {
4802 Inode *in = NULL;
4803 in = inode_map[vino];
4804
4805 if (in) {
4806 in->quota = m->quota;
4807 in->rstat = m->rstat;
4808 }
4809 }
7c673cae
FG
4810}
4811
// Top-level dispatcher for MClientCaps messages. Resolves the session and
// inode, honours any OSD epoch barrier carried by the message, and routes
// to the per-op handler. Note the deliberate structure: IMPORT is handled
// in the first switch WITHOUT returning, then falls into the second
// switch's GRANT path using the (possibly newly imported) cap.
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {  // stale/unknown session; drop the message
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      // we don't know the inode: give the cap straight back
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
      case CEPH_CAP_OP_IMPORT:  // fall through to GRANT after the import above
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4875
11fdf7f2 4876void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4877{
4878 mds_rank_t mds = session->mds_num;
4879
11fdf7f2 4880 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
4881 << " IMPORT from mds." << mds << dendl;
4882
4883 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4884 Cap *cap = NULL;
4885 UserPerm cap_perms;
11fdf7f2
TL
4886 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
4887 cap = &it->second;
4888 cap_perms = cap->latest_perms;
7c673cae
FG
4889 }
4890
4891 // add/update it
4892 SnapRealm *realm = NULL;
4893 update_snap_trace(m->snapbl, &realm);
4894
4895 add_update_cap(in, session, m->get_cap_id(),
a8e16298
TL
4896 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
4897 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
7c673cae
FG
4898
4899 if (cap && cap->cap_id == m->peer.cap_id) {
4900 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4901 }
4902
4903 if (realm)
4904 put_snap_realm(realm);
4905
eafe8130 4906 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 4907 // reflush any/all caps (if we are now the auth_cap)
eafe8130 4908 kick_flushing_caps(in, session);
7c673cae
FG
4909 }
4910}
4911
11fdf7f2 4912void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4913{
4914 mds_rank_t mds = session->mds_num;
4915
11fdf7f2 4916 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
4917 << " EXPORT from mds." << mds << dendl;
4918
11fdf7f2
TL
4919 auto it = in->caps.find(mds);
4920 if (it != in->caps.end()) {
4921 Cap &cap = it->second;
4922 if (cap.cap_id == m->get_cap_id()) {
4923 if (m->peer.cap_id) {
4924 const auto peer_mds = mds_rank_t(m->peer.mds);
4925 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4926 auto it = in->caps.find(peer_mds);
4927 if (it != in->caps.end()) {
4928 Cap &tcap = it->second;
4929 if (tcap.cap_id == m->peer.cap_id &&
4930 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
4931 tcap.cap_id = m->peer.cap_id;
4932 tcap.seq = m->peer.seq - 1;
4933 tcap.issue_seq = tcap.seq;
4934 tcap.issued |= cap.issued;
4935 tcap.implemented |= cap.issued;
4936 if (&cap == in->auth_cap)
4937 in->auth_cap = &tcap;
4938 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
4939 adjust_session_flushing_caps(in, session, tsession);
4940 }
4941 } else {
4942 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
4943 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4944 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4945 cap.latest_perms);
4946 }
7c673cae 4947 } else {
11fdf7f2
TL
4948 if (cap.wanted | cap.issued)
4949 in->flags |= I_CAP_DROPPED;
7c673cae 4950 }
7c673cae 4951
11fdf7f2
TL
4952 remove_cap(&cap, false);
4953 }
7c673cae 4954 }
7c673cae
FG
4955}
4956
11fdf7f2 4957void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4958{
4959 mds_rank_t mds = session->mds_num;
11fdf7f2 4960 ceph_assert(in->caps.count(mds));
7c673cae 4961
11fdf7f2 4962 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
4963 << " size " << in->size << " -> " << m->get_size()
4964 << dendl;
4965
1adf2230
AA
4966 int issued;
4967 in->caps_issued(&issued);
4968 issued |= in->caps_dirty();
4969 update_inode_file_size(in, issued, m->get_size(),
4970 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4971}
4972
11fdf7f2 4973void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
4974{
4975 ceph_tid_t flush_ack_tid = m->get_client_tid();
4976 int dirty = m->get_dirty();
4977 int cleaned = 0;
4978 int flushed = 0;
4979
11fdf7f2
TL
4980 auto it = in->flushing_cap_tids.begin();
4981 if (it->first < flush_ack_tid) {
4982 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4983 << " got unexpected flush ack tid " << flush_ack_tid
4984 << " expected is " << it->first << dendl;
4985 }
4986 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
4987 if (!it->second) {
4988 // cap snap
4989 ++it;
4990 continue;
4991 }
7c673cae
FG
4992 if (it->first == flush_ack_tid)
4993 cleaned = it->second;
4994 if (it->first <= flush_ack_tid) {
4995 session->flushing_caps_tids.erase(it->first);
4996 in->flushing_cap_tids.erase(it++);
4997 ++flushed;
4998 continue;
4999 }
5000 cleaned &= ~it->second;
5001 if (!cleaned)
5002 break;
5003 ++it;
5004 }
5005
11fdf7f2 5006 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5007 << " cleaned " << ccap_string(cleaned) << " on " << *in
5008 << " with " << ccap_string(dirty) << dendl;
5009
5010 if (flushed) {
5011 signal_cond_list(in->waitfor_caps);
5012 if (session->flushing_caps_tids.empty() ||
5013 *session->flushing_caps_tids.begin() > flush_ack_tid)
5014 sync_cond.Signal();
5015 }
5016
5017 if (!dirty) {
5018 in->cap_dirtier_uid = -1;
5019 in->cap_dirtier_gid = -1;
5020 }
5021
5022 if (!cleaned) {
5023 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5024 } else {
5025 if (in->flushing_caps) {
5026 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5027 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5028 in->flushing_caps &= ~cleaned;
5029 if (in->flushing_caps == 0) {
5030 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5031 num_flushing_caps--;
eafe8130 5032 if (in->flushing_cap_tids.empty())
7c673cae
FG
5033 in->flushing_cap_item.remove_myself();
5034 }
5035 if (!in->caps_dirty())
5036 put_inode(in);
5037 }
5038 }
7c673cae
FG
5039}
5040
5041
11fdf7f2 5042void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae 5043{
eafe8130 5044 ceph_tid_t flush_ack_tid = m->get_client_tid();
7c673cae 5045 mds_rank_t mds = session->mds_num;
11fdf7f2 5046 ceph_assert(in->caps.count(mds));
7c673cae
FG
5047 snapid_t follows = m->get_snap_follows();
5048
11fdf7f2
TL
5049 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5050 auto& capsnap = it->second;
eafe8130
TL
5051 if (flush_ack_tid != capsnap.flush_tid) {
5052 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
7c673cae 5053 } else {
eafe8130 5054 InodeRef tmp_ref(in);
11fdf7f2 5055 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
7c673cae 5056 << " on " << *in << dendl;
7c673cae 5057 session->flushing_caps_tids.erase(capsnap.flush_tid);
eafe8130
TL
5058 in->flushing_cap_tids.erase(capsnap.flush_tid);
5059 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5060 in->flushing_cap_item.remove_myself();
11fdf7f2 5061 in->cap_snaps.erase(it);
eafe8130
TL
5062
5063 signal_cond_list(in->waitfor_caps);
5064 if (session->flushing_caps_tids.empty() ||
5065 *session->flushing_caps_tids.begin() > flush_ack_tid)
5066 sync_cond.Signal();
7c673cae
FG
5067 }
5068 } else {
11fdf7f2 5069 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
7c673cae
FG
5070 << " on " << *in << dendl;
5071 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5072 }
7c673cae
FG
5073}
5074
5075class C_Client_DentryInvalidate : public Context {
5076private:
5077 Client *client;
5078 vinodeno_t dirino;
5079 vinodeno_t ino;
5080 string name;
5081public:
5082 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5083 client(c), name(dn->name) {
5084 if (client->use_faked_inos()) {
5085 dirino.ino = dn->dir->parent_inode->faked_ino;
5086 if (del)
5087 ino.ino = dn->inode->faked_ino;
5088 } else {
5089 dirino = dn->dir->parent_inode->vino();
5090 if (del)
5091 ino = dn->inode->vino();
5092 }
5093 if (!del)
5094 ino.ino = inodeno_t();
5095 }
5096 void finish(int r) override {
5097 // _async_dentry_invalidate is responsible for its own locking
11fdf7f2 5098 ceph_assert(!client->client_lock.is_locked_by_me());
7c673cae
FG
5099 client->_async_dentry_invalidate(dirino, ino, name);
5100 }
5101};
5102
5103void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5104{
5105 if (unmounting)
5106 return;
11fdf7f2 5107 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae
FG
5108 << " in dir " << dirino << dendl;
5109 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5110}
5111
5112void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5113{
5114 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5115 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5116}
5117
5118void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5119{
5120 int ref = in->get_num_ref();
494da23a 5121 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
7c673cae
FG
5122
5123 if (in->dir && !in->dir->dentries.empty()) {
5124 for (auto p = in->dir->dentries.begin();
5125 p != in->dir->dentries.end(); ) {
5126 Dentry *dn = p->second;
5127 ++p;
5128 /* rmsnap removes whole subtree, need trim inodes recursively.
5129 * we don't need to invalidate dentries recursively. because
5130 * invalidating a directory dentry effectively invalidate
5131 * whole subtree */
5132 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5133 _try_to_trim_inode(dn->inode.get(), false);
5134
5135 if (dn->lru_is_expireable())
5136 unlink(dn, true, false); // keep dir, drop dentry
5137 }
5138 if (in->dir->dentries.empty()) {
5139 close_dir(in->dir);
5140 --ref;
5141 }
5142 }
5143
5144 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5145 InodeRef snapdir = open_snapdir(in);
5146 _try_to_trim_inode(snapdir.get(), false);
5147 --ref;
5148 }
5149
494da23a 5150 if (ref > 0) {
11fdf7f2
TL
5151 auto q = in->dentries.begin();
5152 while (q != in->dentries.end()) {
5153 Dentry *dn = *q;
5154 ++q;
494da23a
TL
5155 if( in->ll_ref > 0 && sched_inval) {
5156 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5157 // so in->dentries doesn't always reflect the state of kernel's dcache.
5158 _schedule_invalidate_dentry_callback(dn, true);
5159 }
7c673cae
FG
5160 unlink(dn, true, true);
5161 }
5162 }
5163}
5164
11fdf7f2 5165void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5166{
5167 mds_rank_t mds = session->mds_num;
5168 int used = get_caps_used(in);
5169 int wanted = in->caps_wanted();
5170
a8e16298
TL
5171 const unsigned new_caps = m->get_caps();
5172 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5173 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5174 << " mds." << mds << " seq " << m->get_seq()
5175 << " caps now " << ccap_string(new_caps)
a8e16298
TL
5176 << " was " << ccap_string(cap->issued)
5177 << (was_stale ? "" : " (stale)") << dendl;
5178
5179 if (was_stale)
5180 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5181 cap->seq = m->get_seq();
28e407b8 5182 cap->gen = session->cap_gen;
7c673cae 5183
11fdf7f2 5184 check_cap_issue(in, new_caps);
a8e16298 5185
7c673cae 5186 // update inode
1adf2230
AA
5187 int issued;
5188 in->caps_issued(&issued);
5189 issued |= in->caps_dirty();
7c673cae 5190
1adf2230
AA
5191 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5192 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5193 in->mode = m->head.mode;
5194 in->uid = m->head.uid;
5195 in->gid = m->head.gid;
5196 in->btime = m->btime;
5197 }
5198 bool deleted_inode = false;
1adf2230
AA
5199 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5200 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5201 in->nlink = m->head.nlink;
5202 if (in->nlink == 0 &&
5203 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5204 deleted_inode = true;
5205 }
1adf2230 5206 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5207 m->xattrbl.length() &&
5208 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5209 auto p = m->xattrbl.cbegin();
5210 decode(in->xattrs, p);
7c673cae
FG
5211 in->xattr_version = m->head.xattr_version;
5212 }
28e407b8
AA
5213
5214 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5215 in->dirstat.nfiles = m->get_nfiles();
5216 in->dirstat.nsubdirs = m->get_nsubdirs();
5217 }
5218
1adf2230
AA
5219 if (new_caps & CEPH_CAP_ANY_RD) {
5220 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5221 m->get_ctime(), m->get_mtime(), m->get_atime());
5222 }
5223
5224 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5225 in->layout = m->get_layout();
5226 update_inode_file_size(in, issued, m->get_size(),
5227 m->get_truncate_seq(), m->get_truncate_size());
5228 }
5229
5230 if (m->inline_version > in->inline_version) {
5231 in->inline_data = m->inline_data;
5232 in->inline_version = m->inline_version;
5233 }
5234
5235 /* always take a newer change attr */
5236 if (m->get_change_attr() > in->change_attr)
5237 in->change_attr = m->get_change_attr();
7c673cae
FG
5238
5239 // max_size
5240 if (cap == in->auth_cap &&
1adf2230
AA
5241 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5242 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5243 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5244 in->max_size = m->get_max_size();
5245 if (in->max_size > in->wanted_max_size) {
5246 in->wanted_max_size = 0;
5247 in->requested_max_size = 0;
5248 }
5249 }
5250
5251 bool check = false;
a8e16298
TL
5252 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5253 (wanted & ~(cap->wanted | new_caps))) {
5254 // If mds is importing cap, prior cap messages that update 'wanted'
5255 // may get dropped by mds (migrate seq mismatch).
5256 //
5257 // We don't send cap message to update 'wanted' if what we want are
5258 // already issued. If mds revokes caps, cap message that releases caps
5259 // also tells mds what we want. But if caps got revoked by mds forcedly
5260 // (session stale). We may haven't told mds what we want.
7c673cae 5261 check = true;
a8e16298 5262 }
7c673cae 5263
7c673cae
FG
5264
5265 // update caps
a8e16298 5266 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5267 if (revoked) {
5268 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5269 cap->issued = new_caps;
5270 cap->implemented |= new_caps;
5271
b32b8144
FG
5272 // recall delegations if we're losing caps necessary for them
5273 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5274 in->recall_deleg(false);
5275 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5276 in->recall_deleg(true);
5277
11fdf7f2
TL
5278 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5279 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5280 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5281 // waitin' for flush
11fdf7f2 5282 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
7c673cae
FG
5283 if (_release(in))
5284 check = true;
5285 } else {
5286 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5287 check = true;
5288 }
a8e16298
TL
5289 } else if (cap->issued == new_caps) {
5290 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5291 } else {
a8e16298 5292 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5293 cap->issued = new_caps;
5294 cap->implemented |= new_caps;
5295
5296 if (cap == in->auth_cap) {
5297 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5298 for (const auto &p : in->caps) {
5299 if (&p.second == cap)
7c673cae 5300 continue;
11fdf7f2 5301 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5302 check = true;
5303 break;
5304 }
5305 }
5306 }
5307 }
5308
5309 if (check)
5310 check_caps(in, 0);
5311
5312 // wake up waiters
5313 if (new_caps)
5314 signal_cond_list(in->waitfor_caps);
5315
5316 // may drop inode's last ref
5317 if (deleted_inode)
5318 _try_to_trim_inode(in, true);
7c673cae
FG
5319}
5320
7c673cae
FG
5321int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5322{
5323 if (perms.uid() == 0)
5324 return 0;
5325
5326 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5327 int ret = _posix_acl_permission(in, perms, want);
5328 if (ret != -EAGAIN)
5329 return ret;
5330 }
5331
5332 // check permissions before doing anything else
5333 if (!in->check_mode(perms, want))
5334 return -EACCES;
5335 return 0;
5336}
5337
5338int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5339 const UserPerm& perms)
5340{
5341 int r = _getattr_for_perm(in, perms);
5342 if (r < 0)
5343 goto out;
5344
5345 r = 0;
5346 if (strncmp(name, "system.", 7) == 0) {
5347 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5348 r = -EPERM;
5349 } else {
5350 r = inode_permission(in, perms, want);
5351 }
5352out:
1adf2230 5353 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5354 return r;
5355}
5356
5357ostream& operator<<(ostream &out, const UserPerm& perm) {
5358 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5359 return out;
5360}
5361
5362int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5363 const UserPerm& perms)
5364{
181888fb 5365 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5366 int r = _getattr_for_perm(in, perms);
5367 if (r < 0)
5368 goto out;
5369
5370 if (mask & CEPH_SETATTR_SIZE) {
5371 r = inode_permission(in, perms, MAY_WRITE);
5372 if (r < 0)
5373 goto out;
5374 }
5375
5376 r = -EPERM;
5377 if (mask & CEPH_SETATTR_UID) {
5378 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5379 goto out;
5380 }
5381 if (mask & CEPH_SETATTR_GID) {
5382 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5383 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5384 goto out;
5385 }
5386
5387 if (mask & CEPH_SETATTR_MODE) {
5388 if (perms.uid() != 0 && perms.uid() != in->uid)
5389 goto out;
5390
5391 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5392 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5393 stx->stx_mode &= ~S_ISGID;
5394 }
5395
5396 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5397 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5398 if (perms.uid() != 0 && perms.uid() != in->uid) {
5399 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5400 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5401 check_mask |= CEPH_SETATTR_MTIME;
5402 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5403 check_mask |= CEPH_SETATTR_ATIME;
5404 if (check_mask & mask) {
5405 goto out;
5406 } else {
5407 r = inode_permission(in, perms, MAY_WRITE);
5408 if (r < 0)
5409 goto out;
5410 }
5411 }
5412 }
5413 r = 0;
5414out:
5415 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5416 return r;
5417}
5418
5419int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5420{
181888fb 5421 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5422 unsigned want = 0;
5423
5424 if ((flags & O_ACCMODE) == O_WRONLY)
5425 want = MAY_WRITE;
5426 else if ((flags & O_ACCMODE) == O_RDWR)
5427 want = MAY_READ | MAY_WRITE;
5428 else if ((flags & O_ACCMODE) == O_RDONLY)
5429 want = MAY_READ;
5430 if (flags & O_TRUNC)
5431 want |= MAY_WRITE;
5432
5433 int r = 0;
5434 switch (in->mode & S_IFMT) {
5435 case S_IFLNK:
5436 r = -ELOOP;
5437 goto out;
5438 case S_IFDIR:
5439 if (want & MAY_WRITE) {
5440 r = -EISDIR;
5441 goto out;
5442 }
5443 break;
5444 }
5445
5446 r = _getattr_for_perm(in, perms);
5447 if (r < 0)
5448 goto out;
5449
5450 r = inode_permission(in, perms, want);
5451out:
5452 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5453 return r;
5454}
5455
5456int Client::may_lookup(Inode *dir, const UserPerm& perms)
5457{
181888fb 5458 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5459 int r = _getattr_for_perm(dir, perms);
5460 if (r < 0)
5461 goto out;
5462
5463 r = inode_permission(dir, perms, MAY_EXEC);
5464out:
5465 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5466 return r;
5467}
5468
5469int Client::may_create(Inode *dir, const UserPerm& perms)
5470{
181888fb 5471 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5472 int r = _getattr_for_perm(dir, perms);
5473 if (r < 0)
5474 goto out;
5475
5476 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5477out:
5478 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5479 return r;
5480}
5481
5482int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5483{
181888fb 5484 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5485 int r = _getattr_for_perm(dir, perms);
5486 if (r < 0)
5487 goto out;
5488
5489 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5490 if (r < 0)
5491 goto out;
5492
5493 /* 'name == NULL' means rmsnap */
5494 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5495 InodeRef otherin;
5496 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5497 if (r < 0)
5498 goto out;
5499 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5500 r = -EPERM;
5501 }
5502out:
5503 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5504 return r;
5505}
5506
5507int Client::may_hardlink(Inode *in, const UserPerm& perms)
5508{
181888fb 5509 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5510 int r = _getattr_for_perm(in, perms);
5511 if (r < 0)
5512 goto out;
5513
5514 if (perms.uid() == 0 || perms.uid() == in->uid) {
5515 r = 0;
5516 goto out;
5517 }
5518
5519 r = -EPERM;
5520 if (!S_ISREG(in->mode))
5521 goto out;
5522
5523 if (in->mode & S_ISUID)
5524 goto out;
5525
5526 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5527 goto out;
5528
5529 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5530out:
5531 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5532 return r;
5533}
5534
5535int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5536{
5537 int mask = CEPH_STAT_CAP_MODE;
5538 bool force = false;
5539 if (acl_type != NO_ACL) {
5540 mask |= CEPH_STAT_CAP_XATTR;
5541 force = in->xattr_version == 0;
5542 }
5543 return _getattr(in, mask, perms, force);
5544}
5545
5546vinodeno_t Client::_get_vino(Inode *in)
5547{
5548 /* The caller must hold the client lock */
5549 return vinodeno_t(in->ino, in->snapid);
5550}
5551
7c673cae
FG
5552/**
5553 * Resolve an MDS spec to a list of MDS daemon GIDs.
5554 *
5555 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5556 * It may be '*' in which case it matches all GIDs.
5557 *
5558 * If no error is returned, the `targets` vector will be populated with at least
5559 * one MDS.
5560 */
5561int Client::resolve_mds(
5562 const std::string &mds_spec,
5563 std::vector<mds_gid_t> *targets)
5564{
11fdf7f2
TL
5565 ceph_assert(fsmap);
5566 ceph_assert(targets != nullptr);
7c673cae
FG
5567
5568 mds_role_t role;
5569 std::stringstream ss;
5570 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5571 if (role_r == 0) {
5572 // We got a role, resolve it to a GID
5573 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5574 << role << "'" << dendl;
5575 targets->push_back(
5576 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5577 return 0;
5578 }
5579
5580 std::string strtol_err;
5581 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5582 if (strtol_err.empty()) {
5583 // It is a possible GID
5584 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5585 if (fsmap->gid_exists(mds_gid)) {
5586 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5587 targets->push_back(mds_gid);
5588 } else {
5589 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5590 << dendl;
5591 return -ENOENT;
5592 }
5593 } else if (mds_spec == "*") {
5594 // It is a wildcard: use all MDSs
5595 const auto mds_info = fsmap->get_mds_info();
5596
5597 if (mds_info.empty()) {
5598 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5599 return -ENOENT;
5600 }
5601
5602 for (const auto i : mds_info) {
5603 targets->push_back(i.first);
5604 }
5605 } else {
5606 // It did not parse as an integer, it is not a wildcard, it must be a name
5607 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5608 if (mds_gid == 0) {
5609 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5610
5611 lderr(cct) << "FSMap: " << *fsmap << dendl;
5612
5613 return -ENOENT;
5614 } else {
5615 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5616 << "' to GID " << mds_gid << dendl;
5617 targets->push_back(mds_gid);
5618 }
5619 }
5620
5621 return 0;
5622}
5623
5624
5625/**
5626 * Authenticate with mon and establish global ID
5627 */
5628int Client::authenticate()
5629{
11fdf7f2 5630 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
5631
5632 if (monclient->is_authenticated()) {
5633 return 0;
5634 }
5635
5636 client_lock.Unlock();
5637 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5638 client_lock.Lock();
5639 if (r < 0) {
5640 return r;
5641 }
5642
5643 whoami = monclient->get_global_id();
5644 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5645
5646 return 0;
5647}
5648
5649int Client::fetch_fsmap(bool user)
5650{
5651 int r;
5652 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5653 // rather than MDSMap because no one MDSMap contains all the daemons, and
5654 // a `tell` can address any daemon.
5655 version_t fsmap_latest;
5656 do {
5657 C_SaferCond cond;
5658 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5659 client_lock.Unlock();
5660 r = cond.wait();
5661 client_lock.Lock();
5662 } while (r == -EAGAIN);
5663
5664 if (r < 0) {
5665 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5666 return r;
5667 }
5668
5669 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5670
5671 if (user) {
5672 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5673 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5674 monclient->renew_subs();
5675 wait_on_list(waiting_for_fsmap);
5676 }
11fdf7f2
TL
5677 ceph_assert(fsmap_user);
5678 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
7c673cae
FG
5679 } else {
5680 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5681 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5682 monclient->renew_subs();
5683 wait_on_list(waiting_for_fsmap);
5684 }
11fdf7f2
TL
5685 ceph_assert(fsmap);
5686 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
7c673cae
FG
5687 }
5688 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5689 << fsmap_latest << dendl;
5690 return 0;
5691}
5692
5693/**
5694 *
5695 * @mds_spec one of ID, rank, GID, "*"
5696 *
5697 */
5698int Client::mds_command(
5699 const std::string &mds_spec,
5700 const vector<string>& cmd,
5701 const bufferlist& inbl,
5702 bufferlist *outbl,
5703 string *outs,
5704 Context *onfinish)
5705{
11fdf7f2 5706 std::lock_guard lock(client_lock);
7c673cae 5707
181888fb
FG
5708 if (!initialized)
5709 return -ENOTCONN;
7c673cae
FG
5710
5711 int r;
5712 r = authenticate();
5713 if (r < 0) {
5714 return r;
5715 }
5716
5717 r = fetch_fsmap(false);
5718 if (r < 0) {
5719 return r;
5720 }
5721
5722 // Look up MDS target(s) of the command
5723 std::vector<mds_gid_t> targets;
5724 r = resolve_mds(mds_spec, &targets);
5725 if (r < 0) {
5726 return r;
5727 }
5728
5729 // If daemons are laggy, we won't send them commands. If all
5730 // are laggy then we fail.
5731 std::vector<mds_gid_t> non_laggy;
5732 for (const auto gid : targets) {
5733 const auto info = fsmap->get_info_gid(gid);
5734 if (!info.laggy()) {
5735 non_laggy.push_back(gid);
5736 }
5737 }
5738 if (non_laggy.size() == 0) {
5739 *outs = "All targeted MDS daemons are laggy";
5740 return -ENOENT;
5741 }
5742
5743 if (metadata.empty()) {
5744 // We are called on an unmounted client, so metadata
5745 // won't be initialized yet.
5746 populate_metadata("");
5747 }
5748
5749 // Send commands to targets
5750 C_GatherBuilder gather(cct, onfinish);
5751 for (const auto target_gid : non_laggy) {
5752 const auto info = fsmap->get_info_gid(target_gid);
5753
5754 // Open a connection to the target MDS
11fdf7f2 5755 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
7c673cae
FG
5756
5757 // Generate MDSCommandOp state
5758 auto &op = command_table.start_command();
5759
5760 op.on_finish = gather.new_sub();
5761 op.cmd = cmd;
5762 op.outbl = outbl;
5763 op.outs = outs;
5764 op.inbl = inbl;
5765 op.mds_gid = target_gid;
5766 op.con = conn;
5767
5768 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5769 << " tid=" << op.tid << cmd << dendl;
5770
5771 // Construct and send MCommand
11fdf7f2
TL
5772 auto m = op.get_message(monclient->get_fsid());
5773 conn->send_message2(std::move(m));
7c673cae
FG
5774 }
5775 gather.activate();
5776
5777 return 0;
5778}
5779
11fdf7f2 5780void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5781{
5782 ceph_tid_t const tid = m->get_tid();
5783
5784 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5785
5786 if (!command_table.exists(tid)) {
5787 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5788 return;
5789 }
5790
5791 auto &op = command_table.get_command(tid);
5792 if (op.outbl) {
11fdf7f2 5793 *op.outbl = m->get_data();
7c673cae
FG
5794 }
5795 if (op.outs) {
5796 *op.outs = m->rs;
5797 }
5798
5799 if (op.on_finish) {
5800 op.on_finish->complete(m->r);
5801 }
5802
5803 command_table.erase(tid);
7c673cae
FG
5804}
5805
5806// -------------------
5807// MOUNT
5808
// Authenticate with the monitors and subscribe to the MDS map.  If a
// filesystem name is given (or configured via client_mds_namespace),
// resolve it to a cluster id and subscribe to that fs's map only.
// Returns 0, a negative errno from authentication/fsmap fetch, or
// -ENOENT when the named filesystem does not exist.
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // Explicit argument wins over the client_mds_namespace config option.
  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // Need the FSMap to translate the fs name into a cluster id.
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    // Subscribe to the per-filesystem map, e.g. "mdsmap.<fscid>".
    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5845
// Mount the filesystem: subscribe to the MDS map, optionally wait for an
// available MDS cluster, then look up the mount root (and each of its
// ancestors, so quota roots are cached) before pinning the root inode.
// Idempotent: returns 0 immediately if already mounted.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // Block until the MDS cluster is usable, fail fast if it is
    // irrecoverably down.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // GETATTR the mount point and then each ancestor up to "/".  Walking
  // upward primes quota/snap realm info; an EACCES above the mount
  // point is tolerated (quotas may simply not work in that case).
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  // Take a long-lived ll reference on the root so it is pinned for the
  // lifetime of the mount (dropped in _unmount via _ll_drop_pins).
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5939
5940// UNMOUNT
5941
// Close every MDS session and block until all of them are gone.
// Re-scans the session map on each wakeup because sessions are erased
// by the reply/reset handlers while we wait on mount_cond.
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5957
31f18b77
FG
5958void Client::flush_mdlog_sync()
5959{
5960 if (mds_requests.empty())
5961 return;
11fdf7f2
TL
5962 for (auto &p : mds_sessions) {
5963 flush_mdlog(&p.second);
31f18b77
FG
5964 }
5965}
5966
// Request a journal flush from a single MDS session.
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
5977
5978
11fdf7f2
TL
// Abort all in-flight MDS requests with the given errno and force-close
// every MDS session.  Used when tearing down the client without a clean
// shutdown (abort/blacklist).
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    // advance before the request may be erased out from under us
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
6006
// Tear down the mount.  With abort=false this is a clean unmount that
// flushes dirty data/caps and waits for everything to drain; with
// abort=true (or when blacklisted) in-flight work is cancelled and
// dirty state is dropped instead.  Caller must hold client_lock.
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for outstanding MDS requests to complete or abort.
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // Stop the periodic tick so it doesn't re-arm during teardown.
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  // Drop the long-lived ll refs taken at mount time (e.g. on root).
  _ll_drop_pins();

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	// No point flushing: just throw away the cached data.
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // Dirty caps can never be written back now; drop them.
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // Wait for every cached inode to be released; dump the cache on each
  // 5s timeout to help diagnose stuck cap releases.
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6138
b32b8144
FG
// Public entry point for a clean unmount: flush dirty data and caps,
// drain requests, and close MDS sessions gracefully.
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}
6144
// Public entry point for a forced teardown: abort in-flight MDS
// requests and OSD writes instead of flushing them.
void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
6150
7c673cae
FG
// Send each session's batched cap-release message to its MDS, but only
// while that MDS is in a state that will process it.
void Client::flush_cap_releases()
{
  // send any cap releases
  for (auto &p : mds_sessions) {
    auto &session = p.second;
    if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      if (cct->_conf->client_inject_release_failure) {
	// Test hook: deliberately drop the message instead of sending.
	ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
	session.con->send_message2(std::move(session.release));
      }
      // Clear the batch in both branches (send_message2 already moved
      // it away in the normal path; injection discards it).
      session.release.reset();
    }
  }
}
6167
// Periodic housekeeping, driven by the SafeTimer: re-arms itself, times
// out pre-mount requests, renews caps, flushes cap releases, processes
// delayed caps, and trims the cache.  Runs with client_lock held.
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // Test hook: delay once, then self-clear the option.
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // Re-arm the next tick before doing any work.
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // While mounting, time out the oldest pending request and wake
    // anyone waiting on the mdsmap or session opens.
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    // list is ordered by hold_caps_until, so stop at the first
    // not-yet-due entry
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6223
6224void Client::renew_caps()
6225{
6226 ldout(cct, 10) << "renew_caps()" << dendl;
6227 last_cap_renew = ceph_clock_now();
6228
11fdf7f2
TL
6229 for (auto &p : mds_sessions) {
6230 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6231 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6232 renew_caps(&p.second);
7c673cae
FG
6233 }
6234}
6235
// Send a RENEWCAPS request to one MDS.  The bumped cap_renew_seq lets
// the reply handler pair the answer with this request.
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
6243
6244
6245// ===============================================================
6246// high level (POSIXy) interface
6247
// Issue a LOOKUP (or LOOKUPSNAP inside the snap dir) to the MDS for
// `name` under `dir`, requesting `mask` caps on the result.  On success
// *target holds the looked-up inode.  Returns the MDS reply code.
int Client::_do_lookup(Inode *dir, const string& name, int mask,
		       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // Debug option: ask for extra caps so cap handling can be validated.
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
6268
// Look up `dname` in `dir`, preferring the local dentry cache when its
// lease (or the directory's shared cap + I_COMPLETE flag) allows, and
// falling back to an MDS lookup via _do_lookup otherwise.  Handles the
// special names "." and ".." and the configured snapdir.  On success
// *target holds the result; returns 0 or a negative errno.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No cached parent linkage: ask an arbitrary MDS for the parent.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // The configured snapshot directory name (e.g. ".snap") is virtual.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  // Complete dir contents cached + null dentry => name truly absent.
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // Cache miss or stale lease: go to the MDS.
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // valid null dentry => name known to be absent
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6385
// Find the dentry `name` in `dir`, creating a null dentry if it does
// not exist.  When expect_null is set and a leased non-null dentry is
// found, fail with -EEXIST (used by create-type operations).  On
// success *pdn points at the (possibly new) dentry.
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      if (s.cap_ttl > now &&
	  s.cap_gen == dn->lease_gen) {
	// Lease proves the name exists; caller expected it not to.
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6417
// Resolve `origpath` component by component, starting from root (if
// absolute) or cwd.  Intermediate symlinks are always followed; the
// trailing symlink only when `followsym`.  `mask` adds extra requested
// caps on the final component.  On success *end holds the final inode.
// Returns 0, -ELOOP after MAXSYMLINKS expansions, or a lookup errno.
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // Need search permission on each directory we traverse.
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target restarts resolution from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6499
6500
6501// namespace ops
6502
6503int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6504{
11fdf7f2 6505 std::lock_guard lock(client_lock);
7c673cae
FG
6506 tout(cct) << "link" << std::endl;
6507 tout(cct) << relexisting << std::endl;
6508 tout(cct) << relpath << std::endl;
6509
181888fb
FG
6510 if (unmounting)
6511 return -ENOTCONN;
6512
7c673cae
FG
6513 filepath existing(relexisting);
6514
6515 InodeRef in, dir;
6516 int r = path_walk(existing, &in, perm, true);
6517 if (r < 0)
6518 return r;
6519 if (std::string(relpath) == "/") {
6520 r = -EEXIST;
6521 return r;
6522 }
6523 filepath path(relpath);
6524 string name = path.last_dentry();
6525 path.pop_dentry();
6526
6527 r = path_walk(path, &dir, perm, true);
6528 if (r < 0)
6529 return r;
6530 if (cct->_conf->client_permissions) {
6531 if (S_ISDIR(in->mode)) {
6532 r = -EPERM;
6533 return r;
6534 }
6535 r = may_hardlink(in.get(), perm);
6536 if (r < 0)
6537 return r;
6538 r = may_create(dir.get(), perm);
6539 if (r < 0)
6540 return r;
6541 }
6542 r = _link(in.get(), dir.get(), name.c_str(), perm);
6543 return r;
6544}
6545
6546int Client::unlink(const char *relpath, const UserPerm& perm)
6547{
11fdf7f2
TL
6548 std::lock_guard lock(client_lock);
6549 tout(cct) << __func__ << std::endl;
7c673cae
FG
6550 tout(cct) << relpath << std::endl;
6551
181888fb
FG
6552 if (unmounting)
6553 return -ENOTCONN;
6554
7c673cae
FG
6555 if (std::string(relpath) == "/")
6556 return -EISDIR;
6557
6558 filepath path(relpath);
6559 string name = path.last_dentry();
6560 path.pop_dentry();
6561 InodeRef dir;
6562 int r = path_walk(path, &dir, perm);
6563 if (r < 0)
6564 return r;
6565 if (cct->_conf->client_permissions) {
6566 r = may_delete(dir.get(), name.c_str(), perm);
6567 if (r < 0)
6568 return r;
6569 }
6570 return _unlink(dir.get(), name.c_str(), perm);
6571}
6572
6573int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6574{
11fdf7f2
TL
6575 std::lock_guard lock(client_lock);
6576 tout(cct) << __func__ << std::endl;
7c673cae
FG
6577 tout(cct) << relfrom << std::endl;
6578 tout(cct) << relto << std::endl;
6579
181888fb
FG
6580 if (unmounting)
6581 return -ENOTCONN;
6582
7c673cae
FG
6583 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6584 return -EBUSY;
6585
6586 filepath from(relfrom);
6587 filepath to(relto);
6588 string fromname = from.last_dentry();
6589 from.pop_dentry();
6590 string toname = to.last_dentry();
6591 to.pop_dentry();
6592
6593 InodeRef fromdir, todir;
6594 int r = path_walk(from, &fromdir, perm);
6595 if (r < 0)
6596 goto out;
6597 r = path_walk(to, &todir, perm);
6598 if (r < 0)
6599 goto out;
6600
6601 if (cct->_conf->client_permissions) {
6602 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6603 if (r < 0)
6604 return r;
6605 r = may_delete(todir.get(), toname.c_str(), perm);
6606 if (r < 0 && r != -ENOENT)
6607 return r;
6608 }
6609 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6610out:
6611 return r;
6612}
6613
6614// dirs
6615
6616int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6617{
11fdf7f2
TL
6618 std::lock_guard lock(client_lock);
6619 tout(cct) << __func__ << std::endl;
7c673cae
FG
6620 tout(cct) << relpath << std::endl;
6621 tout(cct) << mode << std::endl;
11fdf7f2 6622 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6623
181888fb
FG
6624 if (unmounting)
6625 return -ENOTCONN;
6626
7c673cae
FG
6627 if (std::string(relpath) == "/")
6628 return -EEXIST;
6629
6630 filepath path(relpath);
6631 string name = path.last_dentry();
6632 path.pop_dentry();
6633 InodeRef dir;
6634 int r = path_walk(path, &dir, perm);
6635 if (r < 0)
6636 return r;
6637 if (cct->_conf->client_permissions) {
6638 r = may_create(dir.get(), perm);
6639 if (r < 0)
6640 return r;
6641 }
6642 return _mkdir(dir.get(), name.c_str(), mode, perm);
6643}
6644
// mkdir -p: walk as far down relpath as already exists, then create the
// remaining components one level at a time.  An -EEXIST on a non-final
// component (raced creation) is tolerated by re-looking it up.
// Returns 0 or a negative errno.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // Only a missing component means there is something left to create;
  // any other error (including success with the full path present)
  // is returned as-is.
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-EEXIST == r && i < path.depth() - 1) {
      // someone else created this intermediate dir; just look it up
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6699
6700int Client::rmdir(const char *relpath, const UserPerm& perms)
6701{
11fdf7f2
TL
6702 std::lock_guard lock(client_lock);
6703 tout(cct) << __func__ << std::endl;
7c673cae
FG
6704 tout(cct) << relpath << std::endl;
6705
181888fb
FG
6706 if (unmounting)
6707 return -ENOTCONN;
6708
7c673cae
FG
6709 if (std::string(relpath) == "/")
6710 return -EBUSY;
6711
6712 filepath path(relpath);
6713 string name = path.last_dentry();
6714 path.pop_dentry();
6715 InodeRef dir;
6716 int r = path_walk(path, &dir, perms);
6717 if (r < 0)
6718 return r;
6719 if (cct->_conf->client_permissions) {
6720 int r = may_delete(dir.get(), name.c_str(), perms);
6721 if (r < 0)
6722 return r;
6723 }
6724 return _rmdir(dir.get(), name.c_str(), perms);
6725}
6726
6727int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6728{
11fdf7f2
TL
6729 std::lock_guard lock(client_lock);
6730 tout(cct) << __func__ << std::endl;
7c673cae
FG
6731 tout(cct) << relpath << std::endl;
6732 tout(cct) << mode << std::endl;
6733 tout(cct) << rdev << std::endl;
6734
181888fb
FG
6735 if (unmounting)
6736 return -ENOTCONN;
6737
7c673cae
FG
6738 if (std::string(relpath) == "/")
6739 return -EEXIST;
6740
6741 filepath path(relpath);
6742 string name = path.last_dentry();
6743 path.pop_dentry();
6744 InodeRef dir;
6745 int r = path_walk(path, &dir, perms);
6746 if (r < 0)
6747 return r;
6748 if (cct->_conf->client_permissions) {
6749 int r = may_create(dir.get(), perms);
6750 if (r < 0)
6751 return r;
6752 }
6753 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6754}
6755
6756// symlinks
6757
6758int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6759{
11fdf7f2
TL
6760 std::lock_guard lock(client_lock);
6761 tout(cct) << __func__ << std::endl;
7c673cae
FG
6762 tout(cct) << target << std::endl;
6763 tout(cct) << relpath << std::endl;
6764
181888fb
FG
6765 if (unmounting)
6766 return -ENOTCONN;
6767
7c673cae
FG
6768 if (std::string(relpath) == "/")
6769 return -EEXIST;
6770
6771 filepath path(relpath);
6772 string name = path.last_dentry();
6773 path.pop_dentry();
6774 InodeRef dir;
6775 int r = path_walk(path, &dir, perms);
6776 if (r < 0)
6777 return r;
6778 if (cct->_conf->client_permissions) {
6779 int r = may_create(dir.get(), perms);
6780 if (r < 0)
6781 return r;
6782 }
6783 return _symlink(dir.get(), name.c_str(), target, perms);
6784}
6785
6786int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6787{
11fdf7f2
TL
6788 std::lock_guard lock(client_lock);
6789 tout(cct) << __func__ << std::endl;
7c673cae
FG
6790 tout(cct) << relpath << std::endl;
6791
181888fb
FG
6792 if (unmounting)
6793 return -ENOTCONN;
6794
7c673cae
FG
6795 filepath path(relpath);
6796 InodeRef in;
6797 int r = path_walk(path, &in, perms, false);
6798 if (r < 0)
6799 return r;
6800
6801 return _readlink(in.get(), buf, size);
6802}
6803
6804int Client::_readlink(Inode *in, char *buf, size_t size)
6805{
6806 if (!in->is_symlink())
6807 return -EINVAL;
6808
6809 // copy into buf (at most size bytes)
6810 int r = in->symlink.length();
6811 if (r > (int)size)
6812 r = size;
6813 memcpy(buf, in->symlink.c_str(), r);
6814 return r;
6815}
6816
6817
6818// inode stuff
6819
// Fetch attributes for `in`.  If the client already holds caps covering
// `mask` (and force is not set), the cached values are considered valid
// and no request is sent; otherwise a GETATTR goes to the MDS.
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
  // Caps cover everything requested: local metadata is authoritative.
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;
  return res;
}
6839
6840int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6841 const UserPerm& perms, InodeRef *inp)
6842{
6843 int issued = in->caps_issued();
6844
11fdf7f2 6845 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6846 ccap_string(issued) << dendl;
6847
6848 if (in->snapid != CEPH_NOSNAP) {
6849 return -EROFS;
6850 }
6851 if ((mask & CEPH_SETATTR_SIZE) &&
6852 (unsigned long)stx->stx_size > in->size &&
6853 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6854 perms)) {
6855 return -EDQUOT;
6856 }
6857
6858 // make the change locally?
6859 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6860 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6861 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6862 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6863 << in->cap_dirtier_gid << ", forcing sync setattr"
6864 << dendl;
6865 /*
6866 * This works because we implicitly flush the caps as part of the
6867 * request, so the cap update check will happen with the writeback
6868 * cap context, and then the setattr check will happen with the
6869 * caller's context.
6870 *
6871 * In reality this pattern is likely pretty rare (different users
6872 * setattr'ing the same file). If that turns out not to be the
6873 * case later, we can build a more complex pipelined cap writeback
6874 * infrastructure...
6875 */
6876 if (!mask)
6877 mask |= CEPH_SETATTR_CTIME;
6878 goto force_request;
6879 }
6880
6881 if (!mask) {
6882 // caller just needs us to bump the ctime
6883 in->ctime = ceph_clock_now();
6884 in->cap_dirtier_uid = perms.uid();
6885 in->cap_dirtier_gid = perms.gid();
6886 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 6887 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 6888 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 6889 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 6890 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 6891 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
6892 else
6893 mask |= CEPH_SETATTR_CTIME;
6894 }
6895
6896 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6897 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6898
6899 mask &= ~CEPH_SETATTR_KILL_SGUID;
6900
6901 if (mask & CEPH_SETATTR_UID) {
6902 in->ctime = ceph_clock_now();
6903 in->cap_dirtier_uid = perms.uid();
6904 in->cap_dirtier_gid = perms.gid();
6905 in->uid = stx->stx_uid;
28e407b8 6906 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6907 mask &= ~CEPH_SETATTR_UID;
6908 kill_sguid = true;
6909 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6910 }
6911 if (mask & CEPH_SETATTR_GID) {
6912 in->ctime = ceph_clock_now();
6913 in->cap_dirtier_uid = perms.uid();
6914 in->cap_dirtier_gid = perms.gid();
6915 in->gid = stx->stx_gid;
28e407b8 6916 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6917 mask &= ~CEPH_SETATTR_GID;
6918 kill_sguid = true;
6919 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6920 }
6921
6922 if (mask & CEPH_SETATTR_MODE) {
6923 in->ctime = ceph_clock_now();
6924 in->cap_dirtier_uid = perms.uid();
6925 in->cap_dirtier_gid = perms.gid();
6926 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 6927 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6928 mask &= ~CEPH_SETATTR_MODE;
6929 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6930 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6931 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6932 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 6933 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6934 }
6935
6936 if (mask & CEPH_SETATTR_BTIME) {
6937 in->ctime = ceph_clock_now();
6938 in->cap_dirtier_uid = perms.uid();
6939 in->cap_dirtier_gid = perms.gid();
6940 in->btime = utime_t(stx->stx_btime);
28e407b8 6941 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6942 mask &= ~CEPH_SETATTR_BTIME;
6943 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6944 }
6945 } else if (mask & CEPH_SETATTR_SIZE) {
6946 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6947 mask |= CEPH_SETATTR_KILL_SGUID;
6948 }
6949
6950 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6951 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6952 if (mask & CEPH_SETATTR_MTIME)
6953 in->mtime = utime_t(stx->stx_mtime);
6954 if (mask & CEPH_SETATTR_ATIME)
6955 in->atime = utime_t(stx->stx_atime);
6956 in->ctime = ceph_clock_now();
6957 in->cap_dirtier_uid = perms.uid();
6958 in->cap_dirtier_gid = perms.gid();
6959 in->time_warp_seq++;
28e407b8 6960 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
6961 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6962 }
6963 }
6964 if (!mask) {
6965 in->change_attr++;
6966 return 0;
6967 }
6968
6969force_request:
6970 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6971
6972 filepath path;
6973
6974 in->make_nosnap_relative_path(path);
6975 req->set_filepath(path);
6976 req->set_inode(in);
6977
6978 if (mask & CEPH_SETATTR_KILL_SGUID) {
6979 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6980 }
6981 if (mask & CEPH_SETATTR_MODE) {
6982 req->head.args.setattr.mode = stx->stx_mode;
6983 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6984 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6985 }
6986 if (mask & CEPH_SETATTR_UID) {
6987 req->head.args.setattr.uid = stx->stx_uid;
6988 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6989 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6990 }
6991 if (mask & CEPH_SETATTR_GID) {
6992 req->head.args.setattr.gid = stx->stx_gid;
6993 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6994 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6995 }
6996 if (mask & CEPH_SETATTR_BTIME) {
6997 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6998 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6999 }
7000 if (mask & CEPH_SETATTR_MTIME) {
7001 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 7002 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7003 CEPH_CAP_FILE_WR;
7004 }
7005 if (mask & CEPH_SETATTR_ATIME) {
7006 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7007 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7008 CEPH_CAP_FILE_WR;
7009 }
7010 if (mask & CEPH_SETATTR_SIZE) {
7011 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7012 req->head.args.setattr.size = stx->stx_size;
7013 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7014 } else { //too big!
7015 put_request(req);
7016 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7017 return -EFBIG;
7018 }
94b18763 7019 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7020 CEPH_CAP_FILE_WR;
7021 }
7022 req->head.args.setattr.mask = mask;
7023
7024 req->regetattr_mask = mask;
7025
7026 int res = make_request(req, perms, inp);
7027 ldout(cct, 10) << "_setattr result=" << res << dendl;
7028 return res;
7029}
7030
/* Note that we only care about attrs that setattr cares about */
/*
 * Translate a legacy struct stat into the subset of struct ceph_statx
 * fields that _setattr()/_setattrx() consume: size, mode, uid, gid,
 * mtime and atime.  All other statx fields are left untouched, so the
 * caller must pair this with a mask that only names the copied attrs.
 */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS spells the timespec members st_*timespec rather than st_*tim.
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7046
7047int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7048 const UserPerm& perms, InodeRef *inp)
7049{
7050 int ret = _do_setattr(in, stx, mask, perms, inp);
7051 if (ret < 0)
7052 return ret;
7053 if (mask & CEPH_SETATTR_MODE)
7054 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7055 return ret;
7056}
7057
7058int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7059 const UserPerm& perms)
7060{
7061 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7062 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7063 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7064 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7065 if (cct->_conf->client_permissions) {
7066 int r = may_setattr(in.get(), stx, mask, perms);
7067 if (r < 0)
7068 return r;
7069 }
7070 return __setattrx(in.get(), stx, mask, perms);
7071}
7072
7073int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7074 const UserPerm& perms)
7075{
7076 struct ceph_statx stx;
7077
7078 stat_to_statx(attr, &stx);
7079 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7080
7081 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7082 mask &= ~CEPH_SETATTR_UID;
7083 }
7084 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7085 mask &= ~CEPH_SETATTR_GID;
7086 }
7087
7c673cae
FG
7088 return _setattrx(in, &stx, mask, perms);
7089}
7090
7091int Client::setattr(const char *relpath, struct stat *attr, int mask,
7092 const UserPerm& perms)
7093{
11fdf7f2
TL
7094 std::lock_guard lock(client_lock);
7095 tout(cct) << __func__ << std::endl;
7c673cae
FG
7096 tout(cct) << relpath << std::endl;
7097 tout(cct) << mask << std::endl;
7098
181888fb
FG
7099 if (unmounting)
7100 return -ENOTCONN;
7101
7c673cae
FG
7102 filepath path(relpath);
7103 InodeRef in;
7104 int r = path_walk(path, &in, perms);
7105 if (r < 0)
7106 return r;
7107 return _setattr(in, attr, mask, perms);
7108}
7109
7110int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7111 const UserPerm& perms, int flags)
7112{
11fdf7f2
TL
7113 std::lock_guard lock(client_lock);
7114 tout(cct) << __func__ << std::endl;
7c673cae
FG
7115 tout(cct) << relpath << std::endl;
7116 tout(cct) << mask << std::endl;
7117
181888fb
FG
7118 if (unmounting)
7119 return -ENOTCONN;
7120
7c673cae
FG
7121 filepath path(relpath);
7122 InodeRef in;
7123 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7124 if (r < 0)
7125 return r;
7126 return _setattrx(in, stx, mask, perms);
7127}
7128
7129int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7130{
11fdf7f2
TL
7131 std::lock_guard lock(client_lock);
7132 tout(cct) << __func__ << std::endl;
7c673cae
FG
7133 tout(cct) << fd << std::endl;
7134 tout(cct) << mask << std::endl;
7135
181888fb
FG
7136 if (unmounting)
7137 return -ENOTCONN;
7138
7c673cae
FG
7139 Fh *f = get_filehandle(fd);
7140 if (!f)
7141 return -EBADF;
7142#if defined(__linux__) && defined(O_PATH)
7143 if (f->flags & O_PATH)
7144 return -EBADF;
7145#endif
7146 return _setattr(f->inode, attr, mask, perms);
7147}
7148
7149int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7150{
11fdf7f2
TL
7151 std::lock_guard lock(client_lock);
7152 tout(cct) << __func__ << std::endl;
7c673cae
FG
7153 tout(cct) << fd << std::endl;
7154 tout(cct) << mask << std::endl;
7155
181888fb
FG
7156 if (unmounting)
7157 return -ENOTCONN;
7158
7c673cae
FG
7159 Fh *f = get_filehandle(fd);
7160 if (!f)
7161 return -EBADF;
7162#if defined(__linux__) && defined(O_PATH)
7163 if (f->flags & O_PATH)
7164 return -EBADF;
7165#endif
7166 return _setattrx(f->inode, stx, mask, perms);
7167}
7168
7169int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7170 frag_info_t *dirstat, int mask)
7171{
11fdf7f2
TL
7172 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7173 std::lock_guard lock(client_lock);
7c673cae
FG
7174 tout(cct) << "stat" << std::endl;
7175 tout(cct) << relpath << std::endl;
181888fb
FG
7176
7177 if (unmounting)
7178 return -ENOTCONN;
7179
7c673cae
FG
7180 filepath path(relpath);
7181 InodeRef in;
7182 int r = path_walk(path, &in, perms, true, mask);
7183 if (r < 0)
7184 return r;
7185 r = _getattr(in, mask, perms);
7186 if (r < 0) {
11fdf7f2 7187 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7188 return r;
7189 }
7190 fill_stat(in, stbuf, dirstat);
11fdf7f2 7191 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7192 return r;
7193}
7194
7195unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7196{
7197 unsigned mask = 0;
7198
7199 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7200 if (flags & AT_NO_ATTR_SYNC)
7201 goto out;
7202
7203 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7204 mask |= CEPH_CAP_PIN;
7205 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7206 mask |= CEPH_CAP_AUTH_SHARED;
7207 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7208 mask |= CEPH_CAP_LINK_SHARED;
7209 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7210 mask |= CEPH_CAP_FILE_SHARED;
7211 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7212 mask |= CEPH_CAP_XATTR_SHARED;
7213out:
7214 return mask;
7215}
7216
7217int Client::statx(const char *relpath, struct ceph_statx *stx,
7218 const UserPerm& perms,
7219 unsigned int want, unsigned int flags)
7220{
11fdf7f2
TL
7221 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7222 std::lock_guard lock(client_lock);
7c673cae
FG
7223 tout(cct) << "statx" << std::endl;
7224 tout(cct) << relpath << std::endl;
181888fb
FG
7225
7226 if (unmounting)
7227 return -ENOTCONN;
7228
7c673cae
FG
7229 filepath path(relpath);
7230 InodeRef in;
7231
7232 unsigned mask = statx_to_mask(flags, want);
7233
7234 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7235 if (r < 0)
7236 return r;
7237
7238 r = _getattr(in, mask, perms);
7239 if (r < 0) {
11fdf7f2 7240 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7241 return r;
7242 }
7243
7244 fill_statx(in, mask, stx);
11fdf7f2 7245 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7246 return r;
7247}
7248
7249int Client::lstat(const char *relpath, struct stat *stbuf,
7250 const UserPerm& perms, frag_info_t *dirstat, int mask)
7251{
11fdf7f2
TL
7252 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7253 std::lock_guard lock(client_lock);
7254 tout(cct) << __func__ << std::endl;
7c673cae 7255 tout(cct) << relpath << std::endl;
181888fb
FG
7256
7257 if (unmounting)
7258 return -ENOTCONN;
7259
7c673cae
FG
7260 filepath path(relpath);
7261 InodeRef in;
7262 // don't follow symlinks
7263 int r = path_walk(path, &in, perms, false, mask);
7264 if (r < 0)
7265 return r;
7266 r = _getattr(in, mask, perms);
7267 if (r < 0) {
11fdf7f2 7268 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7269 return r;
7270 }
7271 fill_stat(in, stbuf, dirstat);
11fdf7f2 7272 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7273 return r;
7274}
7275
/*
 * Populate a struct stat from the client's cached inode state.
 * Also optionally copies out the raw fragstat/rstat for directories.
 * Returns the caps currently issued on the inode (in->caps_issued()).
 *
 * NOTE(review): directories with nlink >= 2 hit ceph_abort() below —
 * presumably the client never caches such a state; confirm before
 * relying on this path.
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapid doubles as the device number so snapshots of the same
  // file get distinct (dev, ino) identities.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      // Synthesize the POSIX link count for a directory: the parent's
      // dentry, the "." self-reference, and one ".." per subdirectory.
      st->st_nlink = 1 /* parent dentry */
		     + 1 /* <dir>/. */
		     + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report the later of ctime/mtime as st_ctime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size is either the recursive byte count or the entry
    // count, depending on client_dirsize_rbytes.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7337
/*
 * Populate a ceph_statx from cached inode state.  'mask' is the set of
 * CEPH_CAP_*_SHARED caps the caller refreshed; each group of statx
 * fields is only filled (and flagged in stx_mask) when the matching cap
 * bit is present, so callers can tell which fields are authoritative.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	// Synthesized directory link count: parent dentry + "." +
	// one ".." per subdirectory (see fill_stat for the same rule).
	stx->stx_nlink = 1 /* parent dentry */
			 + 1 /* <dir>/. */
			 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory size policy mirrors fill_stat(): recursive bytes or
      // entry count, per client_dirsize_rbytes.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7420
// Mark a dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7425
7426int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7427{
11fdf7f2
TL
7428 std::lock_guard lock(client_lock);
7429 tout(cct) << __func__ << std::endl;
7c673cae
FG
7430 tout(cct) << relpath << std::endl;
7431 tout(cct) << mode << std::endl;
181888fb
FG
7432
7433 if (unmounting)
7434 return -ENOTCONN;
7435
7c673cae
FG
7436 filepath path(relpath);
7437 InodeRef in;
7438 int r = path_walk(path, &in, perms);
7439 if (r < 0)
7440 return r;
7441 struct stat attr;
7442 attr.st_mode = mode;
7443 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7444}
7445
7446int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7447{
11fdf7f2
TL
7448 std::lock_guard lock(client_lock);
7449 tout(cct) << __func__ << std::endl;
7c673cae
FG
7450 tout(cct) << fd << std::endl;
7451 tout(cct) << mode << std::endl;
181888fb
FG
7452
7453 if (unmounting)
7454 return -ENOTCONN;
7455
7c673cae
FG
7456 Fh *f = get_filehandle(fd);
7457 if (!f)
7458 return -EBADF;
7459#if defined(__linux__) && defined(O_PATH)
7460 if (f->flags & O_PATH)
7461 return -EBADF;
7462#endif
7463 struct stat attr;
7464 attr.st_mode = mode;
7465 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7466}
7467
7468int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7469{
11fdf7f2
TL
7470 std::lock_guard lock(client_lock);
7471 tout(cct) << __func__ << std::endl;
7c673cae
FG
7472 tout(cct) << relpath << std::endl;
7473 tout(cct) << mode << std::endl;
181888fb
FG
7474
7475 if (unmounting)
7476 return -ENOTCONN;
7477
7c673cae
FG
7478 filepath path(relpath);
7479 InodeRef in;
7480 // don't follow symlinks
7481 int r = path_walk(path, &in, perms, false);
7482 if (r < 0)
7483 return r;
7484 struct stat attr;
7485 attr.st_mode = mode;
7486 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7487}
7488
7489int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7490 const UserPerm& perms)
7491{
11fdf7f2
TL
7492 std::lock_guard lock(client_lock);
7493 tout(cct) << __func__ << std::endl;
7c673cae
FG
7494 tout(cct) << relpath << std::endl;
7495 tout(cct) << new_uid << std::endl;
7496 tout(cct) << new_gid << std::endl;
181888fb
FG
7497
7498 if (unmounting)
7499 return -ENOTCONN;
7500
7c673cae
FG
7501 filepath path(relpath);
7502 InodeRef in;
7503 int r = path_walk(path, &in, perms);
7504 if (r < 0)
7505 return r;
7506 struct stat attr;
7507 attr.st_uid = new_uid;
7508 attr.st_gid = new_gid;
181888fb 7509 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7510}
7511
7512int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7513{
11fdf7f2
TL
7514 std::lock_guard lock(client_lock);
7515 tout(cct) << __func__ << std::endl;
7c673cae
FG
7516 tout(cct) << fd << std::endl;
7517 tout(cct) << new_uid << std::endl;
7518 tout(cct) << new_gid << std::endl;
181888fb
FG
7519
7520 if (unmounting)
7521 return -ENOTCONN;
7522
7c673cae
FG
7523 Fh *f = get_filehandle(fd);
7524 if (!f)
7525 return -EBADF;
7526#if defined(__linux__) && defined(O_PATH)
7527 if (f->flags & O_PATH)
7528 return -EBADF;
7529#endif
7530 struct stat attr;
7531 attr.st_uid = new_uid;
7532 attr.st_gid = new_gid;
7533 int mask = 0;
7534 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7535 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7536 return _setattr(f->inode, &attr, mask, perms);
7537}
7538
7539int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7540 const UserPerm& perms)
7541{
11fdf7f2
TL
7542 std::lock_guard lock(client_lock);
7543 tout(cct) << __func__ << std::endl;
7c673cae
FG
7544 tout(cct) << relpath << std::endl;
7545 tout(cct) << new_uid << std::endl;
7546 tout(cct) << new_gid << std::endl;
181888fb
FG
7547
7548 if (unmounting)
7549 return -ENOTCONN;
7550
7c673cae
FG
7551 filepath path(relpath);
7552 InodeRef in;
7553 // don't follow symlinks
7554 int r = path_walk(path, &in, perms, false);
7555 if (r < 0)
7556 return r;
7557 struct stat attr;
7558 attr.st_uid = new_uid;
7559 attr.st_gid = new_gid;
7560 int mask = 0;
7561 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7562 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7563 return _setattr(in, &attr, mask, perms);
7564}
7565
11fdf7f2
TL
// Helper: copy atime/mtime into the platform-specific timestamp fields
// of a struct stat (via the stat_set_* accessors), ready to hand to
// _setattr() with CEPH_SETATTR_ATIME|CEPH_SETATTR_MTIME.
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7575
7576// for [l]utime() invoke the timeval variant as the timespec
7577// variant are not yet implemented. for futime[s](), invoke
7578// the timespec variant.
7c673cae
FG
7579int Client::utime(const char *relpath, struct utimbuf *buf,
7580 const UserPerm& perms)
7581{
11fdf7f2
TL
7582 struct timeval tv[2];
7583 tv[0].tv_sec = buf->actime;
7584 tv[0].tv_usec = 0;
7585 tv[1].tv_sec = buf->modtime;
7586 tv[1].tv_usec = 0;
7587
7588 return utimes(relpath, tv, perms);
7589}
7590
7591int Client::lutime(const char *relpath, struct utimbuf *buf,
7592 const UserPerm& perms)
7593{
7594 struct timeval tv[2];
7595 tv[0].tv_sec = buf->actime;
7596 tv[0].tv_usec = 0;
7597 tv[1].tv_sec = buf->modtime;
7598 tv[1].tv_usec = 0;
7599
7600 return lutimes(relpath, tv, perms);
7601}
7602
7603int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7604{
7605 struct timespec ts[2];
7606 ts[0].tv_sec = buf->actime;
7607 ts[0].tv_nsec = 0;
7608 ts[1].tv_sec = buf->modtime;
7609 ts[1].tv_nsec = 0;
7610
7611 return futimens(fd, ts, perms);
7612}
7613
7614int Client::utimes(const char *relpath, struct timeval times[2],
7615 const UserPerm& perms)
7616{
7617 std::lock_guard lock(client_lock);
7618 tout(cct) << __func__ << std::endl;
7c673cae 7619 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7620 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7621 << std::endl;
7622 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7623 << std::endl;
181888fb
FG
7624
7625 if (unmounting)
7626 return -ENOTCONN;
7627
7c673cae
FG
7628 filepath path(relpath);
7629 InodeRef in;
7630 int r = path_walk(path, &in, perms);
7631 if (r < 0)
7632 return r;
7633 struct stat attr;
11fdf7f2
TL
7634 utime_t atime(times[0]);
7635 utime_t mtime(times[1]);
7636
7637 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7638 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7639}
7640
11fdf7f2
TL
7641int Client::lutimes(const char *relpath, struct timeval times[2],
7642 const UserPerm& perms)
7c673cae 7643{
11fdf7f2
TL
7644 std::lock_guard lock(client_lock);
7645 tout(cct) << __func__ << std::endl;
7c673cae 7646 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7647 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7648 << std::endl;
7649 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7650 << std::endl;
181888fb
FG
7651
7652 if (unmounting)
7653 return -ENOTCONN;
7654
7c673cae
FG
7655 filepath path(relpath);
7656 InodeRef in;
7c673cae
FG
7657 int r = path_walk(path, &in, perms, false);
7658 if (r < 0)
7659 return r;
7660 struct stat attr;
11fdf7f2
TL
7661 utime_t atime(times[0]);
7662 utime_t mtime(times[1]);
7663
7664 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7665 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7666}
7667
11fdf7f2
TL
7668int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7669{
7670 struct timespec ts[2];
7671 ts[0].tv_sec = times[0].tv_sec;
7672 ts[0].tv_nsec = times[0].tv_usec * 1000;
7673 ts[1].tv_sec = times[1].tv_sec;
7674 ts[1].tv_nsec = times[1].tv_usec * 1000;
7675
7676 return futimens(fd, ts, perms);
7677}
7678
7679int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7680{
7681 std::lock_guard lock(client_lock);
7682 tout(cct) << __func__ << std::endl;
7683 tout(cct) << fd << std::endl;
7684 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7685 << std::endl;
7686 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7687 << std::endl;
7688
7689 if (unmounting)
7690 return -ENOTCONN;
7691
7692 Fh *f = get_filehandle(fd);
7693 if (!f)
7694 return -EBADF;
7695#if defined(__linux__) && defined(O_PATH)
7696 if (f->flags & O_PATH)
7697 return -EBADF;
7698#endif
7699 struct stat attr;
7700 utime_t atime(times[0]);
7701 utime_t mtime(times[1]);
7702
7703 attr_set_atime_and_mtime(&attr, atime, mtime);
7704 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7705}
7706
7c673cae
FG
7707int Client::flock(int fd, int operation, uint64_t owner)
7708{
11fdf7f2
TL
7709 std::lock_guard lock(client_lock);
7710 tout(cct) << __func__ << std::endl;
7c673cae
FG
7711 tout(cct) << fd << std::endl;
7712 tout(cct) << operation << std::endl;
7713 tout(cct) << owner << std::endl;
181888fb
FG
7714
7715 if (unmounting)
7716 return -ENOTCONN;
7717
7c673cae
FG
7718 Fh *f = get_filehandle(fd);
7719 if (!f)
7720 return -EBADF;
7721
7722 return _flock(f, operation, owner);
7723}
7724
7725int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7726{
11fdf7f2
TL
7727 std::lock_guard lock(client_lock);
7728 tout(cct) << __func__ << std::endl;
7c673cae 7729 tout(cct) << relpath << std::endl;
181888fb
FG
7730
7731 if (unmounting)
7732 return -ENOTCONN;
7733
7c673cae
FG
7734 filepath path(relpath);
7735 InodeRef in;
7736 int r = path_walk(path, &in, perms, true);
7737 if (r < 0)
7738 return r;
7739 if (cct->_conf->client_permissions) {
7740 int r = may_open(in.get(), O_RDONLY, perms);
7741 if (r < 0)
7742 return r;
7743 }
7744 r = _opendir(in.get(), dirpp, perms);
7745 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7746 if (r != -ENOTDIR)
7747 tout(cct) << (unsigned long)*dirpp << std::endl;
7748 return r;
7749}
7750
7751int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7752{
7753 if (!in->is_dir())
7754 return -ENOTDIR;
7755 *dirpp = new dir_result_t(in, perms);
7756 opened_dirs.insert(*dirpp);
11fdf7f2 7757 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7758 return 0;
7759}
7760
7761
/*
 * Close a readdir cursor previously returned by opendir().
 * Always succeeds (returns 0); the cursor is freed and must not be
 * used afterwards.
 */
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7772
/*
 * Tear down a readdir cursor: drop its inode reference, discard any
 * buffered entries, deregister it from opened_dirs, and free it.
 */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    // Release the directory inode ref so it can be trimmed.
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7784
7785void Client::rewinddir(dir_result_t *dirp)
7786{
11fdf7f2
TL
7787 std::lock_guard lock(client_lock);
7788 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb
FG
7789
7790 if (unmounting)
7791 return;
7792
7c673cae
FG
7793 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7794 _readdir_drop_dirp_buffer(d);
7795 d->reset();
7796}
7797
/*
 * Return the current readdir offset for this cursor.
 * NOTE(review): unlike seekdir()/rewinddir(), this does not take
 * client_lock before reading d->offset — confirm that is intentional.
 */
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7804
/*
 * Reposition a readdir cursor to 'offset' (a value previously obtained
 * from telldir(), or 0).  Buffered entries are dropped whenever the
 * target offset cannot be served from the currently buffered fragment.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  // Seeking to the current position is a no-op.
  if (offset == dirp->offset)
    return;

  // Any seek invalidates the cursor's claim to a complete, ordered
  // traversal, which is what readdir-cache filling depends on.
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek forces a refetch.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: refetch when rewinding to the start, when
    // the target lies in a different fragment, or when it precedes the
    // buffered window within the same fragment.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7838
7839
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
/*
 * Fill a struct dirent for a readdir result.  Names longer than 255
 * bytes are silently truncated to fit d_name.
 * NOTE(review): d_reclen is set to 1, not the actual record length —
 * presumably consumers here never use it as a real length; confirm.
 */
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  // d_off is the cursor position of the *next* entry, per readdir(3).
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7862
/*
 * Advance a readdir cursor past the fragment it just finished.  If the
 * buffered fragment was the rightmost one, the whole listing is done
 * and the cursor is marked at end.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; in hash order the offset encodes the frag's hash
    // position, so only move forward — never decrease the offset.
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Frag order: restart name-based continuation in the new fragment
    // and re-resolve it against the current dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7888
/*
 * Re-resolve the cursor's current fragment against the directory's
 * (possibly refreshed) dirfragtree.  If the fragment was split or
 * merged since the offset was recorded, restart the cursor at the
 * beginning of the fragment that now covers it.  No-op in hash order,
 * where offsets are frag-independent.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    // offset 2 skips the implicit "." and ".." slots.
    dirp->next_offset = 2;
  }
}
7905
// Discard the buffered dirfrag contents so the next readdir step refetches
// from the MDS (buffer_frag/offsets themselves are left untouched).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7911
// Fetch the dirfrag the cursor currently points at from the MDS (READDIR,
// or LSSNAP for the .snap pseudo-directory) into dirp->buffer.
//
// On -EAGAIN (frag moved/refragmented on the MDS) the fragtree is
// re-consulted and the fetch retried recursively.  Any other error marks
// the stream as ended.  Returns 0 on success or a negative errno.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  // Ownership of req passes to make_request(), which releases it.
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Resume listing after the last entry we already returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  // Reply handling fills dirp->buffer/buffer_frag via this back-pointer.
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // Frag no longer exists as requested; remap against the (updated)
    // fragtree and retry.  NOTE(review): recursion depth is bounded only
    // by how often the MDS keeps returning EAGAIN.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7966
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by their directory offset using fpos_cmp (frag-aware compare).
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7972
// Serve readdir entries from the locally cached, complete-and-ordered
// directory instead of contacting the MDS.
//
// For each cached dentry at/after dirp->offset: refresh attrs, build a
// dirent/statx pair, then invoke the caller's callback with client_lock
// DROPPED (re-taken afterwards).  Returns -EAGAIN if the cache stops being
// complete+ordered mid-walk (caller falls back to MDS readdir), a negative
// errno on failure, the callback's positive value to stop early, or 0 at
// end of directory.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(client_lock.is_locked());
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Binary-search the cache for the first dentry at or past our offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // The cache may be invalidated while client_lock is dropped below;
    // re-check every iteration and bail out to the slow path if so.
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative dentry
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // stale entry from a previous cap-share generation
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // Hand the callback an ll reference it must drop itself.
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // The callback may block (e.g. copy to userspace), so release the
    // client lock around it; dn/pd may be invalidated on return.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // Only commit cursor state after the entry was successfully delivered.
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8056
// Core readdir loop: stream directory entries to the supplied callback.
//
// Synthesizes "." (offset 0) and ".." (offset 1), then serves entries
// either from the local dentry cache (when the dir is complete+ordered and
// we hold FILE_SHARED) or by fetching dirfrags from the MDS.  The callback
// runs with client_lock dropped; a negative return aborts with that error,
// a positive return stops iteration early, 0 continues.  When the whole
// directory was walked without invalidation, the inode is flagged
// I_COMPLETE (and I_DIR_ORDERED) so future listings can stay local.
// Returns 0 at end of directory, the callback's positive value, or a
// negative errno.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    // Synthesize "." for the directory itself.
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // Callback runs unlocked; may block.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    // Synthesize ".." — the first parent, or the dir itself at the root.
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN: cache was invalidated mid-walk; fall through to MDS path.
    if (err != -EAGAIN)
      return err;
  }

  // Slow path: page dirfrags in from the MDS one buffer at a time.
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // This frag has more entries than one reply could carry.
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Full directory walked; if nothing changed underneath us, mark the
    // cached dir contents authoritative for future listings.
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
8249
8250
// readdir_r: fill `de` with the next entry of `d`.  Thin wrapper over
// readdirplus_r with no statx/want/flags and no inode ref returned.
// Returns 1 when an entry was produced, 0 at EOF, negative errno on error.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8255
8256/*
8257 * readdirplus_r
8258 *
8259 * returns
8260 * 1 if we got a dirent
8261 * 0 for end of directory
8262 * <0 on error
8263 */
8264
// Callback state for fetching exactly one directory entry.
struct single_readdir {
  struct dirent *de;      // caller-supplied dirent to fill
  struct ceph_statx *stx; // optional caller-supplied statx (may be NULL)
  Inode *inode;           // inode handed back by the callback (if requested)
  bool full;              // set once an entry has been stored
};
8271
8272static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8273 struct ceph_statx *stx, off_t off,
8274 Inode *in)
8275{
8276 single_readdir *c = static_cast<single_readdir *>(p);
8277
8278 if (c->full)
8279 return -1; // already filled this dirent
8280
8281 *c->de = *de;
8282 if (c->stx)
8283 *c->stx = *stx;
8284 c->inode = in;
8285 c->full = true;
8286 return 1;
8287}
8288
// Classic readdir(3)-style interface: return a pointer to the next entry
// or NULL at EOF/error (errno set on error).
//
// NOTE(review): the returned dirent lives in function-local static storage,
// so this is not thread-safe and each call overwrites the previous result —
// inherent to the readdir() contract; use readdir_r/readdirplus_r instead
// from threaded code.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    // Real error (the -1 from the callback just means "stop").
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8311
8312int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8313 struct ceph_statx *stx, unsigned want,
8314 unsigned flags, Inode **out)
8315{
8316 single_readdir sr;
8317 sr.de = de;
8318 sr.stx = stx;
8319 sr.inode = NULL;
8320 sr.full = false;
8321
8322 // our callback fills the dirent and sets sr.full=true on first
8323 // call, and returns -1 the second time around.
8324 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8325 if (r < -1)
8326 return r;
8327 if (out)
8328 *out = sr.inode;
8329 if (sr.full)
8330 return 1;
8331 return 0;
8332}
8333
8334
8335/* getdents */
/* getdents */
// Callback state for packing entries into a flat caller-provided buffer.
struct getdents_result {
  char *buf;    // destination buffer
  int buflen;   // total buffer capacity in bytes
  int pos;      // bytes written so far
  bool fullent; // true: pack whole dirents; false: pack NUL-terminated names
};
8342
8343static int _readdir_getdent_cb(void *p, struct dirent *de,
8344 struct ceph_statx *stx, off_t off, Inode *in)
8345{
8346 struct getdents_result *c = static_cast<getdents_result *>(p);
8347
8348 int dlen;
8349 if (c->fullent)
8350 dlen = sizeof(*de);
8351 else
8352 dlen = strlen(de->d_name) + 1;
8353
8354 if (c->pos + dlen > c->buflen)
8355 return -1; // doesn't fit
8356
8357 if (c->fullent) {
8358 memcpy(c->buf + c->pos, de, sizeof(*de));
8359 } else {
8360 memcpy(c->buf + c->pos, de->d_name, dlen);
8361 }
8362 c->pos += dlen;
8363 return 0;
8364}
8365
8366int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8367{
8368 getdents_result gr;
8369 gr.buf = buf;
8370 gr.buflen = buflen;
8371 gr.fullent = fullent;
8372 gr.pos = 0;
8373
8374 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8375
8376 if (r < 0) { // some error
8377 if (r == -1) { // buffer ran out of space
8378 if (gr.pos) { // but we got some entries already!
8379 return gr.pos;
8380 } // or we need a larger buffer
8381 return -ERANGE;
8382 } else { // actual error, return it
8383 return r;
8384 }
8385 }
8386 return gr.pos;
8387}
8388
8389
8390/* getdir */
/* getdir */
// Callback state for collecting entry names into a list.
struct getdir_result {
  list<string> *contents; // destination list of names
  int num;                // number of entries collected
};
8395
8396static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8397{
8398 getdir_result *r = static_cast<getdir_result *>(p);
8399
8400 r->contents->push_back(de->d_name);
8401 r->num++;
8402 return 0;
8403}
8404
8405int Client::getdir(const char *relpath, list<string>& contents,
8406 const UserPerm& perms)
8407{
8408 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8409 {
11fdf7f2 8410 std::lock_guard lock(client_lock);
7c673cae
FG
8411 tout(cct) << "getdir" << std::endl;
8412 tout(cct) << relpath << std::endl;
8413 }
8414
8415 dir_result_t *d;
8416 int r = opendir(relpath, &d, perms);
8417 if (r < 0)
8418 return r;
8419
8420 getdir_result gr;
8421 gr.contents = &contents;
8422 gr.num = 0;
8423 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8424
8425 closedir(d);
8426
8427 if (r < 0)
8428 return r;
8429 return gr.num;
8430}
8431
8432
8433/****** file i/o **********/
// Open (and optionally create) the file at `relpath`, returning a new
// integer file descriptor on success or a negative errno.
//
// Striping parameters (stripe_unit/count, object_size, data_pool) only
// apply when a new file is created; pass zeros/NULL for defaults.
// Honors O_CREAT|O_EXCL (-EEXIST), O_NOFOLLOW (-ELOOP on symlink) and the
// Linux O_PATH flag-masking behavior.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // Exclusive create on an existing path fails outright.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Target missing and creation requested: walk to the parent, check
    // create permission, then create the file (which also opens it).
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may have supplied a handle already; otherwise open now.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8518
// Convenience overload: open with the filesystem's default file striping.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8524
8525int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8526 const UserPerm& perms)
8527{
11fdf7f2
TL
8528 std::lock_guard lock(client_lock);
8529 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8530
181888fb
FG
8531 if (unmounting)
8532 return -ENOTCONN;
8533
7c673cae
FG
8534 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8535 filepath path(ino);
8536 req->set_filepath(path);
8537
8538 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8539 char f[30];
8540 sprintf(f, "%u", h);
8541 filepath path2(dirino);
8542 path2.push_dentry(string(f));
8543 req->set_filepath2(path2);
8544
8545 int r = make_request(req, perms, NULL, NULL,
8546 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8547 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8548 return r;
8549}
8550
8551
8552/**
8553 * Load inode into local cache.
8554 *
8555 * If inode pointer is non-NULL, and take a reference on
8556 * the resulting Inode object in one operation, so that caller
8557 * can safely assume inode will still be there after return.
8558 */
// Load the inode numbered `ino` into the local cache via an MDS LOOKUPINO.
// When `inode` is non-NULL, *inode receives the cached Inode* with an ll
// reference taken, so the caller may use it after return (and must drop
// the reference itself).  Expects client_lock held by the caller.
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // Any active MDS can resolve an ino; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // A successful reply guarantees the inode is now in inode_map.
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8581
1adf2230
AA
// Public wrapper: take client_lock, then look the inode up by number.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8587
8588/**
8589 * Find the parent inode of `ino` and insert it into
8590 * our cache. Conditionally also set `parent` to a referenced
8591 * Inode* if caller provides non-NULL value.
8592 */
// Resolve the parent directory of `ino` via an MDS LOOKUPPARENT and cache
// it.  When `parent` is non-NULL, *parent receives the parent Inode* with
// an ll reference taken on success, or NULL on failure.
// NOTE(review): unlike the sibling _lookup_ino/_lookup_name, there is no
// `unmounting` guard here — presumably callers check before entry; confirm.
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8616
7c673cae
FG
8617/**
8618 * Populate the parent dentry for `ino`, provided it is
8619 * a child of `parent`.
8620 */
// Populate the dentry linking `ino` under directory `parent` via an MDS
// LOOKUPNAME, so a subsequent path for `ino` can be constructed locally.
// `parent` must be a directory.  Returns the request result.
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  // filepath2 carries the parent dir; filepath carries the child ino.
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8638
1adf2230
AA
// Public wrapper: take client_lock, then populate ino's dentry in parent.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae 8644
11fdf7f2 8645Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 8646{
11fdf7f2
TL
8647 ceph_assert(in);
8648 Fh *f = new Fh(in, flags, cmode, perms);
7c673cae 8649
11fdf7f2 8650 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
8651
8652 if (in->snapid != CEPH_NOSNAP) {
8653 in->snap_cap_refs++;
8654 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8655 << ccap_string(in->caps_issued()) << dendl;
8656 }
8657
11fdf7f2 8658 const auto& conf = cct->_conf;
7c673cae
FG
8659 f->readahead.set_trigger_requests(1);
8660 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8661 uint64_t max_readahead = Readahead::NO_LIMIT;
8662 if (conf->client_readahead_max_bytes) {
11fdf7f2 8663 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
8664 }
8665 if (conf->client_readahead_max_periods) {
11fdf7f2 8666 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
8667 }
8668 f->readahead.set_max_readahead_size(max_readahead);
8669 vector<uint64_t> alignments;
8670 alignments.push_back(in->layout.get_period());
8671 alignments.push_back(in->layout.stripe_unit);
8672 f->readahead.set_alignments(alignments);
8673
8674 return f;
8675}
8676
// Tear down an open-file handle: drop its open/cap references, flush dirty
// data when this was the last opener, release file locks, and surface any
// asynchronous write-back error that accumulated on the handle.
// Returns 0 or the captured async error.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // Drop any delegation held through this handle.
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // put_open_ref() returns true when this was the last open in this mode:
    // flush dirty data and re-evaluate what caps we still want.
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot handles only pinned a snap cap reference (see _create_fh).
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8711
8712void Client::_put_fh(Fh *f)
8713{
8714 int left = f->put();
8715 if (!left) {
8716 delete f;
8717 }
8718}
8719
// Open an already-resolved inode: register the open mode, acquire the
// needed caps (locally if already issued, otherwise via an MDS OPEN), and
// hand back a new Fh through *fhp on success.
//
// Writable opens of snapshots are refused with -EROFS.  When delegations
// are enabled, the open additionally waits on minimal caps so that other
// clients holding delegations can return them first (NFS/SMB semantics).
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // Fast path: we already hold every cap this mode needs (and are not
    // truncating), so no MDS round trip is required.
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // CEPH_O_CREAT is stripped: creation was already handled by the caller.
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	// Failed after registering the open: undo the open ref.
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8800
// Re-acquire file caps for an inode whose caps went stale (e.g. after MDS
// session recovery).  If we still hold suitable caps (or an auth cap for
// write intents) just nudge cap wanted-state; otherwise replay an OPEN with
// flags matching what the file modes currently want.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // Map the wanted RD/WR caps back onto POSIX open flags for the replay.
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8838
8839int Client::close(int fd)
8840{
8841 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
11fdf7f2 8842 std::lock_guard lock(client_lock);
7c673cae
FG
8843 tout(cct) << "close" << std::endl;
8844 tout(cct) << fd << std::endl;
8845
181888fb
FG
8846 if (unmounting)
8847 return -ENOTCONN;
8848
7c673cae
FG
8849 Fh *fh = get_filehandle(fd);
8850 if (!fh)
8851 return -EBADF;
8852 int err = _release_fh(fh);
8853 fd_map.erase(fd);
8854 put_fd(fd);
8855 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8856 return err;
8857}
8858
8859
8860// ------------
8861// read, write
8862
8863loff_t Client::lseek(int fd, loff_t offset, int whence)
8864{
11fdf7f2 8865 std::lock_guard lock(client_lock);
7c673cae
FG
8866 tout(cct) << "lseek" << std::endl;
8867 tout(cct) << fd << std::endl;
8868 tout(cct) << offset << std::endl;
8869 tout(cct) << whence << std::endl;
8870
181888fb
FG
8871 if (unmounting)
8872 return -ENOTCONN;
8873
7c673cae
FG
8874 Fh *f = get_filehandle(fd);
8875 if (!f)
8876 return -EBADF;
8877#if defined(__linux__) && defined(O_PATH)
8878 if (f->flags & O_PATH)
8879 return -EBADF;
8880#endif
8881 return _lseek(f, offset, whence);
8882}
8883
8884loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8885{
8886 Inode *in = f->inode.get();
8887 int r;
11fdf7f2 8888 loff_t pos = -1;
7c673cae
FG
8889
8890 switch (whence) {
8891 case SEEK_SET:
11fdf7f2 8892 pos = offset;
7c673cae
FG
8893 break;
8894
8895 case SEEK_CUR:
11fdf7f2 8896 pos += offset;
7c673cae
FG
8897 break;
8898
8899 case SEEK_END:
8900 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8901 if (r < 0)
8902 return r;
11fdf7f2 8903 pos = in->size + offset;
7c673cae
FG
8904 break;
8905
8906 default:
8907 ceph_abort();
8908 }
8909
11fdf7f2
TL
8910 if (pos < 0) {
8911 return -EINVAL;
8912 } else {
8913 f->pos = pos;
8914 }
8915
1adf2230 8916 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8917 return f->pos;
8918}
8919
8920
// Serialize access to the handle's file position (for stateful read/write).
//
// Waiters queue in FIFO order, each blocking on its own Cond pushed onto
// f->pos_waiters; a waiter proceeds only once the lock is free AND it has
// reached the front of the queue.  Must be called with client_lock held
// (Cond::Wait releases and re-takes it).
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Wait until the lock is free and we are the oldest waiter.
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8938
8939void Client::unlock_fh_pos(Fh *f)
8940{
11fdf7f2 8941 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8942 f->pos_locked = false;
8943}
8944
// Migrate MDS-inlined file data out to the first RADOS object so normal
// object I/O can proceed.  `onfinish` is completed when the write lands
// (or immediately with 0 when there is no inline data).  Returns 0; the
// real outcome is delivered through `onfinish`.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // Object name of chunk 0 of this file: "<ino-in-hex>.00000000".
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // Step 1: make sure the object exists (create without exclusive flag).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Step 2: guarded write — only apply if the object's recorded
  // inline_version is older, so concurrent uninline attempts cannot
  // clobber newer data; then stamp the new version.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8989
8990//
8991
8992// blocking osd interface
8993
/*
 * read() entry point: read up to `size` bytes from fd at `offset`
 * (offset < 0 means "use and advance the fd's current position" — see
 * _read()).  Returns bytes read, or a negative errno (-EBADF for a bad
 * or O_PATH handle, -ENOTCONN while unmounting).
 */
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return more bytes read than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // Copy the gathered bufferlist into the caller's flat buffer.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9023
9024int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9025{
9026 if (iovcnt < 0)
9027 return -EINVAL;
9028 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9029}
9030
/*
 * Core read path (client_lock held).
 *
 * offset < 0 means "read at the fd's current position and advance it";
 * the fh position lock is held across the read in that case (movepos).
 * Handles inline data, cached (objectcacher) reads and sync reads; on a
 * short sync read, re-fetches the size and retries until EOF is proven.
 *
 * Returns bytes read (>= 0) or a negative errno.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  // Handle must have been opened for read.
  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // Negative offset: use (and later advance) the fd's own position.
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know the inline-data state;
  // fetch it from the MDS before deciding how to read.
  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without cache caps: kick off migration
      // of the inline data to RADOS, wait for it at `done`.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read directly from the in-memory inline data,
      // zero-filling the gap between inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // Cached path.  O_RSYNC: push dirty data in range to the OSDs first.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read: drop our caps, re-verify the file size with the MDS,
      // and retry if there is still data beyond what we got.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait (with client_lock dropped) for the inline-data migration.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      // -ECANCELED: the cmpxattr guard fired, i.e. someone else already
      // migrated the data — treat as success.
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9170
// Completion context for a background readahead issued by _read_async().
// Holds a reference on the Fh and counts as a pending readahead for the
// lifetime of the context; the read-side cap refs taken when the
// readahead was submitted are dropped in finish().
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // Release the cap refs pinned for this readahead; the result `r` is
  // intentionally ignored (readahead is best-effort).
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9186
/*
 * Read through the object cacher, possibly blocking (with client_lock
 * dropped) while data is fetched, then opportunistically kick off a
 * background readahead sized by Fh::readahead.
 *
 * The requested range is first trimmed to the currently-known file size.
 * Returns bytes read (>= 0) or a negative errno.  client_lock must be
 * held on entry and is held again on return.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: pin the cache cap and wait for the fetch with
    // client_lock released so other client work can proceed.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // Fire-and-forget populate of the cache (bl == NULL); C_Readahead
      // drops the cap refs when the background read completes.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// r2 != 0: data already cached, the context will never fire —
	// delete it ourselves.
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9241
9242int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9243 bool *checkeof)
9244{
9245 Inode *in = f->inode.get();
9246 uint64_t pos = off;
9247 int left = len;
9248 int read = 0;
9249
11fdf7f2 9250 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
9251
9252 Mutex flock("Client::_read_sync flock");
9253 Cond cond;
9254 while (left > 0) {
11fdf7f2 9255 C_SaferCond onfinish("Client::_read_sync flock");
7c673cae
FG
9256 bufferlist tbl;
9257
9258 int wanted = left;
9259 filer->read_trunc(in->ino, &in->layout, in->snapid,
9260 pos, left, &tbl, 0,
9261 in->truncate_size, in->truncate_seq,
11fdf7f2 9262 &onfinish);
7c673cae 9263 client_lock.Unlock();
11fdf7f2 9264 int r = onfinish.wait();
7c673cae
FG
9265 client_lock.Lock();
9266
9267 // if we get ENOENT from OSD, assume 0 bytes returned
9268 if (r == -ENOENT)
9269 r = 0;
9270 if (r < 0)
9271 return r;
9272 if (tbl.length()) {
9273 r = tbl.length();
9274
9275 read += r;
9276 pos += r;
9277 left -= r;
9278 bl->claim_append(tbl);
9279 }
9280 // short read?
9281 if (r >= 0 && r < wanted) {
9282 if (pos < in->size) {
9283 // zero up to known EOF
9284 int64_t some = in->size - pos;
9285 if (some > left)
9286 some = left;
11fdf7f2
TL
9287 auto z = buffer::ptr_node::create(some);
9288 z->zero();
9289 bl->push_back(std::move(z));
7c673cae
FG
9290 read += some;
9291 pos += some;
9292 left -= some;
9293 if (left == 0)
9294 return read;
9295 }
9296
9297 *checkeof = true;
9298 return read;
9299 }
9300 }
9301 return read;
9302}
9303
9304
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 *
 * Called when one synchronous write (issued by _write's uncached path)
 * has committed: drops the global unsafe_sync_write count and the
 * CEPH_CAP_FILE_BUFFER ref taken when the write was submitted, and wakes
 * a pending unmount once the last unsafe write drains.
 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // Unmount waits on mount_cond for in-flight sync writes to finish.
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9322
/*
 * write() entry point: write `size` bytes to fd at `offset` (offset < 0
 * means "use and advance the fd's current position" — see _write()).
 * Returns bytes written or a negative errno (-EBADF for a bad or O_PATH
 * handle, -ENOTCONN while unmounting).
 */
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9347
9348int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9349{
9350 if (iovcnt < 0)
9351 return -EINVAL;
9352 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9353}
9354
11fdf7f2
TL
/*
 * Shared scatter/gather implementation for preadv/pwritev (client_lock
 * held, fh already validated).
 *
 * Sums the iovec lengths into one logical I/O, optionally clamping the
 * total to INT_MAX for callers whose public return type is a 32-bit int.
 * Writes pass the iovec straight through to _write(); reads go through a
 * single bufferlist which is then scattered back into the iovecs.
 */
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				   unsigned iovcnt, int64_t offset, bool write,
				   bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the single bufferlist back across the caller's iovecs.
    // NOTE(review): `resid` is `unsigned`, so a read larger than UINT_MAX
    // would truncate here — only reachable when clamp_to_int is false;
    // confirm all such callers bound the size elsewhere.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9405
11fdf7f2
TL
/*
 * fd-based wrapper for the vectored I/O path: takes client_lock,
 * resolves the file handle, and forwards to _preadv_pwritev_locked with
 * clamp_to_int=true (this entry point returns a 32-bit int).
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
9420
/*
 * Core write path (client_lock held).
 *
 * Data comes either from `buf` (flat buffer) or from `iov`/`iovcnt`
 * (scatter list) — exactly one should be non-NULL.  offset < 0 means
 * "use the fd position" (honoring O_APPEND).  Handles quota and pool-full
 * checks, setuid/setgid stripping, inline data (in-place update or
 * migration to RADOS), buffered writes via the object cacher, and
 * synchronous writes via the Filer.  On success updates size/mtime/ctime
 * and dirties CEPH_CAP_FILE_WR.
 *
 * Returns bytes written (== size) or a negative errno.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  // New fd position to install on success (only when offset was < 0).
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0: inline-data state unknown, fetch from the MDS.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed only to inspect in->mode below.
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses buffering even if we hold buffer caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline (or no buffer cap): migrate the
      // inline data out to RADOS, wait for it at `done`.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // Splice the new bytes into the inline blob:
      // keep the tail beyond endoff, drop/zero-pad around [offset, endoff).
      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // Block for the OSD commit with client_lock dropped; the matching
    // bookkeeping (unsafe_sync_write--, cap ref) is in _sync_write_commit.
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait (with client_lock dropped) for the inline-data migration.
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      // -ECANCELED: cmpxattr guard fired, someone else already migrated
      // the data — treat as success.
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9651
/*
 * flush() semantics for a file handle: report (and clear) any error that
 * background writeback has recorded on the handle since the last check.
 * Returns 0 or the captured negative errno.
 */
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}
9665
9666int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9667{
9668 struct ceph_statx stx;
9669 stx.stx_size = length;
9670 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9671}
9672
9673int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9674{
11fdf7f2
TL
9675 std::lock_guard lock(client_lock);
9676 tout(cct) << __func__ << std::endl;
7c673cae
FG
9677 tout(cct) << fd << std::endl;
9678 tout(cct) << length << std::endl;
9679
181888fb
FG
9680 if (unmounting)
9681 return -ENOTCONN;
9682
7c673cae
FG
9683 Fh *f = get_filehandle(fd);
9684 if (!f)
9685 return -EBADF;
9686#if defined(__linux__) && defined(O_PATH)
9687 if (f->flags & O_PATH)
9688 return -EBADF;
9689#endif
9690 struct stat attr;
9691 attr.st_size = length;
9692 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9693}
9694
/*
 * fsync() entry point.  Delegates to _fsync(Fh*, bool); on success also
 * surfaces any asynchronous writeback error recorded on the handle, and
 * in both cases clears the handle's async-error state so the same error
 * is not reported twice.
 */
int Client::fsync(int fd, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9730
/*
 * Flush an inode's dirty data — and, unless syncdataonly, its dirty
 * metadata (caps) and unsafe MDS requests — waiting for everything to
 * commit.  client_lock must be held; it is dropped while waiting on the
 * object cacher.  Returns 0 or a negative errno from data writeback.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty caps to the MDS now; remember the flush tid to wait on.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Ask the MDS to flush its log, then wait for our newest unsafe
    // request on this inode to become safe (implies all earlier ones).
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9797
/*
 * Handle-level fsync: log the request and delegate to the inode-level
 * _fsync() on the handle's inode.
 */
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9803
/*
 * fstat() entry point: refresh the inode attributes covered by `mask`
 * from the MDS, then fill the caller's struct stat.  Returns 0 or a
 * negative errno (-EBADF for a bad handle, -ENOTCONN while unmounting).
 */
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9823
/*
 * fstatx() entry point: like fstat() but with statx-style want/flags.
 * A round trip to the MDS is skipped when the locally-cached caps
 * already cover every field the caller asked for.
 */
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // Only hit the MDS if some requested field is not covered by caps.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9853
9854// not written yet, but i want to link!
9855
/*
 * Change the client's working directory to `relpath` and return the
 * resulting absolute path in `new_cwd`.  Returns 0 or a negative errno
 * from the path walk (-ENOTCONN while unmounting).
 */
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // Swap rather than assign so the previous cwd ref is released via `in`.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9878
/*
 * Build the absolute path of the current working directory by walking
 * dentry parents from cwd up to root, issuing LOOKUPNAME requests to the
 * MDS for any ancestor whose parent link is not cached.  On success
 * `dir` is set to "/<path>"; if the cwd (or an ancestor) has been
 * unlinked, `dir` is left untouched.  client_lock must be held.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    // NOTE(review): dentries is non-empty here, so it is unclear whether
    // get_first_parent() can actually return NULL — confirm; if not,
    // this lookup branch is dead code.
    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9918
b5b8bbf5
FG
/*
 * Public getcwd(): takes client_lock and delegates to _getcwd().
 * No-op (dir left unchanged) while the client is unmounting.
 */
void Client::getcwd(string& dir, const UserPerm& perms)
{
  std::lock_guard l(client_lock);
  if (!unmounting)
    _getcwd(dir, perms);
}
9925
7c673cae
FG
9926int Client::statfs(const char *path, struct statvfs *stbuf,
9927 const UserPerm& perms)
9928{
11fdf7f2
TL
9929 std::lock_guard l(client_lock);
9930 tout(cct) << __func__ << std::endl;
91327a77 9931 unsigned long int total_files_on_fs;
7c673cae 9932
181888fb
FG
9933 if (unmounting)
9934 return -ENOTCONN;
9935
7c673cae
FG
9936 ceph_statfs stats;
9937 C_SaferCond cond;
d2e6a577
FG
9938
9939 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9940 if (data_pools.size() == 1) {
9941 objecter->get_fs_stats(stats, data_pools[0], &cond);
9942 } else {
9943 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9944 }
7c673cae
FG
9945
9946 client_lock.Unlock();
9947 int rval = cond.wait();
91327a77
AA
9948 assert(root);
9949 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9950 client_lock.Lock();
9951
9952 if (rval < 0) {
9953 ldout(cct, 1) << "underlying call to statfs returned error: "
9954 << cpp_strerror(rval)
9955 << dendl;
9956 return rval;
9957 }
9958
9959 memset(stbuf, 0, sizeof(*stbuf));
9960
9961 /*
9962 * we're going to set a block size of 4MB so we can represent larger
9963 * FSes without overflowing. Additionally convert the space
9964 * measurements from KB to bytes while making them in terms of
9965 * blocks. We use 4MB only because it is big enough, and because it
9966 * actually *is* the (ceph) default block size.
9967 */
9968 const int CEPH_BLOCK_SHIFT = 22;
9969 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9970 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9971 stbuf->f_files = total_files_on_fs;
9972 stbuf->f_ffree = 0;
7c673cae
FG
9973 stbuf->f_favail = -1;
9974 stbuf->f_fsid = -1; // ??
9975 stbuf->f_flag = 0; // ??
9976 stbuf->f_namemax = NAME_MAX;
9977
9978 // Usually quota_root will == root_ancestor, but if the mount root has no
9979 // quota but we can see a parent of it that does have a quota, we'll
9980 // respect that one instead.
11fdf7f2 9981 ceph_assert(root != nullptr);
7c673cae
FG
9982 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9983
9984 // get_quota_root should always give us something
9985 // because client quotas are always enabled
11fdf7f2 9986 ceph_assert(quota_root != nullptr);
7c673cae
FG
9987
9988 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9989
9990 // Skip the getattr if any sessions are stale, as we don't want to
9991 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9992 // is unhealthy.
9993 if (!_any_stale_sessions()) {
9994 int r = _getattr(quota_root, 0, perms, true);
9995 if (r != 0) {
9996 // Ignore return value: error getting latest inode metadata is not a good
9997 // reason to break "df".
9998 lderr(cct) << "Error in getattr on quota root 0x"
9999 << std::hex << quota_root->ino << std::dec
10000 << " statfs result may be outdated" << dendl;
10001 }
10002 }
10003
10004 // Special case: if there is a size quota set on the Inode acting
10005 // as the root for this client mount, then report the quota status
10006 // as the filesystem statistics.
10007 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10008 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10009 // It is possible for a quota to be exceeded: arithmetic here must
10010 // handle case where used > total.
10011 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10012
10013 stbuf->f_blocks = total;
10014 stbuf->f_bfree = free;
10015 stbuf->f_bavail = free;
10016 } else {
d2e6a577 10017 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10018 // multiple pools may be used without one filesystem namespace via
10019 // layouts, this is the most correct thing we can do.
10020 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10021 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10022 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10023 }
10024
10025 return rval;
10026}
10027
/**
 * Issue a file-lock operation (fcntl-style or flock-style) to the MDS and,
 * on success, mirror the result into the client-side lock state.
 *
 * @param in        inode the lock applies to
 * @param fh        file handle issuing the request (supplies actor_perms)
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
 * @param sleep     non-zero to block waiting for a conflicting lock
 * @param fl        POSIX flock description; updated in place for GETFILELOCK
 * @param owner     lock-owner token (high bit is forced on below)
 * @param removing  true when called while tearing down an Fh's locks, in
 *                  which case the per-Fh bookkeeping is skipped
 * @return 0 on success, negative errno on failure (-EIO for unknown l_type)
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // translate the POSIX lock type to the ceph wire encoding
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK that actually acquires can wait; GET and
  // UNLOCK never need to sleep on a conflicting holder
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hold an extra request ref across the blocking call
    // so an interrupt handler can find and abort it
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);   // drop the ref taken for the interrupt callback
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or unlocked) lock returned by the MDS and
      // report it back through *fl, F_GETLK style
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // the MDS accepted the change; replay it into the inode's local lock
      // state (lazily allocating the state object on first use)
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// also track the lock on the file handle, so it can be released
	// automatically when the handle is closed (see _release_filelocks)
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10138
/**
 * Interrupt a blocked file-lock request.
 *
 * Marks the original request aborted (so it will not be re-sent) and, if it
 * has already reached an MDS, sends a companion "lock intr" SETFILELOCK with
 * the *_INTR rule so the MDS cancels the pending wait.
 *
 * @param req the in-flight SETFILELOCK request to cancel
 * @return 0 on success, negative errno on failure
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // map the original lock rule to its interrupt counterpart
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // clone the original request's lock arguments, switching the rule to the
  // interrupt variant and the type to UNLOCK
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // issue the interrupt with the same credentials as the original request
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10171
10172void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10173{
10174 if (!in->fcntl_locks && !in->flock_locks)
10175 return;
10176
10177 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10178 encode(nr_fcntl_locks, bl);
7c673cae 10179 if (nr_fcntl_locks) {
11fdf7f2 10180 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10181 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10182 p != lock_state->held_locks.end();
10183 ++p)
11fdf7f2 10184 encode(p->second, bl);
7c673cae
FG
10185 }
10186
10187 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10188 encode(nr_flock_locks, bl);
7c673cae 10189 if (nr_flock_locks) {
11fdf7f2 10190 auto &lock_state = in->flock_locks;
7c673cae
FG
10191 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10192 p != lock_state->held_locks.end();
10193 ++p)
11fdf7f2 10194 encode(p->second, bl);
7c673cae
FG
10195 }
10196
11fdf7f2 10197 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10198 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10199}
10200
10201void Client::_release_filelocks(Fh *fh)
10202{
10203 if (!fh->fcntl_locks && !fh->flock_locks)
10204 return;
10205
10206 Inode *in = fh->inode.get();
11fdf7f2 10207 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10208
10209 list<pair<int, ceph_filelock> > to_release;
10210
10211 if (fh->fcntl_locks) {
11fdf7f2 10212 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10213 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10214 p != lock_state->held_locks.end();
10215 ++p)
10216 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10217 lock_state.reset();
7c673cae
FG
10218 }
10219 if (fh->flock_locks) {
11fdf7f2 10220 auto &lock_state = fh->flock_locks;
7c673cae
FG
10221 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10222 p != lock_state->held_locks.end();
10223 ++p)
10224 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10225 lock_state.reset();
7c673cae
FG
10226 }
10227
10228 if (to_release.empty())
10229 return;
10230
11fdf7f2
TL
10231 // mds has already released filelocks if session was closed.
10232 if (in->caps.empty())
10233 return;
10234
7c673cae
FG
10235 struct flock fl;
10236 memset(&fl, 0, sizeof(fl));
10237 fl.l_whence = SEEK_SET;
10238 fl.l_type = F_UNLCK;
10239
10240 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10241 p != to_release.end();
10242 ++p) {
10243 fl.l_start = p->second.start;
10244 fl.l_len = p->second.length;
10245 fl.l_pid = p->second.pid;
10246 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10247 p->second.owner, true);
10248 }
10249}
10250
10251void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10252 ceph_lock_state_t *lock_state)
10253{
10254 int lock_cmd;
10255 if (F_RDLCK == fl->l_type)
10256 lock_cmd = CEPH_LOCK_SHARED;
10257 else if (F_WRLCK == fl->l_type)
10258 lock_cmd = CEPH_LOCK_EXCL;
10259 else
10260 lock_cmd = CEPH_LOCK_UNLOCK;;
10261
10262 ceph_filelock filelock;
10263 filelock.start = fl->l_start;
10264 filelock.length = fl->l_len;
10265 filelock.client = 0;
10266 // see comment in _do_filelock()
10267 filelock.owner = owner | (1ULL << 63);
10268 filelock.pid = fl->l_pid;
10269 filelock.type = lock_cmd;
10270
10271 if (filelock.type == CEPH_LOCK_UNLOCK) {
10272 list<ceph_filelock> activated_locks;
10273 lock_state->remove_lock(filelock, activated_locks);
10274 } else {
10275 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10276 ceph_assert(r);
7c673cae
FG
10277 }
10278}
10279
10280int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10281{
10282 Inode *in = fh->inode.get();
10283 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10284 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10285 return ret;
10286}
10287
10288int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10289{
10290 Inode *in = fh->inode.get();
10291 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10292 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10293 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10294 return ret;
10295}
10296
10297int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10298{
10299 Inode *in = fh->inode.get();
10300 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10301
10302 int sleep = !(cmd & LOCK_NB);
10303 cmd &= ~LOCK_NB;
10304
10305 int type;
10306 switch (cmd) {
10307 case LOCK_SH:
10308 type = F_RDLCK;
10309 break;
10310 case LOCK_EX:
10311 type = F_WRLCK;
10312 break;
10313 case LOCK_UN:
10314 type = F_UNLCK;
10315 break;
10316 default:
10317 return -EINVAL;
10318 }
10319
10320 struct flock fl;
10321 memset(&fl, 0, sizeof(fl));
10322 fl.l_type = type;
10323 fl.l_whence = SEEK_SET;
10324
10325 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10326 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10327 return ret;
10328}
10329
/**
 * Low-level statfs entry point.
 *
 * The @p in argument is unused: CephFS statistics are filesystem-wide, so
 * this simply forwards to statfs(), which takes client_lock itself.
 */
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
10337
/**
 * Register the libcephfs client callbacks (invalidation, interrupt,
 * remount, umask) and start the finisher thread that services each
 * registered callback.
 *
 * @param args callback table; a NULL pointer is ignored. Individual NULL
 *             entries leave the corresponding callback unset (umask_cb is
 *             assigned unconditionally, NULL or not).
 */
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  // each callback gets its own finisher so invocations run asynchronously,
  // started only when the callback is actually registered
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  umask_cb = args->umask_cb;
}
10368
10369int Client::test_dentry_handling(bool can_invalidate)
10370{
10371 int r = 0;
10372
10373 can_invalidate_dentries = can_invalidate;
10374
10375 if (can_invalidate_dentries) {
11fdf7f2 10376 ceph_assert(dentry_invalidate_cb);
7c673cae 10377 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10378 r = 0;
11fdf7f2
TL
10379 } else {
10380 ceph_assert(remount_cb);
7c673cae 10381 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10382 r = _do_remount(false);
b32b8144 10383 }
11fdf7f2 10384
7c673cae
FG
10385 return r;
10386}
10387
/**
 * Flush all dirty client state to the cluster: cached file data (when the
 * object cacher is enabled), dirty caps, and in-flight unsafe MDS requests.
 *
 * Caller must hold client_lock; it is dropped while waiting for the data
 * flush so OSD replies can be processed, then reacquired.
 *
 * @return 0 (the data-flush completion result is not propagated)
 */
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // snapshot the flush tid now so we only wait for caps dirtied up to
  // this point, not ones dirtied while we wait
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocking on the objectcacher flush so
    // completions can make progress
    client_lock.Unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.Lock();
  }

  return 0;
}
10418
10419int Client::sync_fs()
10420{
11fdf7f2 10421 std::lock_guard l(client_lock);
181888fb
FG
10422
10423 if (unmounting)
10424 return -ENOTCONN;
10425
7c673cae
FG
10426 return _sync_fs();
10427}
10428
10429int64_t Client::drop_caches()
10430{
11fdf7f2 10431 std::lock_guard l(client_lock);
7c673cae
FG
10432 return objectcacher->release_all();
10433}
10434
11fdf7f2
TL
10435int Client::_lazyio(Fh *fh, int enable)
10436{
10437 Inode *in = fh->inode.get();
10438 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10439
10440 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10441 return 0;
10442
10443 int orig_mode = fh->mode;
10444 if (enable) {
10445 fh->mode |= CEPH_FILE_MODE_LAZY;
10446 in->get_open_ref(fh->mode);
10447 in->put_open_ref(orig_mode);
10448 check_caps(in, CHECK_CAPS_NODELAY);
10449 } else {
10450 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10451 in->get_open_ref(fh->mode);
10452 in->put_open_ref(orig_mode);
10453 check_caps(in, 0);
10454 }
10455
10456 return 0;
10457}
10458
10459int Client::lazyio(int fd, int enable)
10460{
10461 std::lock_guard l(client_lock);
10462 Fh *f = get_filehandle(fd);
10463 if (!f)
10464 return -EBADF;
10465
10466 return _lazyio(f, enable);
10467}
10468
10469int Client::ll_lazyio(Fh *fh, int enable)
10470{
10471 std::lock_guard lock(client_lock);
10472 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10473 tout(cct) << __func__ << std::endl;
10474
10475 return _lazyio(fh, enable);
10476}
7c673cae
FG
10477
10478int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10479{
11fdf7f2 10480 std::lock_guard l(client_lock);
7c673cae
FG
10481 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10482 << ", " << offset << ", " << count << ")" << dendl;
10483
10484 Fh *f = get_filehandle(fd);
10485 if (!f)
10486 return -EBADF;
10487
10488 // for now
10489 _fsync(f, true);
10490
10491 return 0;
10492}
10493
10494int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10495{
11fdf7f2 10496 std::lock_guard l(client_lock);
7c673cae
FG
10497 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10498 << ", " << offset << ", " << count << ")" << dendl;
10499
10500 Fh *f = get_filehandle(fd);
10501 if (!f)
10502 return -EBADF;
10503 Inode *in = f->inode.get();
10504
10505 _fsync(f, true);
10506 if (_release(in))
10507 check_caps(in, 0);
10508 return 0;
10509}
10510
10511
10512// =============================
10513// snaps
10514
10515int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10516{
11fdf7f2 10517 std::lock_guard l(client_lock);
181888fb
FG
10518
10519 if (unmounting)
10520 return -ENOTCONN;
10521
7c673cae
FG
10522 filepath path(relpath);
10523 InodeRef in;
10524 int r = path_walk(path, &in, perm);
10525 if (r < 0)
10526 return r;
10527 if (cct->_conf->client_permissions) {
10528 r = may_create(in.get(), perm);
10529 if (r < 0)
10530 return r;
10531 }
10532 Inode *snapdir = open_snapdir(in.get());
10533 return _mkdir(snapdir, name, 0, perm);
10534}
181888fb 10535
7c673cae
FG
10536int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10537{
11fdf7f2 10538 std::lock_guard l(client_lock);
181888fb
FG
10539
10540 if (unmounting)
10541 return -ENOTCONN;
10542
7c673cae
FG
10543 filepath path(relpath);
10544 InodeRef in;
10545 int r = path_walk(path, &in, perms);
10546 if (r < 0)
10547 return r;
10548 if (cct->_conf->client_permissions) {
10549 r = may_delete(in.get(), NULL, perms);
10550 if (r < 0)
10551 return r;
10552 }
10553 Inode *snapdir = open_snapdir(in.get());
10554 return _rmdir(snapdir, name, perms);
10555}
10556
10557// =============================
10558// expose caps
10559
10560int Client::get_caps_issued(int fd) {
10561
11fdf7f2 10562 std::lock_guard lock(client_lock);
7c673cae 10563
181888fb
FG
10564 if (unmounting)
10565 return -ENOTCONN;
10566
7c673cae
FG
10567 Fh *f = get_filehandle(fd);
10568 if (!f)
10569 return -EBADF;
10570
10571 return f->inode->caps_issued();
10572}
10573
10574int Client::get_caps_issued(const char *path, const UserPerm& perms)
10575{
11fdf7f2 10576 std::lock_guard lock(client_lock);
181888fb
FG
10577
10578 if (unmounting)
10579 return -ENOTCONN;
10580
7c673cae
FG
10581 filepath p(path);
10582 InodeRef in;
10583 int r = path_walk(p, &in, perms, true);
10584 if (r < 0)
10585 return r;
10586 return in->caps_issued();
10587}
10588
10589// =========================================
10590// low level
10591
/**
 * Return the virtual ".snap" directory inode for a directory, creating and
 * caching it on first use.
 *
 * The snapdir shares the directory's ino but uses snapid CEPH_SNAPDIR, and
 * mirrors most of the parent's attributes at creation time.
 *
 * @param diri the real directory inode
 * @return the (cached) snapdir inode; caller does not receive a new ref
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // clone the parent's identity and attributes for the virtual dir
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // mark the parent so it knows a snapdir references it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10624
/**
 * Low-level lookup of `name` under `parent`.
 *
 * On success fills *attr, takes an ll ref on the resulting inode and
 * returns it via *out; on failure *attr's st_ino is zeroed and *out is
 * set from the (empty) InodeRef, i.e. NULL.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    // "." and ".." bypass the permission check
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());   // pin the inode for the low-level caller

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // on error `in` is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
10668
1adf2230
AA
/**
 * Look up an inode by number and make sure it is connected to the dentry
 * tree: fetch the inode, then (if it has no dentry and is not root) fetch
 * its parent and its name so a primary dentry exists.
 *
 * On success *inode holds a referenced inode; on intermediate failures the
 * references taken so far are dropped via _ll_forget.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  // already linked into the namespace; nothing more to do
  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // drop the ref taken in Num1 before bailing out
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // parent was only needed for the name lookup; drop its ref
  _ll_forget(parent, 1);
  return 0;
}
10720
7c673cae
FG
/**
 * statx-flavored low-level lookup of `name` under `parent`.
 *
 * Like ll_lookup(), but the caps to fetch are derived from the statx
 * want/flags pair, and results are reported through *stx. On failure
 * stx_ino and stx_mask are zeroed; *out is set from the (empty) InodeRef.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  // translate statx want/flags into a cap mask for the lookup
  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());   // pin the inode for the low-level caller
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  // on error `in` is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
10763
10764int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10765 unsigned int want, unsigned int flags, const UserPerm& perms)
10766{
11fdf7f2 10767 std::lock_guard lock(client_lock);
181888fb
FG
10768
10769 if (unmounting)
10770 return -ENOTCONN;
10771
7c673cae
FG
10772 filepath fp(name, 0);
10773 InodeRef in;
10774 int rc;
10775 unsigned mask = statx_to_mask(flags, want);
10776
11fdf7f2
TL
10777 ldout(cct, 3) << __func__ << " " << name << dendl;
10778 tout(cct) << __func__ << std::endl;
7c673cae
FG
10779 tout(cct) << name << std::endl;
10780
10781 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10782 if (rc < 0) {
10783 /* zero out mask, just in case... */
10784 stx->stx_mask = 0;
10785 stx->stx_ino = 0;
10786 *out = NULL;
10787 return rc;
10788 } else {
11fdf7f2 10789 ceph_assert(in);
7c673cae
FG
10790 fill_statx(in, mask, stx);
10791 _ll_get(in.get());
10792 *out = in.get();
10793 return 0;
10794 }
10795}
10796
/**
 * Take one low-level (FUSE-style) reference on an inode.
 *
 * The first ll ref additionally takes an internal inode ref, pins the
 * directory's single parent dentry, and bumps the per-snapshot refcount
 * for snapshot inodes. Balanced by _ll_put().
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    // track how many ll-referenced inodes belong to each snapshot
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10811
494da23a 10812int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
10813{
10814 in->ll_put(num);
11fdf7f2 10815 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 10816 if (in->ll_ref == 0) {
11fdf7f2
TL
10817 if (in->is_dir() && !in->dentries.empty()) {
10818 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10819 in->get_first_parent()->put(); // unpin dentry
10820 }
11fdf7f2
TL
10821 if (in->snapid != CEPH_NOSNAP) {
10822 auto p = ll_snap_ref.find(in->snapid);
10823 ceph_assert(p != ll_snap_ref.end());
10824 ceph_assert(p->second > 0);
10825 if (--p->second == 0)
10826 ll_snap_ref.erase(p);
10827 }
7c673cae
FG
10828 put_inode(in);
10829 return 0;
10830 } else {
10831 return in->ll_ref;
10832 }
10833}
10834
/**
 * Drop every outstanding low-level reference on every inode (used during
 * teardown).
 *
 * The iterator's successor is captured before each _ll_put because
 * releasing the last ref can erase the inode from inode_map; the
 * to_be_put InodeRef set keeps each inode alive until the function exits.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // advance before _ll_put: it may invalidate `it` by erasing the entry
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10852
494da23a 10853bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 10854{
11fdf7f2 10855 inodeno_t ino = in->ino;
7c673cae 10856
11fdf7f2
TL
10857 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10858 tout(cct) << __func__ << std::endl;
7c673cae
FG
10859 tout(cct) << ino.val << std::endl;
10860 tout(cct) << count << std::endl;
10861
181888fb
FG
10862 // Ignore forget if we're no longer mounted
10863 if (unmounting)
10864 return true;
10865
7c673cae
FG
10866 if (ino == 1) return true; // ignore forget on root.
10867
10868 bool last = false;
10869 if (in->ll_ref < count) {
10870 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10871 << ", which only has ll_ref=" << in->ll_ref << dendl;
10872 _ll_put(in, in->ll_ref);
10873 last = true;
10874 } else {
10875 if (_ll_put(in, count) == 0)
10876 last = true;
10877 }
10878
10879 return last;
10880}
10881
494da23a 10882bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 10883{
11fdf7f2 10884 std::lock_guard lock(client_lock);
1adf2230
AA
10885 return _ll_forget(in, count);
10886}
10887
7c673cae
FG
/**
 * Drop a single low-level reference on an inode.
 * @return true when this was the last ll reference
 */
bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
10893
11fdf7f2
TL
10894int Client::ll_get_snap_ref(snapid_t snap)
10895{
10896 std::lock_guard lock(client_lock);
10897 auto p = ll_snap_ref.find(snap);
10898 if (p != ll_snap_ref.end())
10899 return p->second;
10900 return 0;
10901}
10902
7c673cae
FG
10903snapid_t Client::ll_get_snapid(Inode *in)
10904{
11fdf7f2 10905 std::lock_guard lock(client_lock);
7c673cae
FG
10906 return in->snapid;
10907}
10908
10909Inode *Client::ll_get_inode(ino_t ino)
10910{
11fdf7f2 10911 std::lock_guard lock(client_lock);
181888fb
FG
10912
10913 if (unmounting)
10914 return NULL;
10915
7c673cae
FG
10916 vinodeno_t vino = _map_faked_ino(ino);
10917 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10918 if (p == inode_map.end())
10919 return NULL;
10920 Inode *in = p->second;
10921 _ll_get(in);
10922 return in;
10923}
10924
10925Inode *Client::ll_get_inode(vinodeno_t vino)
10926{
11fdf7f2 10927 std::lock_guard lock(client_lock);
181888fb
FG
10928
10929 if (unmounting)
10930 return NULL;
10931
7c673cae
FG
10932 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10933 if (p == inode_map.end())
10934 return NULL;
10935 Inode *in = p->second;
10936 _ll_get(in);
10937 return in;
10938}
10939
10940int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10941{
10942 vinodeno_t vino = _get_vino(in);
10943
11fdf7f2
TL
10944 ldout(cct, 8) << __func__ << " " << vino << dendl;
10945 tout(cct) << __func__ << std::endl;
7c673cae
FG
10946 tout(cct) << vino.ino.val << std::endl;
10947
10948 if (vino.snapid < CEPH_NOSNAP)
10949 return 0;
10950 else
10951 return _getattr(in, caps, perms);
10952}
10953
10954int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10955{
11fdf7f2 10956 std::lock_guard lock(client_lock);
7c673cae 10957
181888fb
FG
10958 if (unmounting)
10959 return -ENOTCONN;
10960
7c673cae
FG
10961 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10962
10963 if (res == 0)
10964 fill_stat(in, attr);
11fdf7f2 10965 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10966 return res;
10967}
10968
10969int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10970 unsigned int flags, const UserPerm& perms)
10971{
11fdf7f2 10972 std::lock_guard lock(client_lock);
7c673cae 10973
181888fb
FG
10974 if (unmounting)
10975 return -ENOTCONN;
10976
7c673cae
FG
10977 int res = 0;
10978 unsigned mask = statx_to_mask(flags, want);
10979
94b18763 10980 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10981 res = _ll_getattr(in, mask, perms);
10982
10983 if (res == 0)
10984 fill_statx(in, mask, stx);
11fdf7f2 10985 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10986 return res;
10987}
10988
/**
 * Internal setattr worker for the low-level API.
 *
 * Performs the optional client-side permission check, strips the
 * "set time to now" convenience bits (the kernel/FUSE caller resolves
 * them before we send to the MDS), and delegates to __setattrx.
 *
 * @param in   inode to modify
 * @param stx  attribute values to apply
 * @param mask CEPH_SETATTR_* bits selecting which fields of stx apply
 * @param inp  receives the resulting inode ref
 * @return 0 on success, negative errno on failure
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW bits are a caller-side convenience; drop them before applying
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11019
11020int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11021 const UserPerm& perms)
11022{
11fdf7f2 11023 std::lock_guard lock(client_lock);
181888fb
FG
11024
11025 if (unmounting)
11026 return -ENOTCONN;
11027
7c673cae
FG
11028 InodeRef target(in);
11029 int res = _ll_setattrx(in, stx, mask, perms, &target);
11030 if (res == 0) {
11fdf7f2 11031 ceph_assert(in == target.get());
7c673cae
FG
11032 fill_statx(in, in->caps_issued(), stx);
11033 }
11034
11fdf7f2 11035 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11036 return res;
11037}
11038
11039int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11040 const UserPerm& perms)
11041{
11042 struct ceph_statx stx;
11043 stat_to_statx(attr, &stx);
11044
11fdf7f2 11045 std::lock_guard lock(client_lock);
181888fb
FG
11046
11047 if (unmounting)
11048 return -ENOTCONN;
11049
7c673cae
FG
11050 InodeRef target(in);
11051 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11052 if (res == 0) {
11fdf7f2 11053 ceph_assert(in == target.get());
7c673cae
FG
11054 fill_stat(in, attr);
11055 }
11056
11fdf7f2 11057 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11058 return res;
11059}
11060
11061
11062// ----------
11063// xattrs
11064
11065int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11066 const UserPerm& perms)
11067{
11fdf7f2 11068 std::lock_guard lock(client_lock);
181888fb
FG
11069
11070 if (unmounting)
11071 return -ENOTCONN;
11072
7c673cae
FG
11073 InodeRef in;
11074 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11075 if (r < 0)
11076 return r;
11077 return _getxattr(in, name, value, size, perms);
11078}
11079
11080int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11081 const UserPerm& perms)
11082{
11fdf7f2 11083 std::lock_guard lock(client_lock);
181888fb
FG
11084
11085 if (unmounting)
11086 return -ENOTCONN;
11087
7c673cae
FG
11088 InodeRef in;
11089 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11090 if (r < 0)
11091 return r;
11092 return _getxattr(in, name, value, size, perms);
11093}
11094
11095int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11096 const UserPerm& perms)
11097{
11fdf7f2 11098 std::lock_guard lock(client_lock);
181888fb
FG
11099
11100 if (unmounting)
11101 return -ENOTCONN;
11102
7c673cae
FG
11103 Fh *f = get_filehandle(fd);
11104 if (!f)
11105 return -EBADF;
11106 return _getxattr(f->inode, name, value, size, perms);
11107}
11108
11109int Client::listxattr(const char *path, char *list, size_t size,
11110 const UserPerm& perms)
11111{
11fdf7f2 11112 std::lock_guard lock(client_lock);
181888fb
FG
11113
11114 if (unmounting)
11115 return -ENOTCONN;
11116
7c673cae
FG
11117 InodeRef in;
11118 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11119 if (r < 0)
11120 return r;
11121 return Client::_listxattr(in.get(), list, size, perms);
11122}
11123
11124int Client::llistxattr(const char *path, char *list, size_t size,
11125 const UserPerm& perms)
11126{
11fdf7f2 11127 std::lock_guard lock(client_lock);
181888fb
FG
11128
11129 if (unmounting)
11130 return -ENOTCONN;
11131
7c673cae
FG
11132 InodeRef in;
11133 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11134 if (r < 0)
11135 return r;
11136 return Client::_listxattr(in.get(), list, size, perms);
11137}
11138
11139int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11140{
11fdf7f2 11141 std::lock_guard lock(client_lock);
181888fb
FG
11142
11143 if (unmounting)
11144 return -ENOTCONN;
11145
7c673cae
FG
11146 Fh *f = get_filehandle(fd);
11147 if (!f)
11148 return -EBADF;
11149 return Client::_listxattr(f->inode.get(), list, size, perms);
11150}
11151
11152int Client::removexattr(const char *path, const char *name,
11153 const UserPerm& perms)
11154{
11fdf7f2 11155 std::lock_guard lock(client_lock);
181888fb
FG
11156
11157 if (unmounting)
11158 return -ENOTCONN;
11159
7c673cae
FG
11160 InodeRef in;
11161 int r = Client::path_walk(path, &in, perms, true);
11162 if (r < 0)
11163 return r;
11164 return _removexattr(in, name, perms);
11165}
11166
11167int Client::lremovexattr(const char *path, const char *name,
11168 const UserPerm& perms)
11169{
11fdf7f2 11170 std::lock_guard lock(client_lock);
181888fb
FG
11171
11172 if (unmounting)
11173 return -ENOTCONN;
11174
7c673cae
FG
11175 InodeRef in;
11176 int r = Client::path_walk(path, &in, perms, false);
11177 if (r < 0)
11178 return r;
11179 return _removexattr(in, name, perms);
11180}
11181
11182int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11183{
11fdf7f2 11184 std::lock_guard lock(client_lock);
181888fb
FG
11185
11186 if (unmounting)
11187 return -ENOTCONN;
11188
7c673cae
FG
11189 Fh *f = get_filehandle(fd);
11190 if (!f)
11191 return -EBADF;
11192 return _removexattr(f->inode, name, perms);
11193}
11194
11195int Client::setxattr(const char *path, const char *name, const void *value,
11196 size_t size, int flags, const UserPerm& perms)
11197{
11198 _setxattr_maybe_wait_for_osdmap(name, value, size);
11199
11fdf7f2 11200 std::lock_guard lock(client_lock);
181888fb
FG
11201
11202 if (unmounting)
11203 return -ENOTCONN;
11204
7c673cae
FG
11205 InodeRef in;
11206 int r = Client::path_walk(path, &in, perms, true);
11207 if (r < 0)
11208 return r;
11209 return _setxattr(in, name, value, size, flags, perms);
11210}
11211
11212int Client::lsetxattr(const char *path, const char *name, const void *value,
11213 size_t size, int flags, const UserPerm& perms)
11214{
11215 _setxattr_maybe_wait_for_osdmap(name, value, size);
11216
11fdf7f2 11217 std::lock_guard lock(client_lock);
181888fb
FG
11218
11219 if (unmounting)
11220 return -ENOTCONN;
11221
7c673cae
FG
11222 InodeRef in;
11223 int r = Client::path_walk(path, &in, perms, false);
11224 if (r < 0)
11225 return r;
11226 return _setxattr(in, name, value, size, flags, perms);
11227}
11228
11229int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11230 int flags, const UserPerm& perms)
11231{
11232 _setxattr_maybe_wait_for_osdmap(name, value, size);
11233
11fdf7f2 11234 std::lock_guard lock(client_lock);
181888fb
FG
11235
11236 if (unmounting)
11237 return -ENOTCONN;
11238
7c673cae
FG
11239 Fh *f = get_filehandle(fd);
11240 if (!f)
11241 return -EBADF;
11242 return _setxattr(f->inode, name, value, size, flags, perms);
11243}
11244
11245int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11246 const UserPerm& perms)
11247{
11248 int r;
11249
11250 const VXattr *vxattr = _match_vxattr(in, name);
11251 if (vxattr) {
11252 r = -ENODATA;
11253
11254 // Do a force getattr to get the latest quota before returning
11255 // a value to userspace.
28e407b8
AA
11256 int flags = 0;
11257 if (vxattr->flags & VXATTR_RSTAT) {
11258 flags |= CEPH_STAT_RSTAT;
11259 }
11260 r = _getattr(in, flags, perms, true);
7c673cae
FG
11261 if (r != 0) {
11262 // Error from getattr!
11263 return r;
11264 }
11265
11266 // call pointer-to-member function
11267 char buf[256];
11268 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11269 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11270 } else {
11271 r = -ENODATA;
11272 }
11273
11274 if (size != 0) {
11275 if (r > (int)size) {
11276 r = -ERANGE;
11277 } else if (r > 0) {
11278 memcpy(value, buf, r);
11279 }
11280 }
11281 goto out;
11282 }
11283
11284 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11285 r = -EOPNOTSUPP;
11286 goto out;
11287 }
11288
11289 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11290 if (r == 0) {
11291 string n(name);
11292 r = -ENODATA;
11293 if (in->xattrs.count(n)) {
11294 r = in->xattrs[n].length();
11295 if (r > 0 && size != 0) {
11296 if (size >= (unsigned)r)
11297 memcpy(value, in->xattrs[n].c_str(), r);
11298 else
11299 r = -ERANGE;
11300 }
11301 }
11302 }
11303 out:
1adf2230 11304 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11305 return r;
11306}
11307
11308int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11309 const UserPerm& perms)
11310{
11311 if (cct->_conf->client_permissions) {
11312 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11313 if (r < 0)
11314 return r;
11315 }
11316 return _getxattr(in.get(), name, value, size, perms);
11317}
11318
11319int Client::ll_getxattr(Inode *in, const char *name, void *value,
11320 size_t size, const UserPerm& perms)
11321{
11fdf7f2 11322 std::lock_guard lock(client_lock);
7c673cae 11323
181888fb
FG
11324 if (unmounting)
11325 return -ENOTCONN;
11326
7c673cae
FG
11327 vinodeno_t vino = _get_vino(in);
11328
11fdf7f2
TL
11329 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11330 tout(cct) << __func__ << std::endl;
7c673cae
FG
11331 tout(cct) << vino.ino.val << std::endl;
11332 tout(cct) << name << std::endl;
11333
11fdf7f2
TL
11334 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11335 "fuse_default_permissions");
11336 if (!fuse_default_permissions) {
7c673cae
FG
11337 int r = xattr_permission(in, name, MAY_READ, perms);
11338 if (r < 0)
11339 return r;
11340 }
11341
11342 return _getxattr(in, name, value, size, perms);
11343}
11344
/*
 * List all xattr names on an inode, as a sequence of NUL-terminated
 * strings packed into `name`.  With size == 0 only the total required
 * length is computed (len_only).  Both real xattrs and visible virtual
 * "ceph.*" xattrs are included.
 *
 * Returns the total byte count on success, -ERANGE if the buffer is too
 * small, or a negative error from getattr.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh xattrs from the MDS only if we have never fetched them.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  // Real xattrs first.
  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // include the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }

  // Then virtual xattrs, skipping hidden ones and those whose existence
  // predicate says they do not currently apply.
  const VXattr *vxattr;
  for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
    if (vxattr->hidden)
      continue;
    // call pointer-to-member function
    if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
      continue;

    size_t this_len = vxattr->name.length() + 1;
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, vxattr->name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11397
11398int Client::ll_listxattr(Inode *in, char *names, size_t size,
11399 const UserPerm& perms)
11400{
11fdf7f2 11401 std::lock_guard lock(client_lock);
7c673cae 11402
181888fb
FG
11403 if (unmounting)
11404 return -ENOTCONN;
11405
7c673cae
FG
11406 vinodeno_t vino = _get_vino(in);
11407
11fdf7f2
TL
11408 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11409 tout(cct) << __func__ << std::endl;
7c673cae
FG
11410 tout(cct) << vino.ino.val << std::endl;
11411 tout(cct) << size << std::endl;
11412
11413 return _listxattr(in, names, size, perms);
11414}
11415
11416int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11417 size_t size, int flags, const UserPerm& perms)
11418{
11419
11420 int xattr_flags = 0;
11421 if (!value)
11422 xattr_flags |= CEPH_XATTR_REMOVE;
11423 if (flags & XATTR_CREATE)
11424 xattr_flags |= CEPH_XATTR_CREATE;
11425 if (flags & XATTR_REPLACE)
11426 xattr_flags |= CEPH_XATTR_REPLACE;
11427
11428 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11429 filepath path;
11430 in->make_nosnap_relative_path(path);
11431 req->set_filepath(path);
11432 req->set_string2(name);
11433 req->set_inode(in);
11434 req->head.args.setxattr.flags = xattr_flags;
11435
11436 bufferlist bl;
11fdf7f2 11437 assert (value || size == 0);
7c673cae
FG
11438 bl.append((const char*)value, size);
11439 req->set_data(bl);
11440
11441 int res = make_request(req, perms);
11442
11443 trim_cache();
11fdf7f2 11444 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11445 res << dendl;
11446 return res;
11447}
11448
/*
 * Set (or, with value == NULL, remove) an xattr on an inode.
 *
 * Handles three name classes:
 *  - POSIX ACL xattrs ("system.*", only when acl_type == POSIX_ACL):
 *    validated locally; an access ACL that is equivalent to a plain mode
 *    is folded into a setattr of the mode instead of being stored.
 *  - virtual "ceph.*" xattrs: read-only ones are rejected; quota updates
 *    additionally verify afterwards that a snaprealm was created.
 *  - ordinary "user."/"security."/"trusted." xattrs: forwarded as-is.
 *
 * Returns 0 on success or a negative errno.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same namespace whitelist as the kernel client.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// posix_acl_equiv_mode returns 0 when the ACL is fully expressible
	// as a plain mode; in that case drop the xattr payload entirely.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // Quota changes require a snaprealm on the quota root; verify below.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11523
11524int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11525 size_t size, int flags, const UserPerm& perms)
11526{
11527 if (cct->_conf->client_permissions) {
11528 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11529 if (r < 0)
11530 return r;
11531 }
11532 return _setxattr(in.get(), name, value, size, flags, perms);
11533}
11534
/*
 * Check whether the data pool referenced by a layout xattr exists in the
 * given osdmap.
 *
 * `name` is the xattr suffix starting at "layout"; for the full "layout"
 * form the value is a key=value list that is parsed to extract the "pool"
 * entry, while "layout.pool" takes the pool directly.  The pool may be
 * given either as a numeric id or as a pool name.
 *
 * Returns 0 if no pool is referenced or the pool exists, -EINVAL on a
 * malformed value, -ENOENT if the pool is unknown to this osdmap.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p; // create instance of parser
    std::map<string, string> m; // map to receive results
    if (!qi::parse(begin, end, p, m)) { // returns true if successful
      return -EINVAL;
    }
    // The whole value must have been consumed by the parser.
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric pool id first; fall back to a pool-name lookup.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11574
11575void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11576{
11577 // For setting pool of layout, MetaRequest need osdmap epoch.
11578 // There is a race which create a new data pool but client and mds both don't have.
11579 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11580 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11581 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11582 string rest(strstr(name, "layout"));
11583 string v((const char*)value, size);
11584 int r = objecter->with_osdmap([&](const OSDMap& o) {
11585 return _setxattr_check_data_pool(rest, v, &o);
11586 });
11587
11588 if (r == -ENOENT) {
11589 C_SaferCond ctx;
11590 objecter->wait_for_latest_osdmap(&ctx);
11591 ctx.wait();
11592 }
11593 }
11594}
11595
11596int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11597 size_t size, int flags, const UserPerm& perms)
11598{
11599 _setxattr_maybe_wait_for_osdmap(name, value, size);
11600
11fdf7f2 11601 std::lock_guard lock(client_lock);
7c673cae 11602
181888fb
FG
11603 if (unmounting)
11604 return -ENOTCONN;
11605
7c673cae
FG
11606 vinodeno_t vino = _get_vino(in);
11607
11fdf7f2
TL
11608 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11609 tout(cct) << __func__ << std::endl;
7c673cae
FG
11610 tout(cct) << vino.ino.val << std::endl;
11611 tout(cct) << name << std::endl;
11612
11fdf7f2
TL
11613 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11614 "fuse_default_permissions");
11615 if (!fuse_default_permissions) {
7c673cae
FG
11616 int r = xattr_permission(in, name, MAY_WRITE, perms);
11617 if (r < 0)
11618 return r;
11619 }
11620 return _setxattr(in, name, value, size, flags, perms);
11621}
11622
11623int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11624{
11625 if (in->snapid != CEPH_NOSNAP) {
11626 return -EROFS;
11627 }
11628
11629 // same xattrs supported by kernel client
11630 if (strncmp(name, "user.", 5) &&
11631 strncmp(name, "system.", 7) &&
11632 strncmp(name, "security.", 9) &&
11633 strncmp(name, "trusted.", 8) &&
11634 strncmp(name, "ceph.", 5))
11635 return -EOPNOTSUPP;
11636
11637 const VXattr *vxattr = _match_vxattr(in, name);
11638 if (vxattr && vxattr->readonly)
11639 return -EOPNOTSUPP;
11640
11641 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11642 filepath path;
11643 in->make_nosnap_relative_path(path);
11644 req->set_filepath(path);
11645 req->set_filepath2(name);
11646 req->set_inode(in);
11647
11648 int res = make_request(req, perms);
11649
11650 trim_cache();
1adf2230 11651 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11652 return res;
11653}
11654
11655int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11656{
11657 if (cct->_conf->client_permissions) {
11658 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11659 if (r < 0)
11660 return r;
11661 }
11662 return _removexattr(in.get(), name, perms);
11663}
11664
11665int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11666{
11fdf7f2 11667 std::lock_guard lock(client_lock);
7c673cae 11668
181888fb
FG
11669 if (unmounting)
11670 return -ENOTCONN;
11671
7c673cae
FG
11672 vinodeno_t vino = _get_vino(in);
11673
11674 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11675 tout(cct) << "ll_removexattr" << std::endl;
11676 tout(cct) << vino.ino.val << std::endl;
11677 tout(cct) << name << std::endl;
11678
11fdf7f2
TL
11679 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11680 "fuse_default_permissions");
11681 if (!fuse_default_permissions) {
7c673cae
FG
11682 int r = xattr_permission(in, name, MAY_WRITE, perms);
11683 if (r < 0)
11684 return r;
11685 }
11686
11687 return _removexattr(in, name, perms);
11688}
11689
11690bool Client::_vxattrcb_quota_exists(Inode *in)
11691{
11fdf7f2
TL
11692 return in->quota.is_enable() &&
11693 in->snaprealm && in->snaprealm->ino == in->ino;
7c673cae
FG
11694}
11695size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11696{
11697 return snprintf(val, size,
11698 "max_bytes=%lld max_files=%lld",
11699 (long long int)in->quota.max_bytes,
11700 (long long int)in->quota.max_files);
11701}
11702size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11703{
11704 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11705}
11706size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11707{
11708 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11709}
11710
11711bool Client::_vxattrcb_layout_exists(Inode *in)
11712{
11713 return in->layout != file_layout_t();
11714}
11715size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11716{
11717 int r = snprintf(val, size,
11fdf7f2 11718 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11719 (unsigned long long)in->layout.stripe_unit,
11720 (unsigned long long)in->layout.stripe_count,
11721 (unsigned long long)in->layout.object_size);
11722 objecter->with_osdmap([&](const OSDMap& o) {
11723 if (o.have_pg_pool(in->layout.pool_id))
11724 r += snprintf(val + r, size - r, "%s",
11725 o.get_pool_name(in->layout.pool_id).c_str());
11726 else
11727 r += snprintf(val + r, size - r, "%" PRIu64,
11728 (uint64_t)in->layout.pool_id);
11729 });
11730 if (in->layout.pool_ns.length())
11731 r += snprintf(val + r, size - r, " pool_namespace=%s",
11732 in->layout.pool_ns.c_str());
11733 return r;
11734}
11735size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11736{
11fdf7f2 11737 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
7c673cae
FG
11738}
11739size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11740{
11fdf7f2 11741 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
7c673cae
FG
11742}
11743size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11744{
11fdf7f2 11745 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
7c673cae
FG
11746}
11747size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11748{
11749 size_t r;
11750 objecter->with_osdmap([&](const OSDMap& o) {
11751 if (o.have_pg_pool(in->layout.pool_id))
11752 r = snprintf(val, size, "%s", o.get_pool_name(
11753 in->layout.pool_id).c_str());
11754 else
11755 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11756 });
11757 return r;
11758}
11759size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11760{
11761 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11762}
11763size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11764{
11fdf7f2 11765 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
7c673cae
FG
11766}
11767size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11768{
11fdf7f2 11769 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
7c673cae
FG
11770}
11771size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11772{
11fdf7f2 11773 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
7c673cae
FG
11774}
11775size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11776{
11fdf7f2 11777 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
7c673cae
FG
11778}
11779size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11780{
11fdf7f2 11781 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
7c673cae
FG
11782}
11783size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11784{
11fdf7f2 11785 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
7c673cae
FG
11786}
11787size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11788{
11fdf7f2 11789 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
7c673cae
FG
11790}
11791size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11792{
81eedcae 11793 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
7c673cae
FG
11794 (long)in->rstat.rctime.nsec());
11795}
11fdf7f2
TL
11796bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11797{
11798 return in->dir_pin != -ENODATA;
11799}
11800size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11801{
11802 return snprintf(val, size, "%ld", (long)in->dir_pin);
11803}
7c673cae 11804
81eedcae
TL
11805bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11806{
11807 return !in->snap_btime.is_zero();
11808}
11809
11810size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11811{
11812 return snprintf(val, size, "%llu.%09lu",
11813 (long long unsigned)in->snap_btime.sec(),
11814 (long unsigned)in->snap_btime.nsec());
11815}
11816
7c673cae
FG
// Helpers for building the VXattr tables below.  The `name:` field syntax
// is the GNU designated-initializer extension this file already relies on.

// Build the "ceph.<type>.<name>" (or "ceph.<type>.<name>.<name2>")
// virtual-xattr name from preprocessor tokens.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, listed (non-hidden) vxattr with no existence predicate.
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
  flags: 0, \
}
// Like XATTR_NAME_CEPH, but with explicit VXATTR_* flags (e.g. VXATTR_RSTAT
// to force a fresh rstat fetch before reading).
#define XATTR_NAME_CEPH2(_type, _name, _flags) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
  flags: _flags, \
}
// Writable, hidden per-field layout vxattr; exists only when the inode has
// a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
  flags: 0, \
}
// Writable, hidden per-field quota vxattr; exists only when quota is enabled.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
  flags: 0, \
}
11856
// Virtual xattrs available on directories.  Hidden entries do not appear
// in listxattr output but can still be read/written by name.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive statistics need VXATTR_RSTAT so a fresh rstat is fetched.
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11907
// Virtual xattrs available on regular files (layout plus snapshot btime).
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11932
11933const Client::VXattr *Client::_get_vxattrs(Inode *in)
11934{
11935 if (in->is_dir())
11936 return _dir_vxattrs;
11937 else if (in->is_file())
11938 return _file_vxattrs;
11939 return NULL;
11940}
11941
11942const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11943{
11944 if (strncmp(name, "ceph.", 5) == 0) {
11945 const VXattr *vxattr = _get_vxattrs(in);
11946 if (vxattr) {
11947 while (!vxattr->name.empty()) {
11948 if (vxattr->name == name)
11949 return vxattr;
11950 vxattr++;
11951 }
11952 }
11953 }
11954 return NULL;
11955}
11956
7c673cae
FG
11957int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11958{
11fdf7f2 11959 std::lock_guard lock(client_lock);
7c673cae 11960
181888fb
FG
11961 if (unmounting)
11962 return -ENOTCONN;
11963
7c673cae
FG
11964 vinodeno_t vino = _get_vino(in);
11965
11966 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11967 tout(cct) << "ll_readlink" << std::endl;
11968 tout(cct) << vino.ino.val << std::endl;
11969
11fdf7f2
TL
11970 for (auto dn : in->dentries) {
11971 touch_dn(dn);
7c673cae
FG
11972 }
11973
11974 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11975 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11976 return r;
11977}
11978
/*
 * Create a filesystem node (device, fifo, socket, or regular file) named
 * `name` under directory `dir` via an MDS MKNOD request.
 *
 * On success *inp (if non-null) receives the new inode.  Returns 0 or a
 * negative errno (-ENAMETOOLONG, -EROFS for snapshots, -EDQUOT when the
 * file-count quota is exceeded, or an error from the MDS round trip).
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Drop/keep dentry lease caps as appropriate for a namespace change.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also adjust `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

  // On any pre-send failure the request must be released explicitly.
 fail:
  put_request(req);
  return res;
}
12032
12033int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12034 dev_t rdev, struct stat *attr, Inode **out,
12035 const UserPerm& perms)
12036{
11fdf7f2 12037 std::lock_guard lock(client_lock);
7c673cae 12038
181888fb
FG
12039 if (unmounting)
12040 return -ENOTCONN;
12041
7c673cae
FG
12042 vinodeno_t vparent = _get_vino(parent);
12043
12044 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12045 tout(cct) << "ll_mknod" << std::endl;
12046 tout(cct) << vparent.ino.val << std::endl;
12047 tout(cct) << name << std::endl;
12048 tout(cct) << mode << std::endl;
12049 tout(cct) << rdev << std::endl;
12050
11fdf7f2
TL
12051 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12052 "fuse_default_permissions");
12053 if (!fuse_default_permissions) {
7c673cae
FG
12054 int r = may_create(parent, perms);
12055 if (r < 0)
12056 return r;
12057 }
12058
12059 InodeRef in;
12060 int r = _mknod(parent, name, mode, rdev, perms, &in);
12061 if (r == 0) {
12062 fill_stat(in, attr);
12063 _ll_get(in.get());
12064 }
12065 tout(cct) << attr->st_ino << std::endl;
12066 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12067 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12068 *out = in.get();
12069 return r;
12070}
12071
12072int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12073 dev_t rdev, Inode **out,
12074 struct ceph_statx *stx, unsigned want, unsigned flags,
12075 const UserPerm& perms)
12076{
12077 unsigned caps = statx_to_mask(flags, want);
11fdf7f2 12078 std::lock_guard lock(client_lock);
7c673cae 12079
181888fb
FG
12080 if (unmounting)
12081 return -ENOTCONN;
12082
7c673cae
FG
12083 vinodeno_t vparent = _get_vino(parent);
12084
12085 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12086 tout(cct) << "ll_mknodx" << std::endl;
12087 tout(cct) << vparent.ino.val << std::endl;
12088 tout(cct) << name << std::endl;
12089 tout(cct) << mode << std::endl;
12090 tout(cct) << rdev << std::endl;
12091
11fdf7f2
TL
12092 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12093 "fuse_default_permissions");
12094 if (!fuse_default_permissions) {
7c673cae
FG
12095 int r = may_create(parent, perms);
12096 if (r < 0)
12097 return r;
12098 }
12099
12100 InodeRef in;
12101 int r = _mknod(parent, name, mode, rdev, perms, &in);
12102 if (r == 0) {
12103 fill_statx(in, caps, stx);
12104 _ll_get(in.get());
12105 }
12106 tout(cct) << stx->stx_ino << std::endl;
12107 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12108 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12109 *out = in.get();
12110 return r;
12111}
12112
12113int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12114 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12115 int object_size, const char *data_pool, bool *created,
12116 const UserPerm& perms)
12117{
1adf2230 12118 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
12119 mode << dec << ")" << dendl;
12120
12121 if (strlen(name) > NAME_MAX)
12122 return -ENAMETOOLONG;
12123 if (dir->snapid != CEPH_NOSNAP) {
12124 return -EROFS;
12125 }
12126 if (is_quota_files_exceeded(dir, perms)) {
12127 return -EDQUOT;
12128 }
12129
12130 // use normalized flags to generate cmode
11fdf7f2
TL
12131 int cflags = ceph_flags_sys2wire(flags);
12132 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12133 cflags |= CEPH_O_LAZY;
12134
12135 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
12136
12137 int64_t pool_id = -1;
12138 if (data_pool && *data_pool) {
12139 pool_id = objecter->with_osdmap(
12140 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12141 if (pool_id < 0)
12142 return -EINVAL;
12143 if (pool_id > 0xffffffffll)
12144 return -ERANGE; // bummer!
12145 }
12146
12147 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12148
12149 filepath path;
12150 dir->make_nosnap_relative_path(path);
12151 path.push_dentry(name);
12152 req->set_filepath(path);
12153 req->set_inode(dir);
11fdf7f2 12154 req->head.args.open.flags = cflags | CEPH_O_CREAT;
7c673cae
FG
12155
12156 req->head.args.open.stripe_unit = stripe_unit;
12157 req->head.args.open.stripe_count = stripe_count;
12158 req->head.args.open.object_size = object_size;
12159 if (cct->_conf->client_debug_getattr_caps)
12160 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12161 else
12162 req->head.args.open.mask = 0;
12163 req->head.args.open.pool = pool_id;
12164 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12165 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12166
12167 mode |= S_IFREG;
12168 bufferlist xattrs_bl;
12169 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12170 if (res < 0)
12171 goto fail;
12172 req->head.args.open.mode = mode;
12173 if (xattrs_bl.length() > 0)
12174 req->set_data(xattrs_bl);
12175
12176 Dentry *de;
12177 res = get_or_create(dir, name, &de);
12178 if (res < 0)
12179 goto fail;
12180 req->set_dentry(de);
12181
12182 res = make_request(req, perms, inp, created);
12183 if (res < 0) {
12184 goto reply_error;
12185 }
12186
12187 /* If the caller passed a value in fhp, do the open */
12188 if(fhp) {
12189 (*inp)->get_open_ref(cmode);
12190 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12191 }
12192
12193 reply_error:
12194 trim_cache();
12195
1adf2230 12196 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
12197 << " layout " << stripe_unit
12198 << ' ' << stripe_count
12199 << ' ' << object_size
12200 <<") = " << res << dendl;
12201 return res;
12202
12203 fail:
12204 put_request(req);
12205 return res;
12206}
12207
12208
12209int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12210 InodeRef *inp)
12211{
1adf2230 12212 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
12213 << mode << dec << ", uid " << perm.uid()
12214 << ", gid " << perm.gid() << ")" << dendl;
12215
12216 if (strlen(name) > NAME_MAX)
12217 return -ENAMETOOLONG;
12218
12219 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12220 return -EROFS;
12221 }
12222 if (is_quota_files_exceeded(dir, perm)) {
12223 return -EDQUOT;
12224 }
12225 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12226 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12227
12228 filepath path;
12229 dir->make_nosnap_relative_path(path);
12230 path.push_dentry(name);
12231 req->set_filepath(path);
12232 req->set_inode(dir);
12233 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12234 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12235
12236 mode |= S_IFDIR;
12237 bufferlist xattrs_bl;
12238 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12239 if (res < 0)
12240 goto fail;
12241 req->head.args.mkdir.mode = mode;
12242 if (xattrs_bl.length() > 0)
12243 req->set_data(xattrs_bl);
12244
12245 Dentry *de;
12246 res = get_or_create(dir, name, &de);
12247 if (res < 0)
12248 goto fail;
12249 req->set_dentry(de);
12250
12251 ldout(cct, 10) << "_mkdir: making request" << dendl;
12252 res = make_request(req, perm, inp);
12253 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12254
12255 trim_cache();
12256
1adf2230 12257 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
12258 return res;
12259
12260 fail:
12261 put_request(req);
12262 return res;
12263}
12264
12265int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12266 struct stat *attr, Inode **out, const UserPerm& perm)
12267{
11fdf7f2 12268 std::lock_guard lock(client_lock);
7c673cae 12269
181888fb
FG
12270 if (unmounting)
12271 return -ENOTCONN;
12272
7c673cae
FG
12273 vinodeno_t vparent = _get_vino(parent);
12274
12275 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12276 tout(cct) << "ll_mkdir" << std::endl;
12277 tout(cct) << vparent.ino.val << std::endl;
12278 tout(cct) << name << std::endl;
12279 tout(cct) << mode << std::endl;
12280
11fdf7f2
TL
12281 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12282 "fuse_default_permissions");
12283 if (!fuse_default_permissions) {
7c673cae
FG
12284 int r = may_create(parent, perm);
12285 if (r < 0)
12286 return r;
12287 }
12288
12289 InodeRef in;
12290 int r = _mkdir(parent, name, mode, perm, &in);
12291 if (r == 0) {
12292 fill_stat(in, attr);
12293 _ll_get(in.get());
12294 }
12295 tout(cct) << attr->st_ino << std::endl;
12296 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12297 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12298 *out = in.get();
12299 return r;
12300}
12301
12302int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12303 struct ceph_statx *stx, unsigned want, unsigned flags,
12304 const UserPerm& perms)
12305{
11fdf7f2 12306 std::lock_guard lock(client_lock);
7c673cae 12307
181888fb
FG
12308 if (unmounting)
12309 return -ENOTCONN;
12310
7c673cae
FG
12311 vinodeno_t vparent = _get_vino(parent);
12312
12313 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12314 tout(cct) << "ll_mkdirx" << std::endl;
12315 tout(cct) << vparent.ino.val << std::endl;
12316 tout(cct) << name << std::endl;
12317 tout(cct) << mode << std::endl;
12318
11fdf7f2
TL
12319 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12320 "fuse_default_permissions");
12321 if (!fuse_default_permissions) {
7c673cae
FG
12322 int r = may_create(parent, perms);
12323 if (r < 0)
12324 return r;
12325 }
12326
12327 InodeRef in;
12328 int r = _mkdir(parent, name, mode, perms, &in);
12329 if (r == 0) {
12330 fill_statx(in, statx_to_mask(flags, want), stx);
12331 _ll_get(in.get());
12332 } else {
12333 stx->stx_ino = 0;
12334 stx->stx_mask = 0;
12335 }
12336 tout(cct) << stx->stx_ino << std::endl;
12337 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12338 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12339 *out = in.get();
12340 return r;
12341}
12342
12343int Client::_symlink(Inode *dir, const char *name, const char *target,
12344 const UserPerm& perms, InodeRef *inp)
12345{
1adf2230 12346 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
12347 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12348 << dendl;
12349
12350 if (strlen(name) > NAME_MAX)
12351 return -ENAMETOOLONG;
12352
12353 if (dir->snapid != CEPH_NOSNAP) {
12354 return -EROFS;
12355 }
12356 if (is_quota_files_exceeded(dir, perms)) {
12357 return -EDQUOT;
12358 }
12359
12360 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12361
12362 filepath path;
12363 dir->make_nosnap_relative_path(path);
12364 path.push_dentry(name);
12365 req->set_filepath(path);
12366 req->set_inode(dir);
12367 req->set_string2(target);
12368 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12369 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12370
12371 Dentry *de;
12372 int res = get_or_create(dir, name, &de);
12373 if (res < 0)
12374 goto fail;
12375 req->set_dentry(de);
12376
12377 res = make_request(req, perms, inp);
12378
12379 trim_cache();
1adf2230 12380 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
12381 res << dendl;
12382 return res;
12383
12384 fail:
12385 put_request(req);
12386 return res;
12387}
12388
12389int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12390 struct stat *attr, Inode **out, const UserPerm& perms)
12391{
11fdf7f2 12392 std::lock_guard lock(client_lock);
7c673cae 12393
181888fb
FG
12394 if (unmounting)
12395 return -ENOTCONN;
12396
7c673cae
FG
12397 vinodeno_t vparent = _get_vino(parent);
12398
12399 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12400 << dendl;
12401 tout(cct) << "ll_symlink" << std::endl;
12402 tout(cct) << vparent.ino.val << std::endl;
12403 tout(cct) << name << std::endl;
12404 tout(cct) << value << std::endl;
12405
11fdf7f2
TL
12406 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12407 "fuse_default_permissions");
12408 if (!fuse_default_permissions) {
7c673cae
FG
12409 int r = may_create(parent, perms);
12410 if (r < 0)
12411 return r;
12412 }
12413
12414 InodeRef in;
12415 int r = _symlink(parent, name, value, perms, &in);
12416 if (r == 0) {
12417 fill_stat(in, attr);
12418 _ll_get(in.get());
12419 }
12420 tout(cct) << attr->st_ino << std::endl;
12421 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12422 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12423 *out = in.get();
12424 return r;
12425}
12426
12427int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12428 Inode **out, struct ceph_statx *stx, unsigned want,
12429 unsigned flags, const UserPerm& perms)
12430{
11fdf7f2 12431 std::lock_guard lock(client_lock);
7c673cae 12432
181888fb
FG
12433 if (unmounting)
12434 return -ENOTCONN;
12435
7c673cae
FG
12436 vinodeno_t vparent = _get_vino(parent);
12437
12438 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12439 << dendl;
12440 tout(cct) << "ll_symlinkx" << std::endl;
12441 tout(cct) << vparent.ino.val << std::endl;
12442 tout(cct) << name << std::endl;
12443 tout(cct) << value << std::endl;
12444
11fdf7f2
TL
12445 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12446 "fuse_default_permissions");
12447 if (!fuse_default_permissions) {
7c673cae
FG
12448 int r = may_create(parent, perms);
12449 if (r < 0)
12450 return r;
12451 }
12452
12453 InodeRef in;
12454 int r = _symlink(parent, name, value, perms, &in);
12455 if (r == 0) {
12456 fill_statx(in, statx_to_mask(flags, want), stx);
12457 _ll_get(in.get());
12458 }
12459 tout(cct) << stx->stx_ino << std::endl;
12460 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12461 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12462 *out = in.get();
12463 return r;
12464}
12465
12466int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12467{
1adf2230 12468 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12469 << " uid " << perm.uid() << " gid " << perm.gid()
12470 << ")" << dendl;
12471
12472 if (dir->snapid != CEPH_NOSNAP) {
12473 return -EROFS;
12474 }
12475
12476 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12477
12478 filepath path;
12479 dir->make_nosnap_relative_path(path);
12480 path.push_dentry(name);
12481 req->set_filepath(path);
12482
12483 InodeRef otherin;
b32b8144 12484 Inode *in;
7c673cae 12485 Dentry *de;
b32b8144 12486
7c673cae
FG
12487 int res = get_or_create(dir, name, &de);
12488 if (res < 0)
12489 goto fail;
12490 req->set_dentry(de);
12491 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12492 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12493
12494 res = _lookup(dir, name, 0, &otherin, perm);
12495 if (res < 0)
12496 goto fail;
b32b8144
FG
12497
12498 in = otherin.get();
12499 req->set_other_inode(in);
12500 in->break_all_delegs();
7c673cae
FG
12501 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12502
12503 req->set_inode(dir);
12504
12505 res = make_request(req, perm);
12506
12507 trim_cache();
1adf2230 12508 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12509 return res;
12510
12511 fail:
12512 put_request(req);
12513 return res;
12514}
12515
12516int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12517{
11fdf7f2 12518 std::lock_guard lock(client_lock);
7c673cae 12519
181888fb
FG
12520 if (unmounting)
12521 return -ENOTCONN;
12522
7c673cae
FG
12523 vinodeno_t vino = _get_vino(in);
12524
12525 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12526 tout(cct) << "ll_unlink" << std::endl;
12527 tout(cct) << vino.ino.val << std::endl;
12528 tout(cct) << name << std::endl;
12529
11fdf7f2
TL
12530 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12531 "fuse_default_permissions");
12532 if (!fuse_default_permissions) {
7c673cae
FG
12533 int r = may_delete(in, name, perm);
12534 if (r < 0)
12535 return r;
12536 }
12537 return _unlink(in, name, perm);
12538}
12539
12540int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12541{
1adf2230 12542 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12543 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12544
12545 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12546 return -EROFS;
12547 }
b32b8144
FG
12548
12549 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12550 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12551 filepath path;
12552 dir->make_nosnap_relative_path(path);
12553 path.push_dentry(name);
12554 req->set_filepath(path);
11fdf7f2 12555 req->set_inode(dir);
7c673cae
FG
12556
12557 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12558 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12559 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12560
12561 InodeRef in;
12562
12563 Dentry *de;
12564 int res = get_or_create(dir, name, &de);
12565 if (res < 0)
12566 goto fail;
b32b8144
FG
12567 if (op == CEPH_MDS_OP_RMDIR)
12568 req->set_dentry(de);
12569 else
12570 de->get();
12571
7c673cae
FG
12572 res = _lookup(dir, name, 0, &in, perms);
12573 if (res < 0)
12574 goto fail;
11fdf7f2
TL
12575
12576 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 12577 unlink(de, true, true);
b32b8144 12578 de->put();
7c673cae 12579 }
11fdf7f2 12580 req->set_other_inode(in.get());
7c673cae
FG
12581
12582 res = make_request(req, perms);
12583
12584 trim_cache();
1adf2230 12585 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12586 return res;
12587
12588 fail:
12589 put_request(req);
12590 return res;
12591}
12592
12593int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12594{
11fdf7f2 12595 std::lock_guard lock(client_lock);
7c673cae 12596
181888fb
FG
12597 if (unmounting)
12598 return -ENOTCONN;
12599
7c673cae
FG
12600 vinodeno_t vino = _get_vino(in);
12601
12602 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12603 tout(cct) << "ll_rmdir" << std::endl;
12604 tout(cct) << vino.ino.val << std::endl;
12605 tout(cct) << name << std::endl;
12606
11fdf7f2
TL
12607 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12608 "fuse_default_permissions");
12609 if (!fuse_default_permissions) {
7c673cae
FG
12610 int r = may_delete(in, name, perms);
12611 if (r < 0)
12612 return r;
12613 }
12614
12615 return _rmdir(in, name, perms);
12616}
12617
12618int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12619{
1adf2230 12620 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12621 << todir->ino << " " << toname
12622 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12623 << dendl;
12624
12625 if (fromdir->snapid != todir->snapid)
12626 return -EXDEV;
12627
12628 int op = CEPH_MDS_OP_RENAME;
12629 if (fromdir->snapid != CEPH_NOSNAP) {
12630 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12631 op = CEPH_MDS_OP_RENAMESNAP;
12632 else
12633 return -EROFS;
12634 }
12635 if (fromdir != todir) {
12636 Inode *fromdir_root =
12637 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12638 Inode *todir_root =
12639 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12640 if (fromdir_root != todir_root) {
12641 return -EXDEV;
12642 }
12643 }
12644
12645 InodeRef target;
12646 MetaRequest *req = new MetaRequest(op);
12647
12648 filepath from;
12649 fromdir->make_nosnap_relative_path(from);
12650 from.push_dentry(fromname);
12651 filepath to;
12652 todir->make_nosnap_relative_path(to);
12653 to.push_dentry(toname);
12654 req->set_filepath(to);
12655 req->set_filepath2(from);
12656
12657 Dentry *oldde;
12658 int res = get_or_create(fromdir, fromname, &oldde);
12659 if (res < 0)
12660 goto fail;
12661 Dentry *de;
12662 res = get_or_create(todir, toname, &de);
12663 if (res < 0)
12664 goto fail;
12665
12666 if (op == CEPH_MDS_OP_RENAME) {
12667 req->set_old_dentry(oldde);
12668 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12669 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12670
12671 req->set_dentry(de);
12672 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12673 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12674
12675 InodeRef oldin, otherin;
12676 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12677 if (res < 0)
12678 goto fail;
b32b8144
FG
12679
12680 Inode *oldinode = oldin.get();
12681 oldinode->break_all_delegs();
12682 req->set_old_inode(oldinode);
7c673cae
FG
12683 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12684
12685 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12686 switch (res) {
12687 case 0:
12688 {
12689 Inode *in = otherin.get();
12690 req->set_other_inode(in);
12691 in->break_all_delegs();
12692 }
7c673cae 12693 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12694 break;
12695 case -ENOENT:
12696 break;
12697 default:
12698 goto fail;
7c673cae
FG
12699 }
12700
12701 req->set_inode(todir);
12702 } else {
12703 // renamesnap reply contains no tracedn, so we need to invalidate
12704 // dentry manually
12705 unlink(oldde, true, true);
12706 unlink(de, true, true);
11fdf7f2
TL
12707
12708 req->set_inode(todir);
7c673cae
FG
12709 }
12710
12711 res = make_request(req, perm, &target);
12712 ldout(cct, 10) << "rename result is " << res << dendl;
12713
12714 // renamed item from our cache
12715
12716 trim_cache();
1adf2230 12717 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12718 return res;
12719
12720 fail:
12721 put_request(req);
12722 return res;
12723}
12724
12725int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12726 const char *newname, const UserPerm& perm)
12727{
11fdf7f2 12728 std::lock_guard lock(client_lock);
7c673cae 12729
181888fb
FG
12730 if (unmounting)
12731 return -ENOTCONN;
12732
7c673cae
FG
12733 vinodeno_t vparent = _get_vino(parent);
12734 vinodeno_t vnewparent = _get_vino(newparent);
12735
12736 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12737 << vnewparent << " " << newname << dendl;
12738 tout(cct) << "ll_rename" << std::endl;
12739 tout(cct) << vparent.ino.val << std::endl;
12740 tout(cct) << name << std::endl;
12741 tout(cct) << vnewparent.ino.val << std::endl;
12742 tout(cct) << newname << std::endl;
12743
11fdf7f2
TL
12744 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12745 "fuse_default_permissions");
12746 if (!fuse_default_permissions) {
7c673cae
FG
12747 int r = may_delete(parent, name, perm);
12748 if (r < 0)
12749 return r;
12750 r = may_delete(newparent, newname, perm);
12751 if (r < 0 && r != -ENOENT)
12752 return r;
12753 }
12754
12755 return _rename(parent, name, newparent, newname, perm);
12756}
12757
12758int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12759{
1adf2230 12760 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12761 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12762
12763 if (strlen(newname) > NAME_MAX)
12764 return -ENAMETOOLONG;
12765
12766 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12767 return -EROFS;
12768 }
12769 if (is_quota_files_exceeded(dir, perm)) {
12770 return -EDQUOT;
12771 }
12772
b32b8144 12773 in->break_all_delegs();
7c673cae
FG
12774 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12775
12776 filepath path(newname, dir->ino);
12777 req->set_filepath(path);
12778 filepath existing(in->ino);
12779 req->set_filepath2(existing);
12780
12781 req->set_inode(dir);
12782 req->inode_drop = CEPH_CAP_FILE_SHARED;
12783 req->inode_unless = CEPH_CAP_FILE_EXCL;
12784
12785 Dentry *de;
12786 int res = get_or_create(dir, newname, &de);
12787 if (res < 0)
12788 goto fail;
12789 req->set_dentry(de);
12790
12791 res = make_request(req, perm, inp);
12792 ldout(cct, 10) << "link result is " << res << dendl;
12793
12794 trim_cache();
1adf2230 12795 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12796 return res;
12797
12798 fail:
12799 put_request(req);
12800 return res;
12801}
12802
12803int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12804 const UserPerm& perm)
12805{
11fdf7f2 12806 std::lock_guard lock(client_lock);
7c673cae 12807
181888fb
FG
12808 if (unmounting)
12809 return -ENOTCONN;
12810
7c673cae
FG
12811 vinodeno_t vino = _get_vino(in);
12812 vinodeno_t vnewparent = _get_vino(newparent);
12813
31f18b77 12814 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12815 newname << dendl;
12816 tout(cct) << "ll_link" << std::endl;
12817 tout(cct) << vino.ino.val << std::endl;
12818 tout(cct) << vnewparent << std::endl;
12819 tout(cct) << newname << std::endl;
12820
7c673cae
FG
12821 InodeRef target;
12822
11fdf7f2
TL
12823 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12824 "fuse_default_permissions");
12825 if (!fuse_default_permissions) {
7c673cae
FG
12826 if (S_ISDIR(in->mode))
12827 return -EPERM;
12828
11fdf7f2 12829 int r = may_hardlink(in, perm);
7c673cae
FG
12830 if (r < 0)
12831 return r;
12832
12833 r = may_create(newparent, perm);
12834 if (r < 0)
12835 return r;
12836 }
12837
12838 return _link(in, newparent, newname, perm, &target);
12839}
12840
12841int Client::ll_num_osds(void)
12842{
11fdf7f2 12843 std::lock_guard lock(client_lock);
7c673cae
FG
12844 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12845}
12846
12847int Client::ll_osdaddr(int osd, uint32_t *addr)
12848{
11fdf7f2 12849 std::lock_guard lock(client_lock);
181888fb 12850
7c673cae
FG
12851 entity_addr_t g;
12852 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12853 if (!o.exists(osd))
12854 return false;
11fdf7f2 12855 g = o.get_addrs(osd).front();
7c673cae
FG
12856 return true;
12857 });
12858 if (!exists)
12859 return -1;
12860 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12861 *addr = ntohl(nb_addr);
12862 return 0;
12863}
181888fb 12864
7c673cae
FG
12865uint32_t Client::ll_stripe_unit(Inode *in)
12866{
11fdf7f2 12867 std::lock_guard lock(client_lock);
7c673cae
FG
12868 return in->layout.stripe_unit;
12869}
12870
12871uint64_t Client::ll_snap_seq(Inode *in)
12872{
11fdf7f2 12873 std::lock_guard lock(client_lock);
7c673cae
FG
12874 return in->snaprealm->seq;
12875}
12876
12877int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12878{
11fdf7f2 12879 std::lock_guard lock(client_lock);
7c673cae
FG
12880 *layout = in->layout;
12881 return 0;
12882}
12883
12884int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12885{
12886 return ll_file_layout(fh->inode.get(), layout);
12887}
12888
12889/* Currently we cannot take advantage of redundancy in reads, since we
12890 would have to go through all possible placement groups (a
12891 potentially quite large number determined by a hash), and use CRUSH
12892 to calculate the appropriate set of OSDs for each placement group,
12893 then index into that. An array with one entry per OSD is much more
12894 tractable and works for demonstration purposes. */
12895
12896int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12897 file_layout_t* layout)
12898{
11fdf7f2 12899 std::lock_guard lock(client_lock);
181888fb 12900
28e407b8 12901 inodeno_t ino = in->ino;
7c673cae
FG
12902 uint32_t object_size = layout->object_size;
12903 uint32_t su = layout->stripe_unit;
12904 uint32_t stripe_count = layout->stripe_count;
12905 uint64_t stripes_per_object = object_size / su;
11fdf7f2 12906 uint64_t stripeno = 0, stripepos = 0;
7c673cae 12907
11fdf7f2
TL
12908 if(stripe_count) {
12909 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12910 stripepos = blockno % stripe_count; // which object in the object set (X)
12911 }
7c673cae
FG
12912 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12913 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12914
12915 object_t oid = file_object_t(ino, objectno);
12916 return objecter->with_osdmap([&](const OSDMap& o) {
12917 ceph_object_layout olayout =
12918 o.file_to_object_layout(oid, *layout);
12919 pg_t pg = (pg_t)olayout.ol_pgid;
12920 vector<int> osds;
12921 int primary;
12922 o.pg_to_acting_osds(pg, &osds, &primary);
12923 return primary;
12924 });
12925}
12926
12927/* Return the offset of the block, internal to the object */
12928
12929uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12930{
11fdf7f2 12931 std::lock_guard lock(client_lock);
7c673cae
FG
12932 file_layout_t *layout=&(in->layout);
12933 uint32_t object_size = layout->object_size;
12934 uint32_t su = layout->stripe_unit;
12935 uint64_t stripes_per_object = object_size / su;
12936
12937 return (blockno % stripes_per_object) * su;
12938}
12939
12940int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12941 const UserPerm& perms)
12942{
11fdf7f2 12943 std::lock_guard lock(client_lock);
7c673cae 12944
181888fb
FG
12945 if (unmounting)
12946 return -ENOTCONN;
12947
7c673cae
FG
12948 vinodeno_t vino = _get_vino(in);
12949
12950 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12951 tout(cct) << "ll_opendir" << std::endl;
12952 tout(cct) << vino.ino.val << std::endl;
12953
11fdf7f2
TL
12954 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12955 "fuse_default_permissions");
12956 if (!fuse_default_permissions) {
7c673cae
FG
12957 int r = may_open(in, flags, perms);
12958 if (r < 0)
12959 return r;
12960 }
12961
12962 int r = _opendir(in, dirpp, perms);
12963 tout(cct) << (unsigned long)*dirpp << std::endl;
12964
12965 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12966 << dendl;
12967 return r;
12968}
12969
12970int Client::ll_releasedir(dir_result_t *dirp)
12971{
11fdf7f2 12972 std::lock_guard lock(client_lock);
7c673cae
FG
12973 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12974 tout(cct) << "ll_releasedir" << std::endl;
12975 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12976
12977 if (unmounting)
12978 return -ENOTCONN;
12979
7c673cae
FG
12980 _closedir(dirp);
12981 return 0;
12982}
12983
12984int Client::ll_fsyncdir(dir_result_t *dirp)
12985{
11fdf7f2 12986 std::lock_guard lock(client_lock);
7c673cae
FG
12987 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12988 tout(cct) << "ll_fsyncdir" << std::endl;
12989 tout(cct) << (unsigned long)dirp << std::endl;
12990
181888fb
FG
12991 if (unmounting)
12992 return -ENOTCONN;
12993
7c673cae
FG
12994 return _fsync(dirp->inode.get(), false);
12995}
12996
12997int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12998{
11fdf7f2 12999 ceph_assert(!(flags & O_CREAT));
7c673cae 13000
11fdf7f2 13001 std::lock_guard lock(client_lock);
7c673cae 13002
181888fb
FG
13003 if (unmounting)
13004 return -ENOTCONN;
13005
7c673cae
FG
13006 vinodeno_t vino = _get_vino(in);
13007
13008 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13009 tout(cct) << "ll_open" << std::endl;
13010 tout(cct) << vino.ino.val << std::endl;
13011 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13012
13013 int r;
11fdf7f2
TL
13014 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13015 "fuse_default_permissions");
13016 if (!fuse_default_permissions) {
7c673cae
FG
13017 r = may_open(in, flags, perms);
13018 if (r < 0)
13019 goto out;
13020 }
13021
13022 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13023
13024 out:
13025 Fh *fhptr = fhp ? *fhp : NULL;
13026 if (fhptr) {
13027 ll_unclosed_fh_set.insert(fhptr);
13028 }
13029 tout(cct) << (unsigned long)fhptr << std::endl;
13030 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13031 " = " << r << " (" << fhptr << ")" << dendl;
13032 return r;
13033}
13034
13035int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13036 int flags, InodeRef *in, int caps, Fh **fhp,
13037 const UserPerm& perms)
13038{
13039 *fhp = NULL;
13040
13041 vinodeno_t vparent = _get_vino(parent);
13042
1adf2230 13043 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
13044 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13045 << ", gid " << perms.gid() << dendl;
13046 tout(cct) << "ll_create" << std::endl;
13047 tout(cct) << vparent.ino.val << std::endl;
13048 tout(cct) << name << std::endl;
13049 tout(cct) << mode << std::endl;
13050 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13051
13052 bool created = false;
13053 int r = _lookup(parent, name, caps, in, perms);
13054
13055 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13056 return -EEXIST;
13057
13058 if (r == -ENOENT && (flags & O_CREAT)) {
11fdf7f2
TL
13059 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13060 "fuse_default_permissions");
13061 if (!fuse_default_permissions) {
7c673cae
FG
13062 r = may_create(parent, perms);
13063 if (r < 0)
13064 goto out;
13065 }
13066 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13067 perms);
13068 if (r < 0)
13069 goto out;
13070 }
13071
13072 if (r < 0)
13073 goto out;
13074
11fdf7f2 13075 ceph_assert(*in);
7c673cae
FG
13076
13077 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13078 if (!created) {
11fdf7f2
TL
13079 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13080 "fuse_default_permissions");
13081 if (!fuse_default_permissions) {
7c673cae
FG
13082 r = may_open(in->get(), flags, perms);
13083 if (r < 0) {
13084 if (*fhp) {
13085 int release_r = _release_fh(*fhp);
11fdf7f2 13086 ceph_assert(release_r == 0); // during create, no async data ops should have happened
7c673cae
FG
13087 }
13088 goto out;
13089 }
13090 }
13091 if (*fhp == NULL) {
13092 r = _open(in->get(), flags, mode, fhp, perms);
13093 if (r < 0)
13094 goto out;
13095 }
13096 }
13097
13098out:
13099 if (*fhp) {
13100 ll_unclosed_fh_set.insert(*fhp);
13101 }
13102
13103 ino_t ino = 0;
13104 if (r >= 0) {
13105 Inode *inode = in->get();
13106 if (use_faked_inos())
13107 ino = inode->faked_ino;
13108 else
13109 ino = inode->ino;
13110 }
13111
13112 tout(cct) << (unsigned long)*fhp << std::endl;
13113 tout(cct) << ino << std::endl;
1adf2230 13114 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
13115 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13116 *fhp << " " << hex << ino << dec << ")" << dendl;
13117
13118 return r;
13119}
13120
13121int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13122 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13123 const UserPerm& perms)
13124{
11fdf7f2 13125 std::lock_guard lock(client_lock);
7c673cae
FG
13126 InodeRef in;
13127
181888fb
FG
13128 if (unmounting)
13129 return -ENOTCONN;
13130
7c673cae
FG
13131 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13132 fhp, perms);
13133 if (r >= 0) {
11fdf7f2 13134 ceph_assert(in);
7c673cae
FG
13135
13136 // passing an Inode in outp requires an additional ref
13137 if (outp) {
13138 _ll_get(in.get());
13139 *outp = in.get();
13140 }
13141 fill_stat(in, attr);
13142 } else {
13143 attr->st_ino = 0;
13144 }
13145
13146 return r;
13147}
13148
13149int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13150 int oflags, Inode **outp, Fh **fhp,
13151 struct ceph_statx *stx, unsigned want, unsigned lflags,
13152 const UserPerm& perms)
13153{
13154 unsigned caps = statx_to_mask(lflags, want);
11fdf7f2 13155 std::lock_guard lock(client_lock);
7c673cae
FG
13156 InodeRef in;
13157
181888fb
FG
13158 if (unmounting)
13159 return -ENOTCONN;
7c673cae
FG
13160
13161 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13162 if (r >= 0) {
11fdf7f2 13163 ceph_assert(in);
7c673cae
FG
13164
13165 // passing an Inode in outp requires an additional ref
13166 if (outp) {
13167 _ll_get(in.get());
13168 *outp = in.get();
13169 }
13170 fill_statx(in, caps, stx);
13171 } else {
13172 stx->stx_ino = 0;
13173 stx->stx_mask = 0;
13174 }
13175
13176 return r;
13177}
13178
13179loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13180{
11fdf7f2 13181 std::lock_guard lock(client_lock);
7c673cae
FG
13182 tout(cct) << "ll_lseek" << std::endl;
13183 tout(cct) << offset << std::endl;
13184 tout(cct) << whence << std::endl;
13185
181888fb
FG
13186 if (unmounting)
13187 return -ENOTCONN;
13188
7c673cae
FG
13189 return _lseek(fh, offset, whence);
13190}
13191
13192int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13193{
11fdf7f2 13194 std::lock_guard lock(client_lock);
7c673cae
FG
13195 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13196 tout(cct) << "ll_read" << std::endl;
13197 tout(cct) << (unsigned long)fh << std::endl;
13198 tout(cct) << off << std::endl;
13199 tout(cct) << len << std::endl;
13200
181888fb
FG
13201 if (unmounting)
13202 return -ENOTCONN;
13203
11fdf7f2
TL
13204 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13205 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13206 return _read(fh, off, len, bl);
13207}
13208
13209int Client::ll_read_block(Inode *in, uint64_t blockid,
13210 char *buf,
13211 uint64_t offset,
13212 uint64_t length,
13213 file_layout_t* layout)
13214{
11fdf7f2 13215 std::lock_guard lock(client_lock);
181888fb
FG
13216
13217 if (unmounting)
13218 return -ENOTCONN;
13219
b32b8144 13220 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13221 object_t oid = file_object_t(vino.ino, blockid);
13222 C_SaferCond onfinish;
13223 bufferlist bl;
13224
13225 objecter->read(oid,
13226 object_locator_t(layout->pool_id),
13227 offset,
13228 length,
13229 vino.snapid,
13230 &bl,
13231 CEPH_OSD_FLAG_READ,
13232 &onfinish);
13233
13234 client_lock.Unlock();
13235 int r = onfinish.wait();
13236 client_lock.Lock();
13237
13238 if (r >= 0) {
13239 bl.copy(0, bl.length(), buf);
13240 r = bl.length();
13241 }
13242
13243 return r;
13244}
13245
13246/* It appears that the OSD doesn't return success unless the entire
13247 buffer was written, return the write length on success. */
13248
13249int Client::ll_write_block(Inode *in, uint64_t blockid,
13250 char* buf, uint64_t offset,
13251 uint64_t length, file_layout_t* layout,
13252 uint64_t snapseq, uint32_t sync)
13253{
7c673cae 13254 vinodeno_t vino = ll_get_vino(in);
7c673cae 13255 int r = 0;
11fdf7f2
TL
13256 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13257
7c673cae
FG
13258 if (length == 0) {
13259 return -EINVAL;
13260 }
13261 if (true || sync) {
13262 /* if write is stable, the epilogue is waiting on
13263 * flock */
11fdf7f2 13264 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
13265 }
13266 object_t oid = file_object_t(vino.ino, blockid);
13267 SnapContext fakesnap;
11fdf7f2
TL
13268 ceph::bufferlist bl;
13269 if (length > 0) {
13270 bl.push_back(buffer::copy(buf, length));
13271 }
7c673cae
FG
13272
13273 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13274 << dendl;
13275
13276 fakesnap.seq = snapseq;
13277
13278 /* lock just in time */
13279 client_lock.Lock();
181888fb
FG
13280 if (unmounting) {
13281 client_lock.Unlock();
181888fb
FG
13282 return -ENOTCONN;
13283 }
7c673cae
FG
13284
13285 objecter->write(oid,
13286 object_locator_t(layout->pool_id),
13287 offset,
13288 length,
13289 fakesnap,
13290 bl,
13291 ceph::real_clock::now(),
13292 0,
11fdf7f2 13293 onsafe.get());
7c673cae
FG
13294
13295 client_lock.Unlock();
11fdf7f2
TL
13296 if (nullptr != onsafe) {
13297 r = onsafe->wait();
7c673cae
FG
13298 }
13299
13300 if (r < 0) {
13301 return r;
13302 } else {
13303 return length;
13304 }
13305}
13306
13307int Client::ll_commit_blocks(Inode *in,
13308 uint64_t offset,
13309 uint64_t length)
13310{
11fdf7f2 13311 std::lock_guard lock(client_lock);
7c673cae
FG
13312 /*
13313 BarrierContext *bctx;
b32b8144 13314 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13315 uint64_t ino = vino.ino;
13316
13317 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13318 << offset << " to " << length << dendl;
13319
13320 if (length == 0) {
13321 return -EINVAL;
13322 }
13323
13324 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13325 if (p != barriers.end()) {
13326 barrier_interval civ(offset, offset + length);
13327 p->second->commit_barrier(civ);
13328 }
13329 */
13330 return 0;
13331}
13332
13333int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13334{
11fdf7f2 13335 std::lock_guard lock(client_lock);
7c673cae
FG
13336 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13337 "~" << len << dendl;
13338 tout(cct) << "ll_write" << std::endl;
13339 tout(cct) << (unsigned long)fh << std::endl;
13340 tout(cct) << off << std::endl;
13341 tout(cct) << len << std::endl;
13342
181888fb
FG
13343 if (unmounting)
13344 return -ENOTCONN;
13345
11fdf7f2
TL
13346 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13347 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13348 int r = _write(fh, off, len, data, NULL, 0);
13349 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13350 << dendl;
13351 return r;
13352}
13353
11fdf7f2
TL
13354int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13355{
13356 std::lock_guard lock(client_lock);
13357 if (unmounting)
13358 return -ENOTCONN;
13359 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13360}
13361
13362int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13363{
13364 std::lock_guard lock(client_lock);
13365 if (unmounting)
13366 return -ENOTCONN;
13367 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13368}
13369
7c673cae
FG
13370int Client::ll_flush(Fh *fh)
13371{
11fdf7f2 13372 std::lock_guard lock(client_lock);
7c673cae
FG
13373 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13374 tout(cct) << "ll_flush" << std::endl;
13375 tout(cct) << (unsigned long)fh << std::endl;
13376
181888fb
FG
13377 if (unmounting)
13378 return -ENOTCONN;
13379
7c673cae
FG
13380 return _flush(fh);
13381}
13382
13383int Client::ll_fsync(Fh *fh, bool syncdataonly)
13384{
11fdf7f2 13385 std::lock_guard lock(client_lock);
7c673cae
FG
13386 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13387 tout(cct) << "ll_fsync" << std::endl;
13388 tout(cct) << (unsigned long)fh << std::endl;
13389
181888fb
FG
13390 if (unmounting)
13391 return -ENOTCONN;
13392
7c673cae
FG
13393 int r = _fsync(fh, syncdataonly);
13394 if (r) {
13395 // If we're returning an error, clear it from the FH
13396 fh->take_async_err();
13397 }
13398 return r;
13399}
13400
28e407b8
AA
13401int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13402{
11fdf7f2 13403 std::lock_guard lock(client_lock);
28e407b8
AA
13404 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13405 tout(cct) << "ll_sync_inode" << std::endl;
13406 tout(cct) << (unsigned long)in << std::endl;
13407
13408 if (unmounting)
13409 return -ENOTCONN;
13410
13411 return _fsync(in, syncdataonly);
13412}
13413
7c673cae
FG
13414#ifdef FALLOC_FL_PUNCH_HOLE
13415
13416int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13417{
13418 if (offset < 0 || length <= 0)
13419 return -EINVAL;
13420
13421 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13422 return -EOPNOTSUPP;
13423
13424 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13425 return -EOPNOTSUPP;
13426
13427 Inode *in = fh->inode.get();
13428
13429 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13430 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13431 return -ENOSPC;
13432 }
13433
13434 if (in->snapid != CEPH_NOSNAP)
13435 return -EROFS;
13436
13437 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13438 return -EBADF;
13439
13440 uint64_t size = offset + length;
13441 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13442 size > in->size &&
11fdf7f2 13443 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
7c673cae
FG
13444 return -EDQUOT;
13445 }
13446
13447 int have;
13448 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13449 if (r < 0)
13450 return r;
13451
11fdf7f2 13452 std::unique_ptr<C_SaferCond> onuninline = nullptr;
7c673cae
FG
13453 if (mode & FALLOC_FL_PUNCH_HOLE) {
13454 if (in->inline_version < CEPH_INLINE_NONE &&
13455 (have & CEPH_CAP_FILE_BUFFER)) {
13456 bufferlist bl;
13457 int len = in->inline_data.length();
13458 if (offset < len) {
13459 if (offset > 0)
13460 in->inline_data.copy(0, offset, bl);
13461 int size = length;
13462 if (offset + size > len)
13463 size = len - offset;
13464 if (size > 0)
13465 bl.append_zero(size);
13466 if (offset + size < len)
13467 in->inline_data.copy(offset + size, len - offset - size, bl);
13468 in->inline_data = bl;
13469 in->inline_version++;
13470 }
91327a77 13471 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13472 in->change_attr++;
28e407b8 13473 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13474 } else {
13475 if (in->inline_version < CEPH_INLINE_NONE) {
11fdf7f2
TL
13476 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13477 uninline_data(in, onuninline.get());
7c673cae
FG
13478 }
13479
11fdf7f2 13480 C_SaferCond onfinish("Client::_punch_hole flock");
7c673cae
FG
13481
13482 unsafe_sync_write++;
13483 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13484
13485 _invalidate_inode_cache(in, offset, length);
13486 filer->zero(in->ino, &in->layout,
13487 in->snaprealm->get_snap_context(),
13488 offset, length,
13489 ceph::real_clock::now(),
11fdf7f2 13490 0, true, &onfinish);
91327a77 13491 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13492 in->change_attr++;
28e407b8 13493 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13494
13495 client_lock.Unlock();
11fdf7f2 13496 onfinish.wait();
7c673cae
FG
13497 client_lock.Lock();
13498 _sync_write_commit(in);
13499 }
13500 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13501 uint64_t size = offset + length;
13502 if (size > in->size) {
13503 in->size = size;
91327a77 13504 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13505 in->change_attr++;
28e407b8 13506 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 13507
11fdf7f2 13508 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
7c673cae 13509 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
13510 } else if (is_max_size_approaching(in)) {
13511 check_caps(in, 0);
7c673cae
FG
13512 }
13513 }
13514 }
13515
11fdf7f2 13516 if (nullptr != onuninline) {
7c673cae 13517 client_lock.Unlock();
11fdf7f2 13518 int ret = onuninline->wait();
7c673cae
FG
13519 client_lock.Lock();
13520
11fdf7f2 13521 if (ret >= 0 || ret == -ECANCELED) {
7c673cae
FG
13522 in->inline_data.clear();
13523 in->inline_version = CEPH_INLINE_NONE;
28e407b8 13524 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13525 check_caps(in, 0);
13526 } else
11fdf7f2 13527 r = ret;
7c673cae
FG
13528 }
13529
13530 put_cap_ref(in, CEPH_CAP_FILE_WR);
13531 return r;
13532}
13533#else
13534
13535int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13536{
13537 return -EOPNOTSUPP;
13538}
13539
13540#endif
13541
13542
11fdf7f2 13543int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 13544{
11fdf7f2
TL
13545 std::lock_guard lock(client_lock);
13546 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13547 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
7c673cae
FG
13548 tout(cct) << (unsigned long)fh << std::endl;
13549
181888fb
FG
13550 if (unmounting)
13551 return -ENOTCONN;
13552
7c673cae
FG
13553 return _fallocate(fh, mode, offset, length);
13554}
13555
13556int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13557{
11fdf7f2
TL
13558 std::lock_guard lock(client_lock);
13559 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13560
181888fb
FG
13561 if (unmounting)
13562 return -ENOTCONN;
13563
7c673cae
FG
13564 Fh *fh = get_filehandle(fd);
13565 if (!fh)
13566 return -EBADF;
13567#if defined(__linux__) && defined(O_PATH)
13568 if (fh->flags & O_PATH)
13569 return -EBADF;
13570#endif
13571 return _fallocate(fh, mode, offset, length);
13572}
13573
13574int Client::ll_release(Fh *fh)
13575{
11fdf7f2 13576 std::lock_guard lock(client_lock);
91327a77
AA
13577
13578 if (unmounting)
13579 return -ENOTCONN;
13580
11fdf7f2 13581 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 13582 dendl;
11fdf7f2 13583 tout(cct) << __func__ << " (fh)" << std::endl;
7c673cae
FG
13584 tout(cct) << (unsigned long)fh << std::endl;
13585
13586 if (ll_unclosed_fh_set.count(fh))
13587 ll_unclosed_fh_set.erase(fh);
13588 return _release_fh(fh);
13589}
13590
13591int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13592{
11fdf7f2 13593 std::lock_guard lock(client_lock);
7c673cae
FG
13594
13595 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13596 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13597
181888fb
FG
13598 if (unmounting)
13599 return -ENOTCONN;
13600
7c673cae
FG
13601 return _getlk(fh, fl, owner);
13602}
13603
13604int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13605{
11fdf7f2 13606 std::lock_guard lock(client_lock);
7c673cae 13607
11fdf7f2
TL
13608 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13609 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13610
181888fb
FG
13611 if (unmounting)
13612 return -ENOTCONN;
13613
7c673cae
FG
13614 return _setlk(fh, fl, owner, sleep);
13615}
13616
13617int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13618{
11fdf7f2 13619 std::lock_guard lock(client_lock);
7c673cae 13620
11fdf7f2
TL
13621 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13622 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13623
181888fb
FG
13624 if (unmounting)
13625 return -ENOTCONN;
13626
7c673cae
FG
13627 return _flock(fh, cmd, owner);
13628}
13629
b32b8144
FG
13630int Client::set_deleg_timeout(uint32_t timeout)
13631{
11fdf7f2 13632 std::lock_guard lock(client_lock);
b32b8144
FG
13633
13634 /*
13635 * The whole point is to prevent blacklisting so we must time out the
13636 * delegation before the session autoclose timeout kicks in.
13637 */
13638 if (timeout >= mdsmap->get_session_autoclose())
13639 return -EINVAL;
13640
13641 deleg_timeout = timeout;
13642 return 0;
13643}
13644
13645int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13646{
13647 int ret = -EINVAL;
13648
11fdf7f2 13649 std::lock_guard lock(client_lock);
b32b8144
FG
13650
13651 if (!mounted)
13652 return -ENOTCONN;
13653
13654 Inode *inode = fh->inode.get();
13655
13656 switch(cmd) {
13657 case CEPH_DELEGATION_NONE:
13658 inode->unset_deleg(fh);
13659 ret = 0;
13660 break;
13661 default:
13662 try {
13663 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 13664 } catch (std::bad_alloc&) {
b32b8144
FG
13665 ret = -ENOMEM;
13666 }
13667 break;
13668 }
13669 return ret;
13670}
13671
7c673cae
FG
13672class C_Client_RequestInterrupt : public Context {
13673private:
13674 Client *client;
13675 MetaRequest *req;
13676public:
13677 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13678 req->get();
13679 }
13680 void finish(int r) override {
11fdf7f2
TL
13681 std::lock_guard l(client->client_lock);
13682 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
13683 client->_interrupt_filelock(req);
13684 client->put_request(req);
13685 }
13686};
13687
13688void Client::ll_interrupt(void *d)
13689{
13690 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
13691 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13692 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
13693 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13694}
13695
13696// =========================================
13697// layout
13698
13699// expose file layouts
13700
13701int Client::describe_layout(const char *relpath, file_layout_t *lp,
13702 const UserPerm& perms)
13703{
11fdf7f2 13704 std::lock_guard lock(client_lock);
7c673cae 13705
181888fb
FG
13706 if (unmounting)
13707 return -ENOTCONN;
13708
7c673cae
FG
13709 filepath path(relpath);
13710 InodeRef in;
13711 int r = path_walk(path, &in, perms);
13712 if (r < 0)
13713 return r;
13714
13715 *lp = in->layout;
13716
11fdf7f2 13717 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
13718 return 0;
13719}
13720
13721int Client::fdescribe_layout(int fd, file_layout_t *lp)
13722{
11fdf7f2 13723 std::lock_guard lock(client_lock);
7c673cae 13724
181888fb
FG
13725 if (unmounting)
13726 return -ENOTCONN;
13727
7c673cae
FG
13728 Fh *f = get_filehandle(fd);
13729 if (!f)
13730 return -EBADF;
13731 Inode *in = f->inode.get();
13732
13733 *lp = in->layout;
13734
11fdf7f2 13735 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
13736 return 0;
13737}
13738
d2e6a577
FG
13739int64_t Client::get_default_pool_id()
13740{
11fdf7f2 13741 std::lock_guard lock(client_lock);
181888fb
FG
13742
13743 if (unmounting)
13744 return -ENOTCONN;
13745
d2e6a577
FG
13746 /* first data pool is the default */
13747 return mdsmap->get_first_data_pool();
13748}
7c673cae
FG
13749
13750// expose osdmap
13751
13752int64_t Client::get_pool_id(const char *pool_name)
13753{
11fdf7f2 13754 std::lock_guard lock(client_lock);
181888fb
FG
13755
13756 if (unmounting)
13757 return -ENOTCONN;
13758
7c673cae
FG
13759 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13760 pool_name);
13761}
13762
13763string Client::get_pool_name(int64_t pool)
13764{
11fdf7f2 13765 std::lock_guard lock(client_lock);
181888fb
FG
13766
13767 if (unmounting)
13768 return string();
13769
7c673cae
FG
13770 return objecter->with_osdmap([pool](const OSDMap& o) {
13771 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13772 });
13773}
13774
13775int Client::get_pool_replication(int64_t pool)
13776{
11fdf7f2 13777 std::lock_guard lock(client_lock);
181888fb
FG
13778
13779 if (unmounting)
13780 return -ENOTCONN;
13781
7c673cae
FG
13782 return objecter->with_osdmap([pool](const OSDMap& o) {
13783 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13784 });
13785}
13786
13787int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13788{
11fdf7f2 13789 std::lock_guard lock(client_lock);
7c673cae 13790
181888fb
FG
13791 if (unmounting)
13792 return -ENOTCONN;
13793
7c673cae
FG
13794 Fh *f = get_filehandle(fd);
13795 if (!f)
13796 return -EBADF;
13797 Inode *in = f->inode.get();
13798
13799 vector<ObjectExtent> extents;
13800 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 13801 ceph_assert(extents.size() == 1);
7c673cae
FG
13802
13803 objecter->with_osdmap([&](const OSDMap& o) {
13804 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13805 o.pg_to_acting_osds(pg, osds);
13806 });
13807
13808 if (osds.empty())
13809 return -EINVAL;
13810
13811 /*
13812 * Return the remainder of the extent (stripe unit)
13813 *
13814 * If length = 1 is passed to Striper::file_to_extents we get a single
13815 * extent back, but its length is one so we still need to compute the length
13816 * to the end of the stripe unit.
13817 *
13818 * If length = su then we may get 1 or 2 objects back in the extents vector
13819 * which would have to be examined. Even then, the offsets are local to the
13820 * object, so matching up to the file offset is extra work.
13821 *
13822 * It seems simpler to stick with length = 1 and manually compute the
13823 * remainder.
13824 */
13825 if (len) {
13826 uint64_t su = in->layout.stripe_unit;
13827 *len = su - (off % su);
13828 }
13829
13830 return 0;
13831}
13832
13833int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13834{
11fdf7f2 13835 std::lock_guard lock(client_lock);
181888fb
FG
13836
13837 if (unmounting)
13838 return -ENOTCONN;
13839
7c673cae
FG
13840 if (id < 0)
13841 return -EINVAL;
13842 return objecter->with_osdmap([&](const OSDMap& o) {
13843 return o.crush->get_full_location_ordered(id, path);
13844 });
13845}
13846
13847int Client::get_file_stripe_address(int fd, loff_t offset,
13848 vector<entity_addr_t>& address)
13849{
11fdf7f2 13850 std::lock_guard lock(client_lock);
7c673cae 13851
181888fb
FG
13852 if (unmounting)
13853 return -ENOTCONN;
13854
7c673cae
FG
13855 Fh *f = get_filehandle(fd);
13856 if (!f)
13857 return -EBADF;
13858 Inode *in = f->inode.get();
13859
13860 // which object?
13861 vector<ObjectExtent> extents;
13862 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13863 in->truncate_size, extents);
11fdf7f2 13864 ceph_assert(extents.size() == 1);
7c673cae
FG
13865
13866 // now we have the object and its 'layout'
13867 return objecter->with_osdmap([&](const OSDMap& o) {
13868 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13869 vector<int> osds;
13870 o.pg_to_acting_osds(pg, osds);
13871 if (osds.empty())
13872 return -EINVAL;
13873 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 13874 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
13875 address.push_back(addr);
13876 }
13877 return 0;
13878 });
13879}
13880
13881int Client::get_osd_addr(int osd, entity_addr_t& addr)
13882{
11fdf7f2 13883 std::lock_guard lock(client_lock);
181888fb
FG
13884
13885 if (unmounting)
13886 return -ENOTCONN;
13887
7c673cae
FG
13888 return objecter->with_osdmap([&](const OSDMap& o) {
13889 if (!o.exists(osd))
13890 return -ENOENT;
13891
11fdf7f2 13892 addr = o.get_addrs(osd).front();
7c673cae
FG
13893 return 0;
13894 });
13895}
13896
13897int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13898 loff_t length, loff_t offset)
13899{
11fdf7f2 13900 std::lock_guard lock(client_lock);
7c673cae 13901
181888fb
FG
13902 if (unmounting)
13903 return -ENOTCONN;
13904
7c673cae
FG
13905 Fh *f = get_filehandle(fd);
13906 if (!f)
13907 return -EBADF;
13908 Inode *in = f->inode.get();
13909
13910 // map to a list of extents
13911 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13912
11fdf7f2 13913 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
13914 return 0;
13915}
13916
13917
b32b8144 13918/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13919int Client::get_local_osd()
13920{
11fdf7f2 13921 std::lock_guard lock(client_lock);
181888fb
FG
13922
13923 if (unmounting)
13924 return -ENOTCONN;
13925
7c673cae
FG
13926 objecter->with_osdmap([this](const OSDMap& o) {
13927 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 13928 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
13929 local_osd_epoch = o.get_epoch();
13930 }
13931 });
13932 return local_osd;
13933}
13934
13935
13936
13937
13938
13939
13940// ===============================
13941
13942void Client::ms_handle_connect(Connection *con)
13943{
11fdf7f2 13944 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13945}
13946
13947bool Client::ms_handle_reset(Connection *con)
13948{
11fdf7f2 13949 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13950 return false;
13951}
13952
13953void Client::ms_handle_remote_reset(Connection *con)
13954{
11fdf7f2
TL
13955 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
13956 std::lock_guard l(client_lock);
7c673cae
FG
13957 switch (con->get_peer_type()) {
13958 case CEPH_ENTITY_TYPE_MDS:
13959 {
13960 // kludge to figure out which mds this is; fixme with a Connection* state
13961 mds_rank_t mds = MDS_RANK_NONE;
13962 MetaSession *s = NULL;
11fdf7f2
TL
13963 for (auto &p : mds_sessions) {
13964 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
13965 mds = p.first;
13966 s = &p.second;
7c673cae
FG
13967 }
13968 }
13969 if (mds >= 0) {
d2e6a577 13970 assert (s != NULL);
7c673cae
FG
13971 switch (s->state) {
13972 case MetaSession::STATE_CLOSING:
13973 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13974 _closed_mds_session(s);
13975 break;
13976
13977 case MetaSession::STATE_OPENING:
13978 {
13979 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13980 list<Context*> waiters;
13981 waiters.swap(s->waiting_for_open);
13982 _closed_mds_session(s);
13983 MetaSession *news = _get_or_open_mds_session(mds);
13984 news->waiting_for_open.swap(waiters);
13985 }
13986 break;
13987
13988 case MetaSession::STATE_OPEN:
13989 {
28e407b8 13990 objecter->maybe_request_map(); /* to check if we are blacklisted */
11fdf7f2 13991 const auto& conf = cct->_conf;
7c673cae
FG
13992 if (conf->client_reconnect_stale) {
13993 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13994 _closed_mds_session(s);
13995 } else {
13996 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13997 s->state = MetaSession::STATE_STALE;
13998 }
13999 }
14000 break;
14001
14002 case MetaSession::STATE_NEW:
14003 case MetaSession::STATE_CLOSED:
14004 default:
14005 break;
14006 }
14007 }
14008 }
14009 break;
14010 }
14011}
14012
14013bool Client::ms_handle_refused(Connection *con)
14014{
11fdf7f2 14015 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
14016 return false;
14017}
14018
11fdf7f2 14019bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
14020{
14021 if (dest_type == CEPH_ENTITY_TYPE_MON)
14022 return true;
14023 *authorizer = monclient->build_authorizer(dest_type);
14024 return true;
14025}
14026
14027Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14028{
11fdf7f2
TL
14029 Inode *quota_in = root_ancestor;
14030 SnapRealm *realm = in->snaprealm;
14031 while (realm) {
14032 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14033 if (realm->ino != in->ino) {
14034 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14035 if (p == inode_map.end())
14036 break;
7c673cae 14037
11fdf7f2
TL
14038 if (p->second->quota.is_enable()) {
14039 quota_in = p->second;
14040 break;
7c673cae 14041 }
7c673cae 14042 }
11fdf7f2 14043 realm = realm->pparent;
7c673cae 14044 }
11fdf7f2
TL
14045 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14046 return quota_in;
7c673cae
FG
14047}
14048
14049/**
14050 * Traverse quota ancestors of the Inode, return true
14051 * if any of them passes the passed function
14052 */
14053bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14054 std::function<bool (const Inode &in)> test)
14055{
14056 while (true) {
11fdf7f2 14057 ceph_assert(in != NULL);
7c673cae
FG
14058 if (test(*in)) {
14059 return true;
14060 }
14061
14062 if (in == root_ancestor) {
14063 // We're done traversing, drop out
14064 return false;
14065 } else {
14066 // Continue up the tree
14067 in = get_quota_root(in, perms);
14068 }
14069 }
14070
14071 return false;
14072}
14073
14074bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14075{
14076 return check_quota_condition(in, perms,
14077 [](const Inode &in) {
14078 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14079 });
14080}
14081
14082bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14083 const UserPerm& perms)
7c673cae
FG
14084{
14085 return check_quota_condition(in, perms,
11fdf7f2 14086 [&new_bytes](const Inode &in) {
7c673cae
FG
14087 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14088 > in.quota.max_bytes;
14089 });
14090}
14091
11fdf7f2 14092bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 14093{
11fdf7f2
TL
14094 return check_quota_condition(in, perms,
14095 [](const Inode &in) {
14096 if (in.quota.max_bytes) {
14097 if (in.rstat.rbytes >= in.quota.max_bytes) {
14098 return true;
14099 }
14100
14101 ceph_assert(in.size >= in.reported_size);
14102 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14103 const uint64_t size = in.size - in.reported_size;
14104 return (space >> 4) < size;
14105 } else {
14106 return false;
14107 }
14108 });
7c673cae
FG
14109}
14110
// Per-(pool, namespace) permission probe states cached in pool_perms
// and consumed by check_pool_perm().
enum {
  POOL_CHECKED = 1,   // probe finished; POOL_READ/POOL_WRITE bits valid
  POOL_CHECKING = 2,  // probe in flight; other callers wait on waiting_for_pool_perm
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14117
/*
 * Check whether this client may read and/or write (per the
 * CEPH_CAP_FILE_RD/WR bits in 'need') the data pool of inode 'in'.
 * Results are probed once per (pool id, pool namespace) pair by issuing
 * a read op and a create op against the inode's first object, then
 * cached in pool_perms.  Concurrent callers for the same pool wait on
 * waiting_for_pool_perm instead of probing again.
 *
 * Returns 0 if the needed access is allowed, -EPERM if denied, and
 * -EIO if the probe itself failed for an unexpected reason.
 * Expects client_lock to be held; it is dropped around the probe waits.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is probing this pool, sleep
  // until it finishes and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe as in flight so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    // First object of the file: "<ino in hex>.00000000".
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat op; -ENOENT still proves read access.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create; -EEXIST still proves write access.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while waiting on the OSD round trips.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Publish the result and wake anyone blocked on POOL_CHECKING.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14220
14221int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14222{
14223 if (acl_type == POSIX_ACL) {
14224 if (in->xattrs.count(ACL_EA_ACCESS)) {
14225 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14226
14227 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14228 }
14229 }
14230 return -EAGAIN;
14231}
14232
/*
 * Keep the cached POSIX access ACL consistent with a new file mode
 * (chmod): rewrite the ACL's mode-derived entries and push the updated
 * xattr back.  No-op when ACLs are disabled or no access ACL exists.
 * Returns 0 on success or a negative errno.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Refresh xattrs first (force fetch if we never had an xattr version)
  // so we operate on a current ACL.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a private copy: posix_acl_access_chmod edits in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // No access ACL cached: nothing to rewrite.
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14258
/*
 * Compute the xattrs a newly created file/dir should inherit from the
 * parent directory's default POSIX ACL, adjusting *mode accordingly.
 * On success the inherited xattrs (if any) are encoded into xattrs_bl
 * and the number of xattrs is returned; 0 means no ACL to inherit
 * (symlinks and NO_ACL mode always return 0).  Negative errno on error.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Refresh the parent's xattrs so we see its current default ACL.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Copy the default ACL and fold the requested mode into it.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0: the inherited ACL is non-trivial; keep it as the
	// access ACL unless it is exactly equivalent to the mode bits.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via the callback,
      // if one is registered.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14306
14307void Client::set_filer_flags(int flags)
14308{
11fdf7f2
TL
14309 std::lock_guard l(client_lock);
14310 ceph_assert(flags == 0 ||
7c673cae
FG
14311 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14312 objecter->add_global_op_flags(flags);
14313}
14314
14315void Client::clear_filer_flags(int flags)
14316{
11fdf7f2
TL
14317 std::lock_guard l(client_lock);
14318 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
14319 objecter->clear_global_op_flag(flags);
14320}
14321
11fdf7f2
TL
14322// called before mount
14323void Client::set_uuid(const std::string& uuid)
14324{
14325 std::lock_guard l(client_lock);
14326 assert(initialized);
14327 assert(!uuid.empty());
14328
14329 metadata["uuid"] = uuid;
14330 _close_sessions();
14331}
14332
14333// called before mount. 0 means infinite
14334void Client::set_session_timeout(unsigned timeout)
14335{
14336 std::lock_guard l(client_lock);
14337 assert(initialized);
14338
14339 metadata["timeout"] = stringify(timeout);
14340}
14341
/*
 * called before mount.
 *
 * Reclaim the state of a previous client instance identified by 'uuid'
 * (see MClientReclaim).  Walks every in-map MDS rank, opening a session
 * where needed, and sends a reclaim request to each until all ranks
 * have answered.  Returns 0 on success or a negative errno:
 *  -ENOTCONN   client not initialized
 *  -EINVAL     empty uuid, uuid equals our own, or session vanished
 *  -EPERM      an MDS rejected our session
 *  -EOPNOTSUPP an MDS lacks the reclaim-client feature
 *  -ENOENT     CEPH_RECLAIM_RESET and no target session found anywhere
 *  -ENOTRECOVERABLE  reclaim failed or the target was blacklisted
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // Refuse to reclaim ourselves.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Need a usable mdsmap before we can enumerate ranks.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: 'mds' only advances once that rank's reclaim has concluded;
  // every other branch waits and retries the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      // Woken by handle_client_reclaim_reply().
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // GNU "?:" extension: return reclaim_errno when non-zero.
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // Remembered until finish_reclaim() promotes it to "uuid".
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14440
/*
 * Complete a reclaim started by start_reclaim(): reset every session's
 * reclaim state, tell each MDS the reclaim is finished, and adopt the
 * reclaimed uuid as our own.  If no reclaim was in progress (no
 * "reclaiming_uuid" metadata), only the state reset is performed.
 */
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // Take over the reclaimed identity.
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14459
/*
 * Handle an MDS reply to a reclaim request sent by start_reclaim().
 * Records the per-session outcome (and, on success, the target's
 * addresses and the OSD epoch to wait for), then wakes the thread
 * blocked in start_reclaim() on waiting_for_reclaim.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // Track the highest epoch / latest non-empty addrs across replies.
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14484
7c673cae
FG
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  // Simple latch: later cap releases will carry this epoch.
  cap_epoch_barrier = e;
}
14497
// Config-observer interface: the option names whose runtime changes we
// want handle_conf_change() to be notified about.  NULL-terminated.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14510
// Config-observer callback: react to runtime changes of the options
// listed in get_tracked_conf_keys().
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // Any value other than "posix_acl" (including empty) disables ACLs.
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  // NOTE(review): "client_cache_size" and the client_deleg_* keys are
  // tracked but not handled here -- presumably consumed elsewhere;
  // confirm against the rest of the file.
}
14525
7c673cae
FG
// boost::intrusive_ptr hook: take a reference on the Inode itself.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14530
// boost::intrusive_ptr hook: drop a reference via the owning client,
// which also handles freeing the inode on last put.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14535
14536mds_rank_t Client::_get_random_up_mds() const
14537{
11fdf7f2 14538 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
14539
14540 std::set<mds_rank_t> up;
14541 mdsmap->get_up_mds_set(up);
14542
14543 if (up.empty())
14544 return MDS_RANK_NONE;
14545 std::set<mds_rank_t>::const_iterator p = up.begin();
14546 for (int n = rand() % up.size(); n; n--)
14547 ++p;
14548 return *p;
14549}
14550
14551
// A standalone client owns its own Objecter (constructed here) instead
// of sharing one supplied by a host process.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14558
StandaloneClient::~StandaloneClient()
{
  // We created the objecter in our constructor, so we delete it here;
  // null it out so base-class teardown cannot touch a dangling pointer.
  delete objecter;
  objecter = nullptr;
}
14564
/*
 * Bring up a standalone client: timer, object cacher, objecter,
 * dispatchers, and the monitor client.  Returns 0 on success or the
 * monclient's error, after unwinding the partially-initialized pieces.
 */
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (note the unlock happens between timer and objecter shutdown).
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
14595
// Tear down in reverse of init(): generic client shutdown first, then
// the objecter and monclient we own in standalone mode.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}