]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update download target update for octopus release
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
73#include "common/Mutex.h"
74#include "common/perf_counters.h"
75#include "common/admin_socket.h"
76#include "common/errno.h"
77#include "include/str_list.h"
78
79#define dout_subsys ceph_subsys_client
80
81#include "include/lru.h"
82#include "include/compat.h"
83#include "include/stringify.h"
84
85#include "Client.h"
86#include "Inode.h"
87#include "Dentry.h"
b32b8144 88#include "Delegation.h"
7c673cae
FG
89#include "Dir.h"
90#include "ClientSnapRealm.h"
91#include "Fh.h"
92#include "MetaSession.h"
93#include "MetaRequest.h"
94#include "ObjecterWriteback.h"
95#include "posix_acl.h"
96
11fdf7f2 97#include "include/ceph_assert.h"
7c673cae
FG
98#include "include/stat.h"
99
100#include "include/cephfs/ceph_statx.h"
101
102#if HAVE_GETGROUPLIST
103#include <grp.h>
104#include <pwd.h>
105#include <unistd.h>
106#endif
107
108#undef dout_prefix
109#define dout_prefix *_dout << "client." << whoami << " "
110
111#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112
113// FreeBSD fails to define this
114#ifndef O_DSYNC
115#define O_DSYNC 0x0
116#endif
117// Darwin fails to define this
118#ifndef O_RSYNC
119#define O_RSYNC 0x0
120#endif
121
122#ifndef O_DIRECT
123#define O_DIRECT 0x0
124#endif
125
126#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129{
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132}
133
134
135// -------------
136
137Client::CommandHook::CommandHook(Client *client) :
138 m_client(client)
139{
140}
141
11fdf7f2
TL
142bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
7c673cae 145{
11fdf7f2 146 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae
FG
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
11fdf7f2 150 m_client->dump_mds_requests(f.get());
7c673cae 151 else if (command == "mds_sessions")
11fdf7f2 152 m_client->dump_mds_sessions(f.get());
7c673cae 153 else if (command == "dump_cache")
11fdf7f2 154 m_client->dump_cache(f.get());
7c673cae
FG
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
11fdf7f2 158 m_client->dump_status(f.get());
7c673cae 159 else
11fdf7f2 160 ceph_abort_msg("bad command registered");
7c673cae
FG
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
7c673cae
FG
164 return true;
165}
166
167
168// -------------
169
170dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
171 : inode(in), offset(0), next_offset(2),
172 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
173 perms(perms)
174 { }
175
176void Client::_reset_faked_inos()
177{
178 ino_t start = 1024;
179 free_faked_inos.clear();
180 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
181 last_used_faked_ino = 0;
11fdf7f2 182 last_used_faked_root = 0;
7c673cae
FG
183 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
184}
185
186void Client::_assign_faked_ino(Inode *in)
187{
11fdf7f2
TL
188 if (0 == last_used_faked_ino)
189 last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
7c673cae
FG
190 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
191 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
11fdf7f2 192 last_used_faked_ino = 2048;
7c673cae
FG
193 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
194 }
11fdf7f2 195 ceph_assert(it != free_faked_inos.end());
7c673cae 196 if (last_used_faked_ino < it.get_start()) {
11fdf7f2 197 ceph_assert(it.get_len() > 0);
7c673cae
FG
198 last_used_faked_ino = it.get_start();
199 } else {
200 ++last_used_faked_ino;
11fdf7f2 201 ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
7c673cae
FG
202 }
203 in->faked_ino = last_used_faked_ino;
204 free_faked_inos.erase(in->faked_ino);
205 faked_ino_map[in->faked_ino] = in->vino();
206}
207
11fdf7f2
TL
/*
 * In faked-inode mode, if multiple subdirectories are exported they
 * would otherwise all present the same inode number for their root.
 * We therefore distinguish mount points by reserving the fake-ino
 * range 1024~2048 and combining it with the low 10 bits (0x3ff) of
 * each root inode's real inode number.
 */
215void Client::_assign_faked_root(Inode *in)
216{
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232}
233
7c673cae
FG
234void Client::_release_faked_ino(Inode *in)
235{
236 free_faked_inos.insert(in->faked_ino);
237 faked_ino_map.erase(in->faked_ino);
238}
239
240vinodeno_t Client::_map_faked_ino(ino_t ino)
241{
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
250 return vino;
251}
252
253vinodeno_t Client::map_faked_ino(ino_t ino)
254{
11fdf7f2 255 std::lock_guard lock(client_lock);
7c673cae
FG
256 return _map_faked_ino(ino);
257}
258
259// cons/des
260
261Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
262 : Dispatcher(m->cct),
7c673cae 263 timer(m->cct, client_lock),
11fdf7f2
TL
264 client_lock("Client::client_lock"),
265 messenger(m),
266 monclient(mc),
267 objecter(objecter_),
268 whoami(mc->get_global_id()),
7c673cae
FG
269 async_ino_invalidator(m->cct),
270 async_dentry_invalidator(m->cct),
271 interrupt_finisher(m->cct),
272 remount_finisher(m->cct),
273 objecter_finisher(m->cct),
11fdf7f2
TL
274 m_command_hook(this),
275 fscid(0)
7c673cae
FG
276{
277 _reset_faked_inos();
7c673cae 278
7c673cae
FG
279 user_id = cct->_conf->client_mount_uid;
280 group_id = cct->_conf->client_mount_gid;
92f5a8d4
TL
281 fuse_default_permissions = cct->_conf.get_val<bool>(
282 "fuse_default_permissions");
7c673cae 283
7c673cae
FG
284 if (cct->_conf->client_acl_type == "posix_acl")
285 acl_type = POSIX_ACL;
286
7c673cae
FG
287 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
288
289 // file handles
290 free_fd_set.insert(10, 1<<30);
291
292 mdsmap.reset(new MDSMap);
293
294 // osd interfaces
295 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
296 &client_lock));
297 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
298 client_flush_set_callback, // all commit callback
299 (void*)this,
300 cct->_conf->client_oc_size,
301 cct->_conf->client_oc_max_objects,
302 cct->_conf->client_oc_max_dirty,
303 cct->_conf->client_oc_target_dirty,
304 cct->_conf->client_oc_max_dirty_age,
305 true));
306 objecter_finisher.start();
307 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 308 objecter->enable_blacklist_events();
7c673cae
FG
309}
310
311
312Client::~Client()
313{
11fdf7f2 314 ceph_assert(!client_lock.is_locked());
7c673cae 315
31f18b77
FG
316 // It is necessary to hold client_lock, because any inode destruction
317 // may call into ObjectCacher, which asserts that it's lock (which is
318 // client_lock) is held.
319 client_lock.Lock();
7c673cae 320 tear_down_cache();
31f18b77 321 client_lock.Unlock();
7c673cae
FG
322}
323
324void Client::tear_down_cache()
325{
326 // fd's
327 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
328 it != fd_map.end();
329 ++it) {
330 Fh *fh = it->second;
11fdf7f2 331 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
7c673cae
FG
332 _release_fh(fh);
333 }
334 fd_map.clear();
335
336 while (!opened_dirs.empty()) {
337 dir_result_t *dirp = *opened_dirs.begin();
11fdf7f2 338 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
7c673cae
FG
339 _closedir(dirp);
340 }
341
342 // caps!
343 // *** FIXME ***
344
345 // empty lru
7c673cae 346 trim_cache();
11fdf7f2 347 ceph_assert(lru.lru_get_size() == 0);
7c673cae
FG
348
349 // close root ino
11fdf7f2 350 ceph_assert(inode_map.size() <= 1 + root_parents.size());
7c673cae
FG
351 if (root && inode_map.size() == 1 + root_parents.size()) {
352 delete root;
353 root = 0;
354 root_ancestor = 0;
355 while (!root_parents.empty())
356 root_parents.erase(root_parents.begin());
357 inode_map.clear();
358 _reset_faked_inos();
359 }
360
11fdf7f2 361 ceph_assert(inode_map.empty());
7c673cae
FG
362}
363
364inodeno_t Client::get_root_ino()
365{
11fdf7f2 366 std::lock_guard l(client_lock);
7c673cae
FG
367 if (use_faked_inos())
368 return root->faked_ino;
369 else
370 return root->ino;
371}
372
373Inode *Client::get_root()
374{
11fdf7f2 375 std::lock_guard l(client_lock);
7c673cae
FG
376 root->ll_get();
377 return root;
378}
379
380
381// debug crapola
382
383void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
384{
385 filepath path;
386 in->make_long_path(path);
387 ldout(cct, 1) << "dump_inode: "
388 << (disconnected ? "DISCONNECTED ":"")
389 << "inode " << in->ino
390 << " " << path
391 << " ref " << in->get_num_ref()
392 << *in << dendl;
393
394 if (f) {
395 f->open_object_section("inode");
396 f->dump_stream("path") << path;
397 if (disconnected)
398 f->dump_int("disconnected", 1);
399 in->dump(f);
400 f->close_section();
401 }
402
403 did.insert(in);
404 if (in->dir) {
405 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
406 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
407 it != in->dir->dentries.end();
408 ++it) {
409 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
410 if (f) {
411 f->open_object_section("dentry");
412 it->second->dump(f);
413 f->close_section();
414 }
415 if (it->second->inode)
416 dump_inode(f, it->second->inode.get(), did, false);
417 }
418 }
419}
420
421void Client::dump_cache(Formatter *f)
422{
423 set<Inode*> did;
424
11fdf7f2 425 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
426
427 if (f)
428 f->open_array_section("cache");
429
430 if (root)
431 dump_inode(f, root, did, true);
432
433 // make a second pass to catch anything disconnected
434 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
435 it != inode_map.end();
436 ++it) {
437 if (did.count(it->second))
438 continue;
439 dump_inode(f, it->second, did, true);
440 }
441
442 if (f)
443 f->close_section();
444}
445
446void Client::dump_status(Formatter *f)
447{
11fdf7f2 448 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
449
450 ldout(cct, 1) << __func__ << dendl;
451
452 const epoch_t osd_epoch
453 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
454
455 if (f) {
456 f->open_object_section("metadata");
457 for (const auto& kv : metadata)
458 f->dump_string(kv.first.c_str(), kv.second);
459 f->close_section();
460
461 f->dump_int("dentry_count", lru.lru_get_size());
462 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
463 f->dump_int("id", get_nodeid().v);
11fdf7f2 464 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230 465 f->dump_object("inst", inst);
11fdf7f2
TL
466 f->dump_object("addr", inst.addr);
467 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
468 f->dump_string("addr_str", inst.addr.get_legacy_str());
7c673cae
FG
469 f->dump_int("inode_count", inode_map.size());
470 f->dump_int("mds_epoch", mdsmap->get_epoch());
471 f->dump_int("osd_epoch", osd_epoch);
472 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
f64942e4 473 f->dump_bool("blacklisted", blacklisted);
7c673cae
FG
474 }
475}
476
477int Client::init()
478{
479 timer.init();
480 objectcacher->start();
481
482 client_lock.Lock();
11fdf7f2 483 ceph_assert(!initialized);
7c673cae
FG
484
485 messenger->add_dispatcher_tail(this);
486 client_lock.Unlock();
487
488 _finish_init();
489 return 0;
490}
491
492void Client::_finish_init()
493{
494 client_lock.Lock();
495 // logger
496 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
497 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
498 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
499 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
11fdf7f2
TL
500 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
501 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
7c673cae
FG
502 logger.reset(plb.create_perf_counters());
503 cct->get_perfcounters_collection()->add(logger.get());
504
505 client_lock.Unlock();
506
11fdf7f2 507 cct->_conf.add_observer(this);
7c673cae
FG
508
509 AdminSocket* admin_socket = cct->get_admin_socket();
510 int ret = admin_socket->register_command("mds_requests",
511 "mds_requests",
512 &m_command_hook,
513 "show in-progress mds requests");
514 if (ret < 0) {
515 lderr(cct) << "error registering admin socket command: "
516 << cpp_strerror(-ret) << dendl;
517 }
518 ret = admin_socket->register_command("mds_sessions",
519 "mds_sessions",
520 &m_command_hook,
521 "show mds session state");
522 if (ret < 0) {
523 lderr(cct) << "error registering admin socket command: "
524 << cpp_strerror(-ret) << dendl;
525 }
526 ret = admin_socket->register_command("dump_cache",
527 "dump_cache",
528 &m_command_hook,
529 "show in-memory metadata cache contents");
530 if (ret < 0) {
531 lderr(cct) << "error registering admin socket command: "
532 << cpp_strerror(-ret) << dendl;
533 }
534 ret = admin_socket->register_command("kick_stale_sessions",
535 "kick_stale_sessions",
536 &m_command_hook,
537 "kick sessions that were remote reset");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542 ret = admin_socket->register_command("status",
543 "status",
544 &m_command_hook,
545 "show overall client status");
546 if (ret < 0) {
547 lderr(cct) << "error registering admin socket command: "
548 << cpp_strerror(-ret) << dendl;
549 }
550
551 client_lock.Lock();
552 initialized = true;
553 client_lock.Unlock();
554}
555
556void Client::shutdown()
557{
11fdf7f2 558 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
559
560 // If we were not mounted, but were being used for sending
561 // MDS commands, we may have sessions that need closing.
562 client_lock.Lock();
563 _close_sessions();
564 client_lock.Unlock();
565
11fdf7f2 566 cct->_conf.remove_observer(this);
7c673cae 567
11fdf7f2 568 cct->get_admin_socket()->unregister_commands(&m_command_hook);
7c673cae
FG
569
570 if (ino_invalidate_cb) {
571 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
572 async_ino_invalidator.wait_for_empty();
573 async_ino_invalidator.stop();
574 }
575
576 if (dentry_invalidate_cb) {
577 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
578 async_dentry_invalidator.wait_for_empty();
579 async_dentry_invalidator.stop();
580 }
581
582 if (switch_interrupt_cb) {
583 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
584 interrupt_finisher.wait_for_empty();
585 interrupt_finisher.stop();
586 }
587
588 if (remount_cb) {
589 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
590 remount_finisher.wait_for_empty();
591 remount_finisher.stop();
592 }
593
594 objectcacher->stop(); // outside of client_lock! this does a join.
595
596 client_lock.Lock();
11fdf7f2 597 ceph_assert(initialized);
7c673cae
FG
598 initialized = false;
599 timer.shutdown();
600 client_lock.Unlock();
601
602 objecter_finisher.wait_for_empty();
603 objecter_finisher.stop();
604
605 if (logger) {
606 cct->get_perfcounters_collection()->remove(logger.get());
607 logger.reset();
608 }
609}
610
611
612// ===================
613// metadata cache stuff
614
615void Client::trim_cache(bool trim_kernel_dcache)
616{
181888fb
FG
617 uint64_t max = cct->_conf->client_cache_size;
618 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
619 unsigned last = 0;
620 while (lru.lru_get_size() != last) {
621 last = lru.lru_get_size();
622
181888fb 623 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
624
625 // trim!
31f18b77 626 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
627 if (!dn)
628 break; // done
629
630 trim_dentry(dn);
631 }
632
181888fb 633 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
634 _invalidate_kernel_dcache();
635
636 // hose root?
637 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
638 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
639 delete root;
640 root = 0;
641 root_ancestor = 0;
642 while (!root_parents.empty())
643 root_parents.erase(root_parents.begin());
644 inode_map.clear();
645 _reset_faked_inos();
646 }
647}
648
649void Client::trim_cache_for_reconnect(MetaSession *s)
650{
651 mds_rank_t mds = s->mds_num;
11fdf7f2 652 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
653
654 int trimmed = 0;
655 list<Dentry*> skipped;
656 while (lru.lru_get_size() > 0) {
657 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
658 if (!dn)
659 break;
660
661 if ((dn->inode && dn->inode->caps.count(mds)) ||
662 dn->dir->parent_inode->caps.count(mds)) {
663 trim_dentry(dn);
664 trimmed++;
665 } else
666 skipped.push_back(dn);
667 }
668
669 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
670 lru.lru_insert_mid(*p);
671
11fdf7f2 672 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
673 << " trimmed " << trimmed << " dentries" << dendl;
674
675 if (s->caps.size() > 0)
676 _invalidate_kernel_dcache();
677}
678
679void Client::trim_dentry(Dentry *dn)
680{
681 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
682 << " in dir "
683 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
684 << dendl;
685 if (dn->inode) {
686 Inode *diri = dn->dir->parent_inode;
687 diri->dir_release_count++;
688 clear_dir_complete_and_ordered(diri, true);
689 }
690 unlink(dn, false, false); // drop dir, drop dentry
691}
692
693
1adf2230
AA
694void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
695 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 696{
7c673cae
FG
697 uint64_t prior_size = in->size;
698
7c673cae
FG
699 if (truncate_seq > in->truncate_seq ||
700 (truncate_seq == in->truncate_seq && size > in->size)) {
701 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
702 in->size = size;
703 in->reported_size = size;
704 if (truncate_seq != in->truncate_seq) {
705 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
706 << truncate_seq << dendl;
707 in->truncate_seq = truncate_seq;
708 in->oset.truncate_seq = truncate_seq;
709
710 // truncate cached file data
711 if (prior_size > size) {
712 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
713 }
714 }
715
716 // truncate inline data
717 if (in->inline_version < CEPH_INLINE_NONE) {
718 uint32_t len = in->inline_data.length();
719 if (size < len)
720 in->inline_data.splice(size, len - size);
721 }
722 }
723 if (truncate_seq >= in->truncate_seq &&
724 in->truncate_size != truncate_size) {
725 if (in->is_file()) {
726 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
727 << truncate_size << dendl;
728 in->truncate_size = truncate_size;
729 in->oset.truncate_size = truncate_size;
730 } else {
731 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
732 }
733 }
1adf2230
AA
734}
735
736void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
737 utime_t ctime, utime_t mtime, utime_t atime)
738{
739 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
740 << " ctime " << ctime << " mtime " << mtime << dendl;
741
742 if (time_warp_seq > in->time_warp_seq)
743 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
744 << " is higher than local time_warp_seq "
745 << in->time_warp_seq << dendl;
746
747 int warn = false;
7c673cae
FG
748 // be careful with size, mtime, atime
749 if (issued & (CEPH_CAP_FILE_EXCL|
750 CEPH_CAP_FILE_WR|
751 CEPH_CAP_FILE_BUFFER|
752 CEPH_CAP_AUTH_EXCL|
753 CEPH_CAP_XATTR_EXCL)) {
754 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
755 if (ctime > in->ctime)
756 in->ctime = ctime;
757 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
758 //the mds updated times, so take those!
759 in->mtime = mtime;
760 in->atime = atime;
761 in->time_warp_seq = time_warp_seq;
762 } else if (time_warp_seq == in->time_warp_seq) {
763 //take max times
764 if (mtime > in->mtime)
765 in->mtime = mtime;
766 if (atime > in->atime)
767 in->atime = atime;
768 } else if (issued & CEPH_CAP_FILE_EXCL) {
769 //ignore mds values as we have a higher seq
770 } else warn = true;
771 } else {
772 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
773 if (time_warp_seq >= in->time_warp_seq) {
774 in->ctime = ctime;
775 in->mtime = mtime;
776 in->atime = atime;
777 in->time_warp_seq = time_warp_seq;
778 } else warn = true;
779 }
780 if (warn) {
781 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
782 << time_warp_seq << " is lower than local time_warp_seq "
783 << in->time_warp_seq
784 << dendl;
785 }
786}
787
788void Client::_fragmap_remove_non_leaves(Inode *in)
789{
790 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
791 if (!in->dirfragtree.is_leaf(p->first))
792 in->fragmap.erase(p++);
793 else
794 ++p;
795}
796
797void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
798{
799 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
800 if (p->second == mds)
801 in->fragmap.erase(p++);
802 else
803 ++p;
804}
805
806Inode * Client::add_update_inode(InodeStat *st, utime_t from,
807 MetaSession *session,
808 const UserPerm& request_perms)
809{
810 Inode *in;
811 bool was_new = false;
812 if (inode_map.count(st->vino)) {
813 in = inode_map[st->vino];
11fdf7f2 814 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
815 } else {
816 in = new Inode(this, st->vino, &st->layout);
817 inode_map[st->vino] = in;
818
819 if (use_faked_inos())
820 _assign_faked_ino(in);
821
822 if (!root) {
823 root = in;
11fdf7f2
TL
824 if (use_faked_inos())
825 _assign_faked_root(root);
7c673cae
FG
826 root_ancestor = in;
827 cwd = root;
828 } else if (!mounted) {
829 root_parents[root_ancestor] = in;
830 root_ancestor = in;
831 }
832
833 // immutable bits
834 in->ino = st->vino.ino;
835 in->snapid = st->vino.snapid;
836 in->mode = st->mode & S_IFMT;
837 was_new = true;
838 }
839
840 in->rdev = st->rdev;
841 if (in->is_symlink())
842 in->symlink = st->symlink;
843
7c673cae 844 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
845 bool new_version = false;
846 if (in->version == 0 ||
847 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
848 (in->version & ~1) < st->version))
849 new_version = true;
7c673cae 850
1adf2230
AA
851 int issued;
852 in->caps_issued(&issued);
853 issued |= in->caps_dirty();
854 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 855
1adf2230
AA
856 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
857 !(issued & CEPH_CAP_AUTH_EXCL)) {
858 in->mode = st->mode;
859 in->uid = st->uid;
860 in->gid = st->gid;
861 in->btime = st->btime;
81eedcae 862 in->snap_btime = st->snap_btime;
1adf2230 863 }
7c673cae 864
1adf2230
AA
865 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
866 !(issued & CEPH_CAP_LINK_EXCL)) {
867 in->nlink = st->nlink;
868 }
7c673cae 869
1adf2230
AA
870 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
871 update_inode_file_time(in, issued, st->time_warp_seq,
872 st->ctime, st->mtime, st->atime);
873 }
7c673cae 874
1adf2230
AA
875 if (new_version ||
876 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 877 in->layout = st->layout;
1adf2230
AA
878 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
879 }
7c673cae 880
1adf2230
AA
881 if (in->is_dir()) {
882 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
883 in->dirstat = st->dirstat;
884 }
885 // dir_layout/rstat/quota are not tracked by capability, update them only if
886 // the inode stat is from auth mds
887 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
888 in->dir_layout = st->dir_layout;
889 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
890 in->rstat = st->rstat;
891 in->quota = st->quota;
11fdf7f2 892 in->dir_pin = st->dir_pin;
1adf2230
AA
893 }
894 // move me if/when version reflects fragtree changes.
895 if (in->dirfragtree != st->dirfragtree) {
896 in->dirfragtree = st->dirfragtree;
897 _fragmap_remove_non_leaves(in);
7c673cae 898 }
7c673cae
FG
899 }
900
901 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
902 st->xattrbl.length() &&
903 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
904 auto p = st->xattrbl.cbegin();
905 decode(in->xattrs, p);
7c673cae
FG
906 in->xattr_version = st->xattr_version;
907 }
908
1adf2230
AA
909 if (st->inline_version > in->inline_version) {
910 in->inline_data = st->inline_data;
911 in->inline_version = st->inline_version;
7c673cae
FG
912 }
913
1adf2230
AA
914 /* always take a newer change attr */
915 if (st->change_attr > in->change_attr)
916 in->change_attr = st->change_attr;
917
918 if (st->version > in->version)
919 in->version = st->version;
920
921 if (was_new)
922 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
923
924 if (!st->cap.caps)
925 return in; // as with readdir returning indoes in different snaprealms (no caps!)
926
7c673cae 927 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
928 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
929 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
930 st->cap.flags, request_perms);
28e407b8 931 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 932 in->max_size = st->max_size;
28e407b8
AA
933 in->rstat = st->rstat;
934 }
7c673cae 935
1adf2230
AA
936 // setting I_COMPLETE needs to happen after adding the cap
937 if (in->is_dir() &&
938 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
939 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
940 in->dirstat.nfiles == 0 &&
941 in->dirstat.nsubdirs == 0) {
942 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
943 in->flags |= I_COMPLETE | I_DIR_ORDERED;
944 if (in->dir) {
945 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
946 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
947 in->dir->readdir_cache.clear();
948 for (const auto& p : in->dir->dentries) {
949 unlink(p.second, true, true); // keep dir, keep dentry
950 }
951 if (in->dir->dentries.empty())
952 close_dir(in->dir);
7c673cae 953 }
7c673cae 954 }
1adf2230
AA
955 } else {
956 in->snap_caps |= st->cap.caps;
7c673cae
FG
957 }
958
959 return in;
960}
961
962
963/*
964 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
965 */
966Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
967 Inode *in, utime_t from, MetaSession *session,
968 Dentry *old_dentry)
969{
970 Dentry *dn = NULL;
971 if (dir->dentries.count(dname))
972 dn = dir->dentries[dname];
973
11fdf7f2 974 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
975 << " in dir " << dir->parent_inode->vino() << " dn " << dn
976 << dendl;
977
978 if (dn && dn->inode) {
979 if (dn->inode->vino() == in->vino()) {
980 touch_dn(dn);
981 ldout(cct, 12) << " had dentry " << dname
982 << " with correct vino " << dn->inode->vino()
983 << dendl;
984 } else {
985 ldout(cct, 12) << " had dentry " << dname
986 << " with WRONG vino " << dn->inode->vino()
987 << dendl;
988 unlink(dn, true, true); // keep dir, keep dentry
989 }
990 }
991
992 if (!dn || !dn->inode) {
993 InodeRef tmp_ref(in);
994 if (old_dentry) {
995 if (old_dentry->dir != dir) {
996 Inode *old_diri = old_dentry->dir->parent_inode;
997 old_diri->dir_ordered_count++;
998 clear_dir_complete_and_ordered(old_diri, false);
999 }
1000 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
1001 }
1002 Inode *diri = dir->parent_inode;
1003 diri->dir_ordered_count++;
1004 clear_dir_complete_and_ordered(diri, false);
1005 dn = link(dir, dname, in, dn);
1006 }
1007
1008 update_dentry_lease(dn, dlease, from, session);
1009 return dn;
1010}
1011
1012void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1013{
1014 utime_t dttl = from;
1015 dttl += (float)dlease->duration_ms / 1000.0;
1016
11fdf7f2 1017 ceph_assert(dn);
7c673cae
FG
1018
1019 if (dlease->mask & CEPH_LOCK_DN) {
1020 if (dttl > dn->lease_ttl) {
1021 ldout(cct, 10) << "got dentry lease on " << dn->name
1022 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1023 dn->lease_ttl = dttl;
1024 dn->lease_mds = session->mds_num;
1025 dn->lease_seq = dlease->seq;
1026 dn->lease_gen = session->cap_gen;
1027 }
1028 }
1029 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1030}
1031
1032
1033/*
1034 * update MDS location cache for a single inode
1035 */
1036void Client::update_dir_dist(Inode *in, DirStat *dst)
1037{
1038 // auth
1039 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1040 if (dst->auth >= 0) {
1041 in->fragmap[dst->frag] = dst->auth;
1042 } else {
1043 in->fragmap.erase(dst->frag);
1044 }
1045 if (!in->dirfragtree.is_leaf(dst->frag)) {
1046 in->dirfragtree.force_to_leaf(cct, dst->frag);
1047 _fragmap_remove_non_leaves(in);
1048 }
1049
1050 // replicated
1051 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
7c673cae
FG
1052}
1053
1054void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1055{
1056 if (diri->flags & I_COMPLETE) {
1057 if (complete) {
1058 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1059 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1060 } else {
1061 if (diri->flags & I_DIR_ORDERED) {
1062 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1063 diri->flags &= ~I_DIR_ORDERED;
1064 }
1065 }
1066 if (diri->dir)
1067 diri->dir->readdir_cache.clear();
1068 }
1069}
1070
1071/*
1072 * insert results from readdir or lssnap into the metadata cache.
1073 */
1074void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1075
11fdf7f2 1076 auto& reply = request->reply;
7c673cae 1077 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1078 uint64_t features;
1079 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1080 features = (uint64_t)-1;
1081 }
1082 else {
1083 features = con->get_features();
1084 }
7c673cae
FG
1085
1086 dir_result_t *dirp = request->dirp;
11fdf7f2 1087 ceph_assert(dirp);
7c673cae
FG
1088
1089 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1090 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
1091 if (!p.end()) {
1092 // snapdir?
1093 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1094 ceph_assert(diri);
7c673cae
FG
1095 diri = open_snapdir(diri);
1096 }
1097
1098 // only open dir if we're actually adding stuff to it!
1099 Dir *dir = diri->open_dir();
11fdf7f2 1100 ceph_assert(dir);
7c673cae
FG
1101
1102 // dirstat
11fdf7f2 1103 DirStat dst(p, features);
7c673cae
FG
1104 __u32 numdn;
1105 __u16 flags;
11fdf7f2
TL
1106 decode(numdn, p);
1107 decode(flags, p);
7c673cae
FG
1108
1109 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1110 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1111
1112 frag_t fg = (unsigned)request->head.args.readdir.frag;
1113 unsigned readdir_offset = dirp->next_offset;
1114 string readdir_start = dirp->last_name;
11fdf7f2 1115 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
1116
1117 unsigned last_hash = 0;
1118 if (hash_order) {
1119 if (!readdir_start.empty()) {
1120 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1121 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1122 /* mds understands offset_hash */
1123 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1124 }
1125 }
1126
1127 if (fg != dst.frag) {
1128 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1129 fg = dst.frag;
1130 if (!hash_order) {
1131 readdir_offset = 2;
1132 readdir_start.clear();
1133 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1134 }
1135 }
1136
1137 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1138 << ", hash_order=" << hash_order
1139 << ", readdir_start " << readdir_start
1140 << ", last_hash " << last_hash
1141 << ", next_offset " << readdir_offset << dendl;
1142
1143 if (diri->snapid != CEPH_SNAPDIR &&
1144 fg.is_leftmost() && readdir_offset == 2 &&
1145 !(hash_order && last_hash)) {
1146 dirp->release_count = diri->dir_release_count;
1147 dirp->ordered_count = diri->dir_ordered_count;
1148 dirp->start_shared_gen = diri->shared_gen;
1149 dirp->cache_index = 0;
1150 }
1151
1152 dirp->buffer_frag = fg;
1153
1154 _readdir_drop_dirp_buffer(dirp);
1155 dirp->buffer.reserve(numdn);
1156
1157 string dname;
1158 LeaseStat dlease;
1159 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
1160 decode(dname, p);
1161 dlease.decode(p, features);
7c673cae
FG
1162 InodeStat ist(p, features);
1163
1164 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1165
1166 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1167 request->perms);
1168 Dentry *dn;
1169 if (diri->dir->dentries.count(dname)) {
1170 Dentry *olddn = diri->dir->dentries[dname];
1171 if (olddn->inode != in) {
1172 // replace incorrect dentry
1173 unlink(olddn, true, true); // keep dir, dentry
1174 dn = link(dir, dname, in, olddn);
11fdf7f2 1175 ceph_assert(dn == olddn);
7c673cae
FG
1176 } else {
1177 // keep existing dn
1178 dn = olddn;
1179 touch_dn(dn);
1180 }
1181 } else {
1182 // new dn
1183 dn = link(dir, dname, in, NULL);
1184 }
1185
1186 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1187 if (hash_order) {
1188 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1189 if (hash != last_hash)
1190 readdir_offset = 2;
1191 last_hash = hash;
1192 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1193 } else {
1194 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1195 }
1196 // add to readdir cache
1197 if (dirp->release_count == diri->dir_release_count &&
1198 dirp->ordered_count == diri->dir_ordered_count &&
1199 dirp->start_shared_gen == diri->shared_gen) {
1200 if (dirp->cache_index == dir->readdir_cache.size()) {
1201 if (i == 0) {
11fdf7f2 1202 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
1203 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1204 }
1205 dir->readdir_cache.push_back(dn);
1206 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1207 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1208 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
1209 else
1210 dir->readdir_cache[dirp->cache_index] = dn;
1211 } else {
11fdf7f2 1212 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
1213 }
1214 dirp->cache_index++;
1215 }
1216 // add to cached result list
1217 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1218 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1219 }
1220
1221 if (numdn > 0)
1222 dirp->last_name = dname;
1223 if (end)
1224 dirp->next_offset = 2;
1225 else
1226 dirp->next_offset = readdir_offset;
1227
1228 if (dir->is_empty())
1229 close_dir(dir);
1230 }
1231}
1232
1233/** insert_trace
1234 *
1235 * insert a trace from a MDS reply into the cache.
1236 */
1237Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1238{
11fdf7f2 1239 auto& reply = request->reply;
7c673cae
FG
1240 int op = request->get_op();
1241
1242 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1243 << " is_target=" << (int)reply->head.is_target
1244 << " is_dentry=" << (int)reply->head.is_dentry
1245 << dendl;
1246
11fdf7f2 1247 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
1248 if (request->got_unsafe) {
1249 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1250 ceph_assert(p.end());
7c673cae
FG
1251 return NULL;
1252 }
1253
1254 if (p.end()) {
1255 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1256
1257 Dentry *d = request->dentry();
1258 if (d) {
1259 Inode *diri = d->dir->parent_inode;
1260 diri->dir_release_count++;
1261 clear_dir_complete_and_ordered(diri, true);
1262 }
1263
1264 if (d && reply->get_result() == 0) {
1265 if (op == CEPH_MDS_OP_RENAME) {
1266 // rename
1267 Dentry *od = request->old_dentry();
1268 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1269 ceph_assert(od);
7c673cae
FG
1270 unlink(od, true, true); // keep dir, dentry
1271 } else if (op == CEPH_MDS_OP_RMDIR ||
1272 op == CEPH_MDS_OP_UNLINK) {
1273 // unlink, rmdir
1274 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1275 unlink(d, true, true); // keep dir, dentry
1276 }
1277 }
1278 return NULL;
1279 }
1280
1281 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1282 uint64_t features;
1283 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1284 features = (uint64_t)-1;
1285 }
1286 else {
1287 features = con->get_features();
1288 }
7c673cae
FG
1289 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1290
1291 // snap trace
1292 SnapRealm *realm = NULL;
1293 if (reply->snapbl.length())
1294 update_snap_trace(reply->snapbl, &realm);
1295
1296 ldout(cct, 10) << " hrm "
1297 << " is_target=" << (int)reply->head.is_target
1298 << " is_dentry=" << (int)reply->head.is_dentry
1299 << dendl;
1300
1301 InodeStat dirst;
1302 DirStat dst;
1303 string dname;
1304 LeaseStat dlease;
1305 InodeStat ist;
1306
1307 if (reply->head.is_dentry) {
1308 dirst.decode(p, features);
11fdf7f2
TL
1309 dst.decode(p, features);
1310 decode(dname, p);
1311 dlease.decode(p, features);
7c673cae
FG
1312 }
1313
1314 Inode *in = 0;
1315 if (reply->head.is_target) {
1316 ist.decode(p, features);
1317 if (cct->_conf->client_debug_getattr_caps) {
1318 unsigned wanted = 0;
1319 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1320 wanted = request->head.args.getattr.mask;
1321 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1322 wanted = request->head.args.open.mask;
1323
1324 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1325 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1326 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
1327 }
1328
1329 in = add_update_inode(&ist, request->sent_stamp, session,
1330 request->perms);
1331 }
1332
1333 Inode *diri = NULL;
1334 if (reply->head.is_dentry) {
1335 diri = add_update_inode(&dirst, request->sent_stamp, session,
1336 request->perms);
1337 update_dir_dist(diri, &dst); // dir stat info is attached to ..
1338
1339 if (in) {
1340 Dir *dir = diri->open_dir();
1341 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1342 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1343 } else {
1344 Dentry *dn = NULL;
1345 if (diri->dir && diri->dir->dentries.count(dname)) {
1346 dn = diri->dir->dentries[dname];
1347 if (dn->inode) {
1348 diri->dir_ordered_count++;
1349 clear_dir_complete_and_ordered(diri, false);
1350 unlink(dn, true, true); // keep dir, dentry
1351 }
1352 }
1353 if (dlease.duration_ms > 0) {
1354 if (!dn) {
1355 Dir *dir = diri->open_dir();
1356 dn = link(dir, dname, NULL, NULL);
1357 }
1358 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1359 }
1360 }
1361 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1362 op == CEPH_MDS_OP_MKSNAP) {
1363 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1364 // fake it for snap lookup
1365 vinodeno_t vino = ist.vino;
1366 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1367 ceph_assert(inode_map.count(vino));
7c673cae
FG
1368 diri = inode_map[vino];
1369
1370 string dname = request->path.last_dentry();
1371
1372 LeaseStat dlease;
1373 dlease.duration_ms = 0;
1374
1375 if (in) {
1376 Dir *dir = diri->open_dir();
1377 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1378 } else {
1379 if (diri->dir && diri->dir->dentries.count(dname)) {
1380 Dentry *dn = diri->dir->dentries[dname];
1381 if (dn->inode)
1382 unlink(dn, true, true); // keep dir, dentry
1383 }
1384 }
1385 }
1386
1387 if (in) {
1388 if (op == CEPH_MDS_OP_READDIR ||
1389 op == CEPH_MDS_OP_LSSNAP) {
1390 insert_readdir_results(request, session, in);
1391 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1392 // hack: return parent inode instead
1393 in = diri;
1394 }
1395
1396 if (request->dentry() == NULL && in != request->inode()) {
1397 // pin the target inode if its parent dentry is not pinned
1398 request->set_other_inode(in);
1399 }
1400 }
1401
1402 if (realm)
1403 put_snap_realm(realm);
1404
1405 request->target = in;
1406 return in;
1407}
1408
1409// -------
1410
1411mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1412{
1413 mds_rank_t mds = MDS_RANK_NONE;
1414 __u32 hash = 0;
1415 bool is_hash = false;
1416
1417 Inode *in = NULL;
1418 Dentry *de = NULL;
7c673cae
FG
1419
1420 if (req->resend_mds >= 0) {
1421 mds = req->resend_mds;
1422 req->resend_mds = -1;
11fdf7f2 1423 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
1424 goto out;
1425 }
1426
1427 if (cct->_conf->client_use_random_mds)
1428 goto random_mds;
1429
1430 in = req->inode();
1431 de = req->dentry();
1432 if (in) {
11fdf7f2 1433 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
1434 if (req->path.depth()) {
1435 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1436 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1437 << " on " << req->path[0]
1438 << " => " << hash << dendl;
1439 is_hash = true;
1440 }
1441 } else if (de) {
1442 if (de->inode) {
1443 in = de->inode.get();
11fdf7f2 1444 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
1445 } else {
1446 in = de->dir->parent_inode;
1447 hash = in->hash_dentry_name(de->name);
11fdf7f2 1448 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1449 << " on " << de->name
1450 << " => " << hash << dendl;
1451 is_hash = true;
1452 }
1453 }
1454 if (in) {
1455 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1456 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
1457 while (in->snapid != CEPH_NOSNAP) {
1458 if (in->snapid == CEPH_SNAPDIR)
1459 in = in->snapdir_parent.get();
11fdf7f2 1460 else if (!in->dentries.empty())
7c673cae
FG
1461 /* In most cases there will only be one dentry, so getting it
1462 * will be the correct action. If there are multiple hard links,
1463 * I think the MDS should be able to redirect as needed*/
1464 in = in->get_first_parent()->dir->parent_inode;
1465 else {
1466 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1467 break;
1468 }
1469 }
1470 is_hash = false;
1471 }
1472
11fdf7f2 1473 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
1474 << " hash=" << hash << dendl;
1475
1476 if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
1477 frag_t fg = in->dirfragtree[hash];
1478 if (in->fragmap.count(fg)) {
1479 mds = in->fragmap[fg];
1480 if (phash_diri)
1481 *phash_diri = in;
91327a77
AA
1482 } else if (in->auth_cap) {
1483 mds = in->auth_cap->session->mds_num;
1484 }
1485 if (mds >= 0) {
11fdf7f2 1486 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
1487 goto out;
1488 }
1489 }
1490
11fdf7f2
TL
1491 if (in->auth_cap && req->auth_is_best()) {
1492 mds = in->auth_cap->session->mds_num;
1493 } else if (!in->caps.empty()) {
1494 mds = in->caps.begin()->second.session->mds_num;
1495 } else {
7c673cae 1496 goto random_mds;
11fdf7f2
TL
1497 }
1498 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
1499
1500 goto out;
1501 }
1502
1503random_mds:
1504 if (mds < 0) {
1505 mds = _get_random_up_mds();
1506 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1507 }
1508
1509out:
1510 ldout(cct, 20) << "mds is " << mds << dendl;
1511 return mds;
1512}
1513
1514
1515void Client::connect_mds_targets(mds_rank_t mds)
1516{
11fdf7f2
TL
1517 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1518 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1519 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1520 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1521 q != info.export_targets.end();
1522 ++q) {
1523 if (mds_sessions.count(*q) == 0 &&
1524 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1525 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1526 << " export target mds." << *q << dendl;
1527 _open_mds_session(*q);
1528 }
1529 }
1530}
1531
1532void Client::dump_mds_sessions(Formatter *f)
1533{
1534 f->dump_int("id", get_nodeid().v);
11fdf7f2 1535 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1536 f->dump_object("inst", inst);
1537 f->dump_stream("inst_str") << inst;
1538 f->dump_stream("addr_str") << inst.addr;
7c673cae 1539 f->open_array_section("sessions");
11fdf7f2 1540 for (const auto &p : mds_sessions) {
7c673cae 1541 f->open_object_section("session");
11fdf7f2 1542 p.second.dump(f);
7c673cae
FG
1543 f->close_section();
1544 }
1545 f->close_section();
1546 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1547}
1548void Client::dump_mds_requests(Formatter *f)
1549{
1550 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1551 p != mds_requests.end();
1552 ++p) {
1553 f->open_object_section("request");
1554 p->second->dump(f);
1555 f->close_section();
1556 }
1557}
1558
1559int Client::verify_reply_trace(int r,
11fdf7f2 1560 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
1561 InodeRef *ptarget, bool *pcreated,
1562 const UserPerm& perms)
1563{
1564 // check whether this request actually did the create, and set created flag
1565 bufferlist extra_bl;
1566 inodeno_t created_ino;
1567 bool got_created_ino = false;
1568 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1569
11fdf7f2 1570 extra_bl = reply->get_extra_bl();
7c673cae
FG
1571 if (extra_bl.length() >= 8) {
1572 // if the extra bufferlist has a buffer, we assume its the created inode
1573 // and that this request to create succeeded in actually creating
1574 // the inode (won the race with other create requests)
11fdf7f2 1575 decode(created_ino, extra_bl);
7c673cae
FG
1576 got_created_ino = true;
1577 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
1578 }
1579
1580 if (pcreated)
1581 *pcreated = got_created_ino;
1582
1583 if (request->target) {
1584 *ptarget = request->target;
1585 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1586 } else {
1587 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1588 (*ptarget) = p->second;
1589 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1590 } else {
1591 // we got a traceless reply, and need to look up what we just
1592 // created. for now, do this by name. someday, do this by the
1593 // ino... which we know! FIXME.
1594 InodeRef target;
1595 Dentry *d = request->dentry();
1596 if (d) {
1597 if (d->dir) {
1598 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1599 << d->dir->parent_inode->ino << "/" << d->name
1600 << " got_ino " << got_created_ino
1601 << " ino " << created_ino
1602 << dendl;
1603 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1604 &target, perms);
1605 } else {
1606 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1607 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
1608 }
1609 } else {
1610 Inode *in = request->inode();
1611 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1612 << in->ino << dendl;
1613 r = _getattr(in, request->regetattr_mask, perms, true);
1614 target = in;
1615 }
1616 if (r >= 0) {
1617 // verify ino returned in reply and trace_dist are the same
1618 if (got_created_ino &&
1619 created_ino.val != target->ino.val) {
1620 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
1621 r = -EINTR;
1622 }
1623 if (ptarget)
1624 ptarget->swap(target);
1625 }
1626 }
1627 }
1628
1629 return r;
1630}
1631
1632
1633/**
1634 * make a request
1635 *
1636 * Blocking helper to make an MDS request.
1637 *
1638 * If the ptarget flag is set, behavior changes slightly: the caller
1639 * expects to get a pointer to the inode we are creating or operating
1640 * on. As a result, we will follow up any traceless mutation reply
1641 * with a getattr or lookup to transparently handle a traceless reply
1642 * from the MDS (as when the MDS restarts and the client has to replay
1643 * a request).
1644 *
1645 * @param request the MetaRequest to execute
1646 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1647 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1648 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1649 * @param use_mds [optional] prefer a specific mds (-1 for default)
1650 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1651 */
1652int Client::make_request(MetaRequest *request,
1653 const UserPerm& perms,
1654 InodeRef *ptarget, bool *pcreated,
1655 mds_rank_t use_mds,
1656 bufferlist *pdirbl)
1657{
1658 int r = 0;
1659
1660 // assign a unique tid
1661 ceph_tid_t tid = ++last_tid;
1662 request->set_tid(tid);
1663
1664 // and timestamp
1665 request->op_stamp = ceph_clock_now();
1666
1667 // make note
1668 mds_requests[tid] = request->get();
1669 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1670 oldest_tid = tid;
1671
1672 request->set_caller_perms(perms);
1673
1674 if (cct->_conf->client_inject_fixed_oldest_tid) {
1675 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1676 request->set_oldest_client_tid(1);
1677 } else {
1678 request->set_oldest_client_tid(oldest_tid);
1679 }
1680
1681 // hack target mds?
1682 if (use_mds >= 0)
1683 request->resend_mds = use_mds;
1684
1685 while (1) {
1686 if (request->aborted())
1687 break;
1688
31f18b77
FG
1689 if (blacklisted) {
1690 request->abort(-EBLACKLISTED);
1691 break;
1692 }
1693
7c673cae
FG
1694 // set up wait cond
1695 Cond caller_cond;
1696 request->caller_cond = &caller_cond;
1697
1698 // choose mds
1699 Inode *hash_diri = NULL;
1700 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1701 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1702 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1703 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1704 if (hash_diri) {
1705 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1706 _fragmap_remove_stopped_mds(hash_diri, mds);
1707 } else {
1708 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1709 request->resend_mds = _get_random_up_mds();
1710 }
1711 } else {
1712 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1713 wait_on_list(waiting_for_mdsmap);
1714 }
1715 continue;
1716 }
1717
1718 // open a session?
1719 MetaSession *session = NULL;
1720 if (!have_open_session(mds)) {
1721 session = _get_or_open_mds_session(mds);
1722
1723 // wait
1724 if (session->state == MetaSession::STATE_OPENING) {
1725 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1726 wait_on_context_list(session->waiting_for_open);
1727 // Abort requests on REJECT from MDS
1728 if (rejected_by_mds.count(mds)) {
1729 request->abort(-EPERM);
1730 break;
1731 }
1732 continue;
1733 }
1734
1735 if (!have_open_session(mds))
1736 continue;
1737 } else {
11fdf7f2 1738 session = &mds_sessions.at(mds);
7c673cae
FG
1739 }
1740
1741 // send request.
1742 send_request(request, session);
1743
1744 // wait for signal
1745 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1746 request->kick = false;
1747 while (!request->reply && // reply
1748 request->resend_mds < 0 && // forward
1749 !request->kick)
1750 caller_cond.Wait(client_lock);
1751 request->caller_cond = NULL;
1752
1753 // did we get a reply?
1754 if (request->reply)
1755 break;
1756 }
1757
1758 if (!request->reply) {
11fdf7f2
TL
1759 ceph_assert(request->aborted());
1760 ceph_assert(!request->got_unsafe);
7c673cae
FG
1761 r = request->get_abort_code();
1762 request->item.remove_myself();
1763 unregister_request(request);
11fdf7f2 1764 put_request(request);
7c673cae
FG
1765 return r;
1766 }
1767
1768 // got it!
11fdf7f2 1769 auto reply = std::move(request->reply);
7c673cae
FG
1770 r = reply->get_result();
1771 if (r >= 0)
1772 request->success = true;
1773
1774 // kick dispatcher (we've got it!)
11fdf7f2 1775 ceph_assert(request->dispatch_cond);
7c673cae
FG
1776 request->dispatch_cond->Signal();
1777 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1778 request->dispatch_cond = 0;
1779
1780 if (r >= 0 && ptarget)
1781 r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1782
1783 if (pdirbl)
11fdf7f2 1784 *pdirbl = reply->get_extra_bl();
7c673cae
FG
1785
1786 // -- log times --
1787 utime_t lat = ceph_clock_now();
1788 lat -= request->sent_stamp;
1789 ldout(cct, 20) << "lat " << lat << dendl;
1790 logger->tinc(l_c_lat, lat);
1791 logger->tinc(l_c_reply, lat);
1792
1793 put_request(request);
7c673cae
FG
1794 return r;
1795}
1796
1797void Client::unregister_request(MetaRequest *req)
1798{
1799 mds_requests.erase(req->tid);
1800 if (req->tid == oldest_tid) {
1801 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1802 while (true) {
1803 if (p == mds_requests.end()) {
1804 oldest_tid = 0;
1805 break;
1806 }
1807 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1808 oldest_tid = p->first;
1809 break;
1810 }
1811 ++p;
1812 }
1813 }
1814 put_request(req);
1815}
1816
1817void Client::put_request(MetaRequest *request)
1818{
1819 if (request->_put()) {
1820 int op = -1;
1821 if (request->success)
1822 op = request->get_op();
1823 InodeRef other_in;
1824 request->take_other_inode(&other_in);
1825 delete request;
1826
1827 if (other_in &&
1828 (op == CEPH_MDS_OP_RMDIR ||
1829 op == CEPH_MDS_OP_RENAME ||
1830 op == CEPH_MDS_OP_RMSNAP)) {
1831 _try_to_trim_inode(other_in.get(), false);
1832 }
1833 }
1834}
1835
1836int Client::encode_inode_release(Inode *in, MetaRequest *req,
1837 mds_rank_t mds, int drop,
1838 int unless, int force)
1839{
11fdf7f2 1840 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
7c673cae
FG
1841 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1842 << ", have:" << ", force:" << force << ")" << dendl;
1843 int released = 0;
11fdf7f2
TL
1844 auto it = in->caps.find(mds);
1845 if (it != in->caps.end()) {
1846 Cap &cap = it->second;
7c673cae 1847 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1848 if ((drop & cap.issued) &&
1849 !(unless & cap.issued)) {
1850 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
1851 cap.issued &= ~drop;
1852 cap.implemented &= ~drop;
7c673cae 1853 released = 1;
11fdf7f2 1854 ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
7c673cae
FG
1855 } else {
1856 released = force;
1857 }
1858 if (released) {
1859 ceph_mds_request_release rel;
1860 rel.ino = in->ino;
11fdf7f2
TL
1861 rel.cap_id = cap.cap_id;
1862 rel.seq = cap.seq;
1863 rel.issue_seq = cap.issue_seq;
1864 rel.mseq = cap.mseq;
1865 rel.caps = cap.implemented;
1866 rel.wanted = cap.wanted;
7c673cae
FG
1867 rel.dname_len = 0;
1868 rel.dname_seq = 0;
1869 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1870 }
1871 }
11fdf7f2 1872 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
1873 << released << dendl;
1874 return released;
1875}
1876
1877void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1878 mds_rank_t mds, int drop, int unless)
1879{
11fdf7f2 1880 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
1881 << dn << ")" << dendl;
1882 int released = 0;
1883 if (dn->dir)
1884 released = encode_inode_release(dn->dir->parent_inode, req,
1885 mds, drop, unless, 1);
1886 if (released && dn->lease_mds == mds) {
1887 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
11fdf7f2 1888 auto& rel = req->cap_releases.back();
7c673cae
FG
1889 rel.item.dname_len = dn->name.length();
1890 rel.item.dname_seq = dn->lease_seq;
1891 rel.dname = dn->name;
1892 }
11fdf7f2 1893 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
1894 << dn << ")" << dendl;
1895}
1896
1897
1898/*
1899 * This requires the MClientRequest *request member to be set.
1900 * It will error out horribly without one.
1901 * Additionally, if you set any *drop member, you'd better have
1902 * set the corresponding dentry!
1903 */
1904void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1905{
11fdf7f2 1906 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
1907 << req << ", mds: " << mds << ")" << dendl;
1908 if (req->inode_drop && req->inode())
1909 encode_inode_release(req->inode(), req,
1910 mds, req->inode_drop,
1911 req->inode_unless);
1912
1913 if (req->old_inode_drop && req->old_inode())
1914 encode_inode_release(req->old_inode(), req,
1915 mds, req->old_inode_drop,
1916 req->old_inode_unless);
1917 if (req->other_inode_drop && req->other_inode())
1918 encode_inode_release(req->other_inode(), req,
1919 mds, req->other_inode_drop,
1920 req->other_inode_unless);
1921
1922 if (req->dentry_drop && req->dentry())
1923 encode_dentry_release(req->dentry(), req,
1924 mds, req->dentry_drop,
1925 req->dentry_unless);
1926
1927 if (req->old_dentry_drop && req->old_dentry())
1928 encode_dentry_release(req->old_dentry(), req,
1929 mds, req->old_dentry_drop,
1930 req->old_dentry_unless);
11fdf7f2 1931 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
1932 << req << ", mds " << mds <<dendl;
1933}
1934
1935bool Client::have_open_session(mds_rank_t mds)
1936{
11fdf7f2
TL
1937 const auto &it = mds_sessions.find(mds);
1938 return it != mds_sessions.end() &&
1939 (it->second.state == MetaSession::STATE_OPEN ||
1940 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1941}
1942
1943MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1944{
11fdf7f2
TL
1945 const auto &it = mds_sessions.find(mds);
1946 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1947 return NULL;
11fdf7f2
TL
1948 } else {
1949 return &it->second;
1950 }
7c673cae
FG
1951}
1952
1953MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1954{
11fdf7f2
TL
1955 auto it = mds_sessions.find(mds);
1956 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1957}
1958
1959/**
1960 * Populate a map of strings with client-identifying metadata,
1961 * such as the hostname. Call this once at initialization.
1962 */
1963void Client::populate_metadata(const std::string &mount_root)
1964{
1965 // Hostname
1966 struct utsname u;
1967 int r = uname(&u);
1968 if (r >= 0) {
1969 metadata["hostname"] = u.nodename;
1970 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1971 } else {
1972 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1973 }
1974
1975 metadata["pid"] = stringify(getpid());
1976
1977 // Ceph entity id (the '0' in "client.0")
1978 metadata["entity_id"] = cct->_conf->name.get_id();
1979
1980 // Our mount position
1981 if (!mount_root.empty()) {
1982 metadata["root"] = mount_root;
1983 }
1984
1985 // Ceph version
1986 metadata["ceph_version"] = pretty_version_to_str();
1987 metadata["ceph_sha1"] = git_version_to_str();
1988
1989 // Apply any metadata from the user's configured overrides
1990 std::vector<std::string> tokens;
1991 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1992 for (const auto &i : tokens) {
1993 auto eqpos = i.find("=");
1994 // Throw out anything that isn't of the form "<str>=<str>"
1995 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1996 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1997 continue;
1998 }
1999 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2000 }
2001}
2002
2003/**
2004 * Optionally add or override client metadata fields.
2005 */
2006void Client::update_metadata(std::string const &k, std::string const &v)
2007{
11fdf7f2
TL
2008 std::lock_guard l(client_lock);
2009 ceph_assert(initialized);
7c673cae 2010
11fdf7f2
TL
2011 auto it = metadata.find(k);
2012 if (it != metadata.end()) {
7c673cae 2013 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2014 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2015 }
2016
2017 metadata[k] = v;
2018}
2019
// Create a new MetaSession for the given rank and (usually) send the
// SESSION_REQUEST_OPEN handshake.  Precondition: no session for this rank
// exists yet.  If this MDS instance previously REJECTed us, the open
// request is skipped and the session is returned as-is.
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // Construct the session in place; connect_to_mds() hands us the
  // Connection the session will use.
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      // same instance rejected us before; don't retry the open
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
		       "because we were rejected" << dendl;
      return session;
    } else {
      // the daemon was replaced (new addrs); forget the old rejection
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
		       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  // Send the open request along with our metadata and supported features.
  auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2050
2051void Client::_close_mds_session(MetaSession *s)
2052{
11fdf7f2 2053 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2054 s->state = MetaSession::STATE_CLOSING;
11fdf7f2 2055 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2056}
2057
// Locally finalize a session: mark it closed, drop its connection, wake
// waiters, release its caps and requests, and remove it from the map.
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // Wake anybody blocked on this session opening or on unmount progress.
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  // NOTE: erasing the map entry destroys *s — this must be the last use
  // of the pointer.
  mds_sessions.erase(s->mds_num);
}
2069
11fdf7f2 2070void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2071{
2072 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2073 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae
FG
2074
2075 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2076 if (!session) {
2077 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2078 return;
2079 }
2080
2081 switch (m->get_op()) {
2082 case CEPH_SESSION_OPEN:
11fdf7f2
TL
2083 {
2084 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2085 missing_features -= m->supported_features;
2086 if (!missing_features.empty()) {
2087 lderr(cct) << "mds." << from << " lacks required features '"
2088 << missing_features << "', closing session " << dendl;
2089 rejected_by_mds[session->mds_num] = session->addrs;
2090 _close_mds_session(session);
2091 _closed_mds_session(session);
2092 break;
2093 }
2094 session->mds_features = std::move(m->supported_features);
2095
2096 renew_caps(session);
2097 session->state = MetaSession::STATE_OPEN;
2098 if (unmounting)
2099 mount_cond.Signal();
2100 else
2101 connect_mds_targets(from);
2102 signal_context_list(session->waiting_for_open);
2103 break;
2104 }
7c673cae
FG
2105
2106 case CEPH_SESSION_CLOSE:
2107 _closed_mds_session(session);
2108 break;
2109
2110 case CEPH_SESSION_RENEWCAPS:
2111 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2112 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2113 session->cap_ttl =
2114 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298
TL
2115 if (was_stale)
2116 wake_up_session_caps(session, false);
7c673cae
FG
2117 }
2118 break;
2119
2120 case CEPH_SESSION_STALE:
28e407b8
AA
2121 // invalidate session caps/leases
2122 session->cap_gen++;
2123 session->cap_ttl = ceph_clock_now();
2124 session->cap_ttl -= 1;
7c673cae
FG
2125 renew_caps(session);
2126 break;
2127
2128 case CEPH_SESSION_RECALL_STATE:
2129 trim_caps(session, m->get_max_caps());
2130 break;
2131
2132 case CEPH_SESSION_FLUSHMSG:
a8e16298 2133 /* flush cap release */
11fdf7f2
TL
2134 if (auto& m = session->release; m) {
2135 session->con->send_message2(std::move(m));
a8e16298 2136 }
11fdf7f2 2137 session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2138 break;
2139
2140 case CEPH_SESSION_FORCE_RO:
2141 force_session_readonly(session);
2142 break;
2143
2144 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2145 {
2146 std::string_view error_str;
2147 auto it = m->metadata.find("error_string");
2148 if (it != m->metadata.end())
2149 error_str = it->second;
2150 else
2151 error_str = "unknown error";
2152 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2153
11fdf7f2
TL
2154 rejected_by_mds[session->mds_num] = session->addrs;
2155 _closed_mds_session(session);
2156 }
7c673cae
FG
2157 break;
2158
2159 default:
2160 ceph_abort();
2161 }
7c673cae
FG
2162}
2163
2164bool Client::_any_stale_sessions() const
2165{
11fdf7f2 2166 ceph_assert(client_lock.is_locked_by_me());
7c673cae 2167
11fdf7f2
TL
2168 for (const auto &p : mds_sessions) {
2169 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2170 return true;
2171 }
2172 }
2173
2174 return false;
2175}
2176
// Tear down every stale MDS session.
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    // Advance before acting: _closed_mds_session() erases the entry from
    // mds_sessions, which would invalidate 'it'.
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2188
// (Re)build the wire message for a MetaRequest and send it on the given
// session.  drop_cap_releases: discard cap releases instead of attaching
// them (used before the cap reconnect has been sent).
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // the MDS already applied this op but never committed it: replay
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  // SETXATTR may reference pool names, so pin the osdmap epoch too.
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request: remember when we sent it
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Record the cap migration seq we saw at send time; used later to decide
  // whether an ESTALE retry could possibly succeed.
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2237
11fdf7f2 2238MClientRequest::ref Client::build_client_request(MetaRequest *request)
7c673cae 2239{
11fdf7f2 2240 auto req = MClientRequest::create(request->get_op());
7c673cae
FG
2241 req->set_tid(request->tid);
2242 req->set_stamp(request->op_stamp);
2243 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2244
2245 // if the filepath's haven't been set, set them!
2246 if (request->path.empty()) {
2247 Inode *in = request->inode();
2248 Dentry *de = request->dentry();
2249 if (in)
2250 in->make_nosnap_relative_path(request->path);
2251 else if (de) {
2252 if (de->inode)
2253 de->inode->make_nosnap_relative_path(request->path);
2254 else if (de->dir) {
2255 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2256 request->path.push_dentry(de->name);
2257 }
2258 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2259 << " No path, inode, or appropriately-endowed dentry given!"
2260 << dendl;
2261 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2262 << " No path, inode, or dentry given!"
2263 << dendl;
2264 }
2265 req->set_filepath(request->get_filepath());
2266 req->set_filepath2(request->get_filepath2());
2267 req->set_data(request->data);
2268 req->set_retry_attempt(request->retry_attempt++);
2269 req->head.num_fwd = request->num_fwd;
2270 const gid_t *_gids;
2271 int gid_count = request->perms.get_gids(&_gids);
2272 req->set_gid_list(gid_count, _gids);
2273 return req;
2274}
2275
2276
2277
11fdf7f2 2278void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2279{
2280 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2281 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2282 if (!session) {
7c673cae
FG
2283 return;
2284 }
2285 ceph_tid_t tid = fwd->get_tid();
2286
2287 if (mds_requests.count(tid) == 0) {
11fdf7f2 2288 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2289 return;
2290 }
2291
2292 MetaRequest *request = mds_requests[tid];
11fdf7f2 2293 ceph_assert(request);
7c673cae
FG
2294
2295 // reset retry counter
2296 request->retry_attempt = 0;
2297
2298 // request not forwarded, or dest mds has no session.
2299 // resend.
11fdf7f2 2300 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2301 << " fwd " << fwd->get_num_fwd()
2302 << " to mds." << fwd->get_dest_mds()
2303 << ", resending to " << fwd->get_dest_mds()
2304 << dendl;
2305
2306 request->mds = -1;
2307 request->item.remove_myself();
2308 request->num_fwd = fwd->get_num_fwd();
2309 request->resend_mds = fwd->get_dest_mds();
2310 request->caller_cond->Signal();
7c673cae
FG
2311}
2312
2313bool Client::is_dir_operation(MetaRequest *req)
2314{
2315 int op = req->get_op();
2316 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2317 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2318 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2319 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2320 return true;
2321 return false;
2322}
2323
11fdf7f2 2324void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2325{
2326 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2327 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2328 if (!session) {
7c673cae
FG
2329 return;
2330 }
2331
2332 ceph_tid_t tid = reply->get_tid();
2333 bool is_safe = reply->is_safe();
2334
2335 if (mds_requests.count(tid) == 0) {
11fdf7f2 2336 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2337 << " safe is:" << is_safe << dendl;
7c673cae
FG
2338 return;
2339 }
2340 MetaRequest *request = mds_requests.at(tid);
2341
11fdf7f2 2342 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2343 << " tid " << tid << dendl;
2344
2345 if (request->got_unsafe && !is_safe) {
2346 //duplicate response
2347 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2348 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2349 return;
2350 }
2351
2352 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2353 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2354 << " from mds." << request->mds << dendl;
2355 request->send_to_auth = true;
2356 request->resend_mds = choose_target_mds(request);
2357 Inode *in = request->inode();
11fdf7f2 2358 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2359 if (request->resend_mds >= 0 &&
2360 request->resend_mds == request->mds &&
2361 (in == NULL ||
11fdf7f2
TL
2362 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2363 request->sent_on_mseq == it->second.mseq)) {
2364 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae
FG
2365 } else {
2366 request->caller_cond->Signal();
7c673cae
FG
2367 return;
2368 }
7c673cae
FG
2369 }
2370
11fdf7f2 2371 ceph_assert(!request->reply);
7c673cae
FG
2372 request->reply = reply;
2373 insert_trace(request, session);
2374
2375 // Handle unsafe reply
2376 if (!is_safe) {
2377 request->got_unsafe = true;
2378 session->unsafe_requests.push_back(&request->unsafe_item);
2379 if (is_dir_operation(request)) {
2380 Inode *dir = request->inode();
11fdf7f2 2381 ceph_assert(dir);
7c673cae
FG
2382 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2383 }
2384 if (request->target) {
2385 InodeRef &in = request->target;
2386 in->unsafe_ops.push_back(&request->unsafe_target_item);
2387 }
2388 }
2389
2390 // Only signal the caller once (on the first reply):
2391 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2392 if (!is_safe || !request->got_unsafe) {
2393 Cond cond;
2394 request->dispatch_cond = &cond;
2395
2396 // wake up waiter
11fdf7f2 2397 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
7c673cae
FG
2398 request->caller_cond->Signal();
2399
2400 // wake for kick back
2401 while (request->dispatch_cond) {
11fdf7f2 2402 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
7c673cae
FG
2403 cond.Wait(client_lock);
2404 }
2405 }
2406
2407 if (is_safe) {
2408 // the filesystem change is committed to disk
2409 // we're done, clean up
2410 if (request->got_unsafe) {
2411 request->unsafe_item.remove_myself();
2412 request->unsafe_dir_item.remove_myself();
2413 request->unsafe_target_item.remove_myself();
2414 signal_cond_list(request->waitfor_safe);
2415 }
2416 request->item.remove_myself();
2417 unregister_request(request);
2418 }
2419 if (unmounting)
2420 mount_cond.Signal();
2421}
2422
2423void Client::_handle_full_flag(int64_t pool)
2424{
2425 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2426 << "on " << pool << dendl;
2427 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2428 // to do this rather than blocking, because otherwise when we fill up we
2429 // potentially lock caps forever on files with dirty pages, and we need
2430 // to be able to release those caps to the MDS so that it can delete files
2431 // and free up space.
2432 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2433
2434 // For all inodes with layouts in this pool and a pending flush write op
2435 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2436 // from ObjectCacher so that it doesn't re-issue the write in response to
2437 // the ENOSPC error.
2438 // Fortunately since we're cancelling everything in a given pool, we don't
2439 // need to know which ops belong to which ObjectSet, we can just blow all
2440 // the un-flushed cached data away and mark any dirty inodes' async_err
2441 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2442 // affecting this pool, and all the objectsets we're purging were also
2443 // in this pool.
2444 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2445 i != inode_map.end(); ++i)
2446 {
2447 Inode *inode = i->second;
2448 if (inode->oset.dirty_or_tx
2449 && (pool == -1 || inode->layout.pool_id == pool)) {
2450 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2451 << " has dirty objects, purging and setting ENOSPC" << dendl;
2452 objectcacher->purge_set(&inode->oset);
2453 inode->set_async_err(-ENOSPC);
2454 }
2455 }
2456
2457 if (cancelled_epoch != (epoch_t)-1) {
2458 set_cap_epoch_barrier(cancelled_epoch);
2459 }
2460}
2461
11fdf7f2 2462void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2463{
31f18b77
FG
2464 std::set<entity_addr_t> new_blacklists;
2465 objecter->consume_blacklist_events(&new_blacklists);
2466
11fdf7f2
TL
2467 const auto myaddrs = messenger->get_myaddrs();
2468 bool new_blacklist = false;
2469 bool prenautilus = objecter->with_osdmap(
2470 [&](const OSDMap& o) {
2471 return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
2472 });
2473 if (!blacklisted) {
2474 for (auto a : myaddrs.v) {
2475 // blacklist entries are always TYPE_ANY for nautilus+
2476 a.set_type(entity_addr_t::TYPE_ANY);
2477 if (new_blacklists.count(a)) {
2478 new_blacklist = true;
2479 break;
2480 }
2481 if (prenautilus) {
2482 // ...except pre-nautilus, they were TYPE_LEGACY
2483 a.set_type(entity_addr_t::TYPE_LEGACY);
2484 if (new_blacklists.count(a)) {
2485 new_blacklist = true;
2486 break;
2487 }
2488 }
2489 }
2490 }
2491 if (new_blacklist) {
31f18b77
FG
2492 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2493 return o.get_epoch();
2494 });
2495 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2496 blacklisted = true;
31f18b77 2497
11fdf7f2 2498 _abort_mds_sessions(-EBLACKLISTED);
31f18b77
FG
2499
2500 // Since we know all our OSD ops will fail, cancel them all preemtively,
2501 // so that on an unhealthy cluster we can umount promptly even if e.g.
2502 // some PGs were inaccessible.
2503 objecter->op_cancel_writes(-EBLACKLISTED);
2504
2505 } else if (blacklisted) {
2506 // Handle case where we were blacklisted but no longer are
11fdf7f2
TL
2507 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2508 return o.is_blacklisted(myaddrs);});
31f18b77
FG
2509 }
2510
f64942e4
AA
2511 // Always subscribe to next osdmap for blacklisted client
2512 // until this client is not blacklisted.
2513 if (blacklisted) {
2514 objecter->maybe_request_map();
2515 }
2516
7c673cae
FG
2517 if (objecter->osdmap_full_flag()) {
2518 _handle_full_flag(-1);
2519 } else {
2520 // Accumulate local list of full pools so that I can drop
2521 // the objecter lock before re-entering objecter in
2522 // cancel_writes
2523 std::vector<int64_t> full_pools;
2524
2525 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2526 for (const auto& kv : o.get_pools()) {
2527 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2528 full_pools.push_back(kv.first);
2529 }
2530 }
2531 });
2532
2533 for (auto p : full_pools)
2534 _handle_full_flag(p);
2535
2536 // Subscribe to subsequent maps to watch for the full flag going
2537 // away. For the global full flag objecter does this for us, but
2538 // it pays no attention to the per-pool full flag so in this branch
2539 // we do it ourselves.
2540 if (!full_pools.empty()) {
2541 objecter->maybe_request_map();
2542 }
2543 }
7c673cae
FG
2544}
2545
2546
2547// ------------------------
2548// incoming messages
2549
2550
11fdf7f2 2551bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2552{
11fdf7f2 2553 std::lock_guard l(client_lock);
7c673cae
FG
2554 if (!initialized) {
2555 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2556 return true;
2557 }
2558
2559 switch (m->get_type()) {
2560 // mounting and mds sessions
2561 case CEPH_MSG_MDS_MAP:
11fdf7f2 2562 handle_mds_map(MMDSMap::msgref_cast(m));
7c673cae
FG
2563 break;
2564 case CEPH_MSG_FS_MAP:
11fdf7f2 2565 handle_fs_map(MFSMap::msgref_cast(m));
7c673cae
FG
2566 break;
2567 case CEPH_MSG_FS_MAP_USER:
11fdf7f2 2568 handle_fs_map_user(MFSMapUser::msgref_cast(m));
7c673cae
FG
2569 break;
2570 case CEPH_MSG_CLIENT_SESSION:
11fdf7f2 2571 handle_client_session(MClientSession::msgref_cast(m));
7c673cae
FG
2572 break;
2573
2574 case CEPH_MSG_OSD_MAP:
11fdf7f2 2575 handle_osd_map(MOSDMap::msgref_cast(m));
7c673cae
FG
2576 break;
2577
2578 // requests
2579 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
11fdf7f2 2580 handle_client_request_forward(MClientRequestForward::msgref_cast(m));
7c673cae
FG
2581 break;
2582 case CEPH_MSG_CLIENT_REPLY:
11fdf7f2
TL
2583 handle_client_reply(MClientReply::msgref_cast(m));
2584 break;
2585
2586 // reclaim reply
2587 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2588 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
7c673cae
FG
2589 break;
2590
2591 case CEPH_MSG_CLIENT_SNAP:
11fdf7f2 2592 handle_snap(MClientSnap::msgref_cast(m));
7c673cae
FG
2593 break;
2594 case CEPH_MSG_CLIENT_CAPS:
11fdf7f2 2595 handle_caps(MClientCaps::msgref_cast(m));
7c673cae
FG
2596 break;
2597 case CEPH_MSG_CLIENT_LEASE:
11fdf7f2 2598 handle_lease(MClientLease::msgref_cast(m));
7c673cae
FG
2599 break;
2600 case MSG_COMMAND_REPLY:
2601 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
11fdf7f2 2602 handle_command_reply(MCommandReply::msgref_cast(m));
7c673cae
FG
2603 } else {
2604 return false;
2605 }
2606 break;
2607 case CEPH_MSG_CLIENT_QUOTA:
11fdf7f2 2608 handle_quota(MClientQuota::msgref_cast(m));
7c673cae
FG
2609 break;
2610
2611 default:
2612 return false;
2613 }
2614
2615 // unmounting?
2616 if (unmounting) {
2617 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2618 << "+" << inode_map.size() << dendl;
2619 long unsigned size = lru.lru_get_size() + inode_map.size();
2620 trim_cache();
2621 if (size < lru.lru_get_size() + inode_map.size()) {
2622 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2623 mount_cond.Signal();
2624 } else {
2625 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2626 << "+" << inode_map.size() << dendl;
2627 }
2628 }
2629
2630 return true;
2631}
2632
11fdf7f2 2633void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2634{
2635 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2636
2637 signal_cond_list(waiting_for_fsmap);
2638
2639 monclient->sub_got("fsmap", fsmap->get_epoch());
2640}
2641
11fdf7f2 2642void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2643{
2644 fsmap_user.reset(new FSMapUser);
2645 *fsmap_user = m->get_fsmap();
7c673cae
FG
2646
2647 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2648 signal_cond_list(waiting_for_fsmap);
2649}
2650
11fdf7f2 2651void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
7c673cae 2652{
f64942e4 2653 mds_gid_t old_inc, new_inc;
7c673cae 2654 if (m->get_epoch() <= mdsmap->get_epoch()) {
11fdf7f2 2655 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
7c673cae
FG
2656 << " is identical to or older than our "
2657 << mdsmap->get_epoch() << dendl;
7c673cae 2658 return;
f64942e4 2659 }
7c673cae 2660
11fdf7f2 2661 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
7c673cae
FG
2662
2663 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2664 oldmap.swap(mdsmap);
2665
2666 mdsmap->decode(m->get_encoded());
2667
2668 // Cancel any commands for missing or laggy GIDs
2669 std::list<ceph_tid_t> cancel_ops;
2670 auto &commands = command_table.get_commands();
2671 for (const auto &i : commands) {
2672 auto &op = i.second;
2673 const mds_gid_t op_mds_gid = op.mds_gid;
2674 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2675 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2676 cancel_ops.push_back(i.first);
2677 if (op.outs) {
2678 std::ostringstream ss;
2679 ss << "MDS " << op_mds_gid << " went away";
2680 *(op.outs) = ss.str();
2681 }
2682 op.con->mark_down();
2683 if (op.on_finish) {
2684 op.on_finish->complete(-ETIMEDOUT);
2685 }
2686 }
2687 }
2688
2689 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2690 i != cancel_ops.end(); ++i) {
2691 command_table.erase(*i);
2692 }
2693
2694 // reset session
11fdf7f2 2695 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 2696 mds_rank_t mds = p->first;
11fdf7f2 2697 MetaSession *session = &p->second;
7c673cae
FG
2698 ++p;
2699
2700 int oldstate = oldmap->get_state(mds);
2701 int newstate = mdsmap->get_state(mds);
2702 if (!mdsmap->is_up(mds)) {
2703 session->con->mark_down();
11fdf7f2 2704 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f64942e4
AA
2705 old_inc = oldmap->get_incarnation(mds);
2706 new_inc = mdsmap->get_incarnation(mds);
2707 if (old_inc != new_inc) {
2708 ldout(cct, 1) << "mds incarnation changed from "
2709 << old_inc << " to " << new_inc << dendl;
2710 oldstate = MDSMap::STATE_NULL;
2711 }
7c673cae 2712 session->con->mark_down();
11fdf7f2 2713 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
2714 // When new MDS starts to take over, notify kernel to trim unused entries
2715 // in its dcache/icache. Hopefully, the kernel will release some unused
2716 // inodes before the new MDS enters reconnect state.
2717 trim_cache_for_reconnect(session);
2718 } else if (oldstate == newstate)
2719 continue; // no change
2720
2721 session->mds_state = newstate;
2722 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 2723 session->con = messenger->connect_to_mds(session->addrs);
7c673cae 2724 send_reconnect(session);
81eedcae
TL
2725 } else if (newstate > MDSMap::STATE_RECONNECT) {
2726 if (oldstate < MDSMap::STATE_RECONNECT) {
2727 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2728 _closed_mds_session(session);
2729 continue;
2730 }
2731 if (newstate >= MDSMap::STATE_ACTIVE) {
2732 if (oldstate < MDSMap::STATE_ACTIVE) {
2733 // kick new requests
2734 kick_requests(session);
2735 kick_flushing_caps(session);
2736 signal_context_list(session->waiting_for_open);
2737 wake_up_session_caps(session, true);
2738 }
2739 connect_mds_targets(mds);
7c673cae 2740 }
7c673cae
FG
2741 } else if (newstate == MDSMap::STATE_NULL &&
2742 mds >= mdsmap->get_max_mds()) {
2743 _closed_mds_session(session);
2744 }
2745 }
2746
2747 // kick any waiting threads
2748 signal_cond_list(waiting_for_mdsmap);
2749
7c673cae
FG
2750 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2751}
2752
// Send our cap/snaprealm state to an MDS in reconnect state so it can
// rebuild its session for us after a failover.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any queued (now obsolete) cap releases
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  // new MDSes allow splitting the reconnect over multiple messages
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // if the message is getting huge, flush it and start a new one
      if (allow_multi &&
	  m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	// cap went stale: claim only PIN until the MDS re-issues
	cap.gen = session->cap_gen;
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2845
2846
2847void Client::kick_requests(MetaSession *session)
2848{
11fdf7f2 2849 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2850 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2851 p != mds_requests.end();
2852 ++p) {
31f18b77
FG
2853 MetaRequest *req = p->second;
2854 if (req->got_unsafe)
2855 continue;
2856 if (req->aborted()) {
2857 if (req->caller_cond) {
2858 req->kick = true;
2859 req->caller_cond->Signal();
2860 }
7c673cae 2861 continue;
31f18b77
FG
2862 }
2863 if (req->retry_attempt > 0)
7c673cae 2864 continue; // new requests only
31f18b77 2865 if (req->mds == session->mds_num) {
7c673cae
FG
2866 send_request(p->second, session);
2867 }
2868 }
2869}
2870
2871void Client::resend_unsafe_requests(MetaSession *session)
2872{
2873 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2874 !iter.end();
2875 ++iter)
2876 send_request(*iter, session);
2877
2878 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2879 // process completed requests in clientreplay stage.
2880 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2881 p != mds_requests.end();
2882 ++p) {
2883 MetaRequest *req = p->second;
2884 if (req->got_unsafe)
2885 continue;
31f18b77
FG
2886 if (req->aborted())
2887 continue;
7c673cae
FG
2888 if (req->retry_attempt == 0)
2889 continue; // old requests only
2890 if (req->mds == session->mds_num)
2891 send_request(req, session, true);
2892 }
2893}
2894
2895void Client::wait_unsafe_requests()
2896{
2897 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2
TL
2898 for (const auto &p : mds_sessions) {
2899 const MetaSession &s = p.second;
2900 if (!s.unsafe_requests.empty()) {
2901 MetaRequest *req = s.unsafe_requests.back();
7c673cae
FG
2902 req->get();
2903 last_unsafe_reqs.push_back(req);
2904 }
2905 }
2906
2907 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2908 p != last_unsafe_reqs.end();
2909 ++p) {
2910 MetaRequest *req = *p;
2911 if (req->unsafe_item.is_on_list())
2912 wait_on_list(req->waitfor_safe);
2913 put_request(req);
2914 }
2915}
2916
2917void Client::kick_requests_closed(MetaSession *session)
2918{
11fdf7f2 2919 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2920 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2921 p != mds_requests.end(); ) {
2922 MetaRequest *req = p->second;
2923 ++p;
2924 if (req->mds == session->mds_num) {
2925 if (req->caller_cond) {
2926 req->kick = true;
2927 req->caller_cond->Signal();
2928 }
2929 req->item.remove_myself();
2930 if (req->got_unsafe) {
11fdf7f2 2931 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 2932 req->unsafe_item.remove_myself();
eafe8130
TL
2933 if (is_dir_operation(req)) {
2934 Inode *dir = req->inode();
2935 assert(dir);
2936 dir->set_async_err(-EIO);
2937 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2938 << dir->ino << " " << req->get_tid() << dendl;
2939 req->unsafe_dir_item.remove_myself();
2940 }
2941 if (req->target) {
2942 InodeRef &in = req->target;
2943 in->set_async_err(-EIO);
2944 lderr(cct) << "kick_requests_closed drop req of inode : "
2945 << in->ino << " " << req->get_tid() << dendl;
2946 req->unsafe_target_item.remove_myself();
2947 }
7c673cae
FG
2948 signal_cond_list(req->waitfor_safe);
2949 unregister_request(req);
2950 }
2951 }
2952 }
11fdf7f2
TL
2953 ceph_assert(session->requests.empty());
2954 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
2955}
2956
2957
2958
2959
2960/************
2961 * leases
2962 */
2963
2964void Client::got_mds_push(MetaSession *s)
2965{
2966 s->seq++;
2967 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2968 if (s->state == MetaSession::STATE_CLOSING) {
11fdf7f2 2969 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2970 }
2971}
2972
11fdf7f2 2973void Client::handle_lease(const MConstRef<MClientLease>& m)
7c673cae 2974{
11fdf7f2 2975 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 2976
11fdf7f2 2977 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
7c673cae
FG
2978
2979 mds_rank_t mds = mds_rank_t(m->get_source().num());
2980 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2981 if (!session) {
7c673cae
FG
2982 return;
2983 }
2984
2985 got_mds_push(session);
2986
2987 ceph_seq_t seq = m->get_seq();
2988
2989 Inode *in;
2990 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
2991 if (inode_map.count(vino) == 0) {
2992 ldout(cct, 10) << " don't have vino " << vino << dendl;
2993 goto revoke;
2994 }
2995 in = inode_map[vino];
2996
2997 if (m->get_mask() & CEPH_LOCK_DN) {
2998 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2999 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3000 goto revoke;
3001 }
3002 Dentry *dn = in->dir->dentries[m->dname];
3003 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3004 dn->lease_mds = -1;
3005 }
3006
3007 revoke:
11fdf7f2
TL
3008 {
3009 auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
3010 m->get_connection()->send_message2(std::move(reply));
3011 }
7c673cae
FG
3012}
3013
3014void Client::put_inode(Inode *in, int n)
3015{
11fdf7f2 3016 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3017 int left = in->_put(n);
3018 if (left == 0) {
3019 // release any caps
3020 remove_all_caps(in);
3021
11fdf7f2 3022 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
7c673cae 3023 bool unclean = objectcacher->release_set(&in->oset);
11fdf7f2 3024 ceph_assert(!unclean);
7c673cae
FG
3025 inode_map.erase(in->vino());
3026 if (use_faked_inos())
3027 _release_faked_ino(in);
3028
3029 if (in == root) {
3030 root = 0;
3031 root_ancestor = 0;
3032 while (!root_parents.empty())
3033 root_parents.erase(root_parents.begin());
3034 }
3035
3036 delete in;
3037 }
3038}
3039
3040void Client::close_dir(Dir *dir)
3041{
3042 Inode *in = dir->parent_inode;
11fdf7f2
TL
3043 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3044 ceph_assert(dir->is_empty());
3045 ceph_assert(in->dir == dir);
3046 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3047 if (!in->dentries.empty())
7c673cae
FG
3048 in->get_first_parent()->put(); // unpin dentry
3049
3050 delete in->dir;
3051 in->dir = 0;
3052 put_inode(in); // unpin inode
3053}
3054
3055 /**
3056 * Don't call this with in==NULL, use get_or_create for that
3057 * leave dn set to default NULL unless you're trying to add
3058 * a new inode to a pre-created Dentry
3059 */
3060Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3061{
3062 if (!dn) {
3063 // create a new Dentry
11fdf7f2
TL
3064 dn = new Dentry(dir, name);
3065
7c673cae
FG
3066 lru.lru_insert_mid(dn); // mid or top?
3067
3068 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3069 << " dn " << dn << " (new dn)" << dendl;
3070 } else {
11fdf7f2 3071 ceph_assert(!dn->inode);
7c673cae
FG
3072 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3073 << " dn " << dn << " (old dn)" << dendl;
3074 }
3075
3076 if (in) { // link to inode
11fdf7f2 3077 InodeRef tmp_ref;
7c673cae 3078 // only one parent for directories!
11fdf7f2
TL
3079 if (in->is_dir() && !in->dentries.empty()) {
3080 tmp_ref = in; // prevent unlink below from freeing the inode.
7c673cae 3081 Dentry *olddn = in->get_first_parent();
11fdf7f2 3082 ceph_assert(olddn->dir != dir || olddn->name != name);
7c673cae
FG
3083 Inode *old_diri = olddn->dir->parent_inode;
3084 old_diri->dir_release_count++;
3085 clear_dir_complete_and_ordered(old_diri, true);
3086 unlink(olddn, true, true); // keep dir, dentry
3087 }
3088
11fdf7f2
TL
3089 dn->link(in);
3090 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3091 }
3092
3093 return dn;
3094}
3095
3096void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3097{
11fdf7f2 3098 InodeRef in(dn->inode);
7c673cae
FG
3099 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3100 << " inode " << dn->inode << dendl;
3101
3102 // unlink from inode
11fdf7f2
TL
3103 if (dn->inode) {
3104 dn->unlink();
3105 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3106 }
3107
3108 if (keepdentry) {
3109 dn->lease_mds = -1;
3110 } else {
3111 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3112
3113 // unlink from dir
11fdf7f2
TL
3114 Dir *dir = dn->dir;
3115 dn->detach();
7c673cae
FG
3116
3117 // delete den
3118 lru.lru_remove(dn);
3119 dn->put();
11fdf7f2
TL
3120
3121 if (dir->is_empty() && !keepdir)
3122 close_dir(dir);
7c673cae
FG
3123 }
3124}
3125
3126/**
3127 * For asynchronous flushes, check for errors from the IO and
3128 * update the inode if necessary
3129 */
3130class C_Client_FlushComplete : public Context {
3131private:
3132 Client *client;
3133 InodeRef inode;
3134public:
3135 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3136 void finish(int r) override {
11fdf7f2 3137 ceph_assert(client->client_lock.is_locked_by_me());
7c673cae
FG
3138 if (r != 0) {
3139 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3140 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3141 << " 0x" << std::hex << inode->ino << std::dec
3142 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3143 inode->set_async_err(r);
3144 }
3145 }
3146};
3147
3148
3149/****
3150 * caps
3151 */
3152
3153void Client::get_cap_ref(Inode *in, int cap)
3154{
3155 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3156 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3157 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3158 in->get();
3159 }
3160 if ((cap & CEPH_CAP_FILE_CACHE) &&
3161 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3162 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3163 in->get();
3164 }
3165 in->get_cap_ref(cap);
3166}
3167
3168void Client::put_cap_ref(Inode *in, int cap)
3169{
3170 int last = in->put_cap_ref(cap);
3171 if (last) {
3172 int put_nref = 0;
3173 int drop = last & ~in->caps_issued();
3174 if (in->snapid == CEPH_NOSNAP) {
3175 if ((last & CEPH_CAP_FILE_WR) &&
3176 !in->cap_snaps.empty() &&
3177 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3178 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
7c673cae
FG
3179 in->cap_snaps.rbegin()->second.writing = 0;
3180 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3181 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3182 }
3183 if (last & CEPH_CAP_FILE_BUFFER) {
3184 for (auto &p : in->cap_snaps)
3185 p.second.dirty_data = 0;
3186 signal_cond_list(in->waitfor_commit);
11fdf7f2 3187 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3188 ++put_nref;
3189 }
3190 }
3191 if (last & CEPH_CAP_FILE_CACHE) {
11fdf7f2 3192 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3193 ++put_nref;
3194 }
3195 if (drop)
3196 check_caps(in, 0);
3197 if (put_nref)
3198 put_inode(in, put_nref);
3199 }
3200}
3201
3202int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3203{
3204 int r = check_pool_perm(in, need);
3205 if (r < 0)
3206 return r;
3207
3208 while (1) {
3209 int file_wanted = in->caps_file_wanted();
3210 if ((file_wanted & need) != need) {
3211 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3212 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3213 << dendl;
3214 return -EBADF;
3215 }
3216
3217 int implemented;
3218 int have = in->caps_issued(&implemented);
3219
3220 bool waitfor_caps = false;
3221 bool waitfor_commit = false;
3222
3223 if (have & need & CEPH_CAP_FILE_WR) {
3224 if (endoff > 0 &&
3225 (endoff >= (loff_t)in->max_size ||
3226 endoff > (loff_t)(in->size << 1)) &&
3227 endoff > (loff_t)in->wanted_max_size) {
3228 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3229 in->wanted_max_size = endoff;
3230 check_caps(in, 0);
3231 }
3232
3233 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3234 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3235 waitfor_caps = true;
3236 }
3237 if (!in->cap_snaps.empty()) {
3238 if (in->cap_snaps.rbegin()->second.writing) {
3239 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3240 waitfor_caps = true;
3241 }
3242 for (auto &p : in->cap_snaps) {
3243 if (p.second.dirty_data) {
3244 waitfor_commit = true;
3245 break;
3246 }
3247 }
3248 if (waitfor_commit) {
3249 _flush(in, new C_Client_FlushComplete(this, in));
3250 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3251 }
3252 }
3253 }
3254
3255 if (!waitfor_caps && !waitfor_commit) {
3256 if ((have & need) == need) {
7c673cae
FG
3257 int revoking = implemented & ~have;
3258 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3259 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3260 << " revoking " << ccap_string(revoking)
7c673cae 3261 << dendl;
c07f9fc5 3262 if ((revoking & want) == 0) {
7c673cae
FG
3263 *phave = need | (have & want);
3264 in->get_cap_ref(need);
3265 return 0;
3266 }
3267 }
3268 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3269 waitfor_caps = true;
3270 }
3271
3272 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3273 in->auth_cap->session->readonly)
3274 return -EROFS;
3275
3276 if (in->flags & I_CAP_DROPPED) {
3277 int mds_wanted = in->caps_mds_wanted();
3278 if ((mds_wanted & need) != need) {
3279 int ret = _renew_caps(in);
3280 if (ret < 0)
3281 return ret;
3282 continue;
3283 }
a8e16298 3284 if (!(file_wanted & ~mds_wanted))
7c673cae 3285 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3286 }
3287
3288 if (waitfor_caps)
3289 wait_on_list(in->waitfor_caps);
3290 else if (waitfor_commit)
3291 wait_on_list(in->waitfor_commit);
3292 }
3293}
3294
3295int Client::get_caps_used(Inode *in)
3296{
3297 unsigned used = in->caps_used();
3298 if (!(used & CEPH_CAP_FILE_CACHE) &&
3299 !objectcacher->set_is_empty(&in->oset))
3300 used |= CEPH_CAP_FILE_CACHE;
3301 return used;
3302}
3303
3304void Client::cap_delay_requeue(Inode *in)
3305{
11fdf7f2 3306 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3307 in->hold_caps_until = ceph_clock_now();
3308 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3309 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3310}
3311
3312void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
eafe8130 3313 int flags, int used, int want, int retain,
7c673cae
FG
3314 int flush, ceph_tid_t flush_tid)
3315{
3316 int held = cap->issued | cap->implemented;
3317 int revoking = cap->implemented & ~cap->issued;
3318 retain &= ~revoking;
3319 int dropping = cap->issued & ~retain;
3320 int op = CEPH_CAP_OP_UPDATE;
3321
11fdf7f2 3322 ldout(cct, 10) << __func__ << " " << *in
7c673cae 3323 << " mds." << session->mds_num << " seq " << cap->seq
7c673cae
FG
3324 << " used " << ccap_string(used)
3325 << " want " << ccap_string(want)
3326 << " flush " << ccap_string(flush)
3327 << " retain " << ccap_string(retain)
3328 << " held "<< ccap_string(held)
3329 << " revoking " << ccap_string(revoking)
3330 << " dropping " << ccap_string(dropping)
3331 << dendl;
3332
3333 if (cct->_conf->client_inject_release_failure && revoking) {
3334 const int would_have_issued = cap->issued & retain;
3335 const int would_have_implemented = cap->implemented & (cap->issued | used);
3336 // Simulated bug:
3337 // - tell the server we think issued is whatever they issued plus whatever we implemented
3338 // - leave what we have implemented in place
3339 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3340 cap->issued = cap->issued | cap->implemented;
3341
3342 // Make an exception for revoking xattr caps: we are injecting
3343 // failure to release other caps, but allow xattr because client
3344 // will block on xattr ops if it can't release these to MDS (#9800)
3345 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3346 cap->issued ^= xattr_mask & revoking;
3347 cap->implemented ^= xattr_mask & revoking;
3348
3349 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3350 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3351 } else {
3352 // Normal behaviour
3353 cap->issued &= retain;
3354 cap->implemented &= cap->issued | used;
3355 }
3356
3357 snapid_t follows = 0;
3358
3359 if (flush)
3360 follows = in->snaprealm->get_snap_context().seq;
3361
11fdf7f2 3362 auto m = MClientCaps::create(op,
7c673cae
FG
3363 in->ino,
3364 0,
3365 cap->cap_id, cap->seq,
3366 cap->implemented,
3367 want,
3368 flush,
3369 cap->mseq,
3370 cap_epoch_barrier);
3371 m->caller_uid = in->cap_dirtier_uid;
3372 m->caller_gid = in->cap_dirtier_gid;
3373
3374 m->head.issue_seq = cap->issue_seq;
3375 m->set_tid(flush_tid);
3376
3377 m->head.uid = in->uid;
3378 m->head.gid = in->gid;
3379 m->head.mode = in->mode;
3380
3381 m->head.nlink = in->nlink;
3382
3383 if (flush & CEPH_CAP_XATTR_EXCL) {
11fdf7f2 3384 encode(in->xattrs, m->xattrbl);
7c673cae
FG
3385 m->head.xattr_version = in->xattr_version;
3386 }
3387
3388 m->size = in->size;
3389 m->max_size = in->max_size;
3390 m->truncate_seq = in->truncate_seq;
3391 m->truncate_size = in->truncate_size;
3392 m->mtime = in->mtime;
3393 m->atime = in->atime;
3394 m->ctime = in->ctime;
3395 m->btime = in->btime;
3396 m->time_warp_seq = in->time_warp_seq;
3397 m->change_attr = in->change_attr;
eafe8130
TL
3398
3399 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3400 !in->cap_snaps.empty() &&
3401 in->cap_snaps.rbegin()->second.flush_tid == 0)
3402 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3403 m->flags = flags;
3404
7c673cae
FG
3405 if (flush & CEPH_CAP_FILE_WR) {
3406 m->inline_version = in->inline_version;
3407 m->inline_data = in->inline_data;
3408 }
3409
3410 in->reported_size = in->size;
3411 m->set_snap_follows(follows);
3412 cap->wanted = want;
3413 if (cap == in->auth_cap) {
3414 m->set_max_size(in->wanted_max_size);
3415 in->requested_max_size = in->wanted_max_size;
3416 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3417 }
3418
3419 if (!session->flushing_caps_tids.empty())
3420 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3421
11fdf7f2 3422 session->con->send_message2(std::move(m));
7c673cae
FG
3423}
3424
31f18b77
FG
3425static bool is_max_size_approaching(Inode *in)
3426{
3427 /* mds will adjust max size according to the reported size */
3428 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3429 return false;
3430 if (in->size >= in->max_size)
3431 return true;
3432 /* half of previous max_size increment has been used */
3433 if (in->max_size > in->reported_size &&
3434 (in->size << 1) >= in->max_size + in->reported_size)
3435 return true;
3436 return false;
3437}
7c673cae 3438
11fdf7f2
TL
3439static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3440{
3441 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3442 return used;
3443 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3444 return used;
3445
3446 if (issued & CEPH_CAP_FILE_LAZYIO) {
3447 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3448 used &= ~CEPH_CAP_FILE_CACHE;
3449 used |= CEPH_CAP_FILE_LAZYIO;
3450 }
3451 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3452 used &= ~CEPH_CAP_FILE_BUFFER;
3453 used |= CEPH_CAP_FILE_LAZYIO;
3454 }
3455 } else {
3456 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3457 used &= ~CEPH_CAP_FILE_CACHE;
3458 used |= CEPH_CAP_FILE_LAZYIO;
3459 }
3460 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3461 used &= ~CEPH_CAP_FILE_BUFFER;
3462 used |= CEPH_CAP_FILE_LAZYIO;
3463 }
3464 }
3465 return used;
3466}
3467
7c673cae
FG
3468/**
3469 * check_caps
3470 *
3471 * Examine currently used and wanted versus held caps. Release, flush or ack
3472 * revoked caps to the MDS as appropriate.
3473 *
3474 * @param in the inode to check
3475 * @param flags flags to apply to cap check
3476 */
3477void Client::check_caps(Inode *in, unsigned flags)
3478{
3479 unsigned wanted = in->caps_wanted();
3480 unsigned used = get_caps_used(in);
3481 unsigned cap_used;
3482
7c673cae
FG
3483 int implemented;
3484 int issued = in->caps_issued(&implemented);
3485 int revoking = implemented & ~issued;
3486
11fdf7f2
TL
3487 int orig_used = used;
3488 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3489
7c673cae 3490 int retain = wanted | used | CEPH_CAP_PIN;
a8e16298
TL
3491 if (!unmounting && in->nlink > 0) {
3492 if (wanted) {
7c673cae 3493 retain |= CEPH_CAP_ANY;
a8e16298
TL
3494 } else if (in->is_dir() &&
3495 (issued & CEPH_CAP_FILE_SHARED) &&
3496 (in->flags & I_COMPLETE)) {
3497 // we do this here because we don't want to drop to Fs (and then
3498 // drop the Fs if we do a create!) if that alone makes us send lookups
3499 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3500 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3501 retain |= wanted;
3502 } else {
7c673cae 3503 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3504 // keep RD only if we didn't have the file open RW,
3505 // because then the mds would revoke it anyway to
3506 // journal max_size=0.
3507 if (in->max_size == 0)
3508 retain |= CEPH_CAP_ANY_RD;
3509 }
7c673cae
FG
3510 }
3511
11fdf7f2 3512 ldout(cct, 10) << __func__ << " on " << *in
7c673cae
FG
3513 << " wanted " << ccap_string(wanted)
3514 << " used " << ccap_string(used)
3515 << " issued " << ccap_string(issued)
3516 << " revoking " << ccap_string(revoking)
3517 << " flags=" << flags
3518 << dendl;
3519
3520 if (in->snapid != CEPH_NOSNAP)
3521 return; //snap caps last forever, can't write
3522
3523 if (in->caps.empty())
3524 return; // guard if at end of func
3525
11fdf7f2
TL
3526 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3527 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
94b18763 3528 if (_release(in))
11fdf7f2 3529 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
94b18763 3530 }
7c673cae 3531
7c673cae 3532
11fdf7f2
TL
3533 for (auto &p : in->caps) {
3534 mds_rank_t mds = p.first;
3535 Cap &cap = p.second;
7c673cae 3536
11fdf7f2 3537 MetaSession *session = &mds_sessions.at(mds);
7c673cae
FG
3538
3539 cap_used = used;
11fdf7f2 3540 if (in->auth_cap && &cap != in->auth_cap)
7c673cae
FG
3541 cap_used &= ~in->auth_cap->issued;
3542
11fdf7f2 3543 revoking = cap.implemented & ~cap.issued;
7c673cae
FG
3544
3545 ldout(cct, 10) << " cap mds." << mds
11fdf7f2
TL
3546 << " issued " << ccap_string(cap.issued)
3547 << " implemented " << ccap_string(cap.implemented)
7c673cae
FG
3548 << " revoking " << ccap_string(revoking) << dendl;
3549
3550 if (in->wanted_max_size > in->max_size &&
3551 in->wanted_max_size > in->requested_max_size &&
11fdf7f2 3552 &cap == in->auth_cap)
7c673cae
FG
3553 goto ack;
3554
3555 /* approaching file_max? */
11fdf7f2
TL
3556 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3557 &cap == in->auth_cap &&
31f18b77 3558 is_max_size_approaching(in)) {
7c673cae 3559 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3560 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3561 goto ack;
3562 }
3563
3564 /* completed revocation? */
3565 if (revoking && (revoking & cap_used) == 0) {
11fdf7f2 3566 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
7c673cae
FG
3567 goto ack;
3568 }
3569
3570 /* want more caps from mds? */
11fdf7f2 3571 if (wanted & ~(cap.wanted | cap.issued))
7c673cae
FG
3572 goto ack;
3573
3574 if (!revoking && unmounting && (cap_used == 0))
3575 goto ack;
3576
11fdf7f2 3577 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
a8e16298 3578 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3579 continue;
3580
11fdf7f2 3581 if (!(flags & CHECK_CAPS_NODELAY)) {
7c673cae 3582 ldout(cct, 10) << "delaying cap release" << dendl;
11fdf7f2 3583 cap_delay_requeue(in);
7c673cae
FG
3584 continue;
3585 }
3586
3587 ack:
eafe8130
TL
3588 if (&cap == in->auth_cap) {
3589 if (in->flags & I_KICK_FLUSH) {
3590 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3591 << " to mds." << mds << dendl;
3592 kick_flushing_caps(in, session);
3593 }
3594 if (!in->cap_snaps.empty() &&
3595 in->cap_snaps.rbegin()->second.flush_tid == 0)
3596 flush_snaps(in);
7c673cae
FG
3597 }
3598
3599 int flushing;
3600 ceph_tid_t flush_tid;
11fdf7f2 3601 if (in->auth_cap == &cap && in->dirty_caps) {
7c673cae
FG
3602 flushing = mark_caps_flushing(in, &flush_tid);
3603 } else {
3604 flushing = 0;
3605 flush_tid = 0;
3606 }
3607
eafe8130
TL
3608 int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
3609 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3610 flushing, flush_tid);
7c673cae
FG
3611 }
3612}
3613
3614
3615void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3616{
3617 int used = get_caps_used(in);
3618 int dirty = in->caps_dirty();
11fdf7f2 3619 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3620
3621 if (in->cap_snaps.size() &&
3622 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3623 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3624 return;
3625 } else if (in->caps_dirty() ||
3626 (used & CEPH_CAP_FILE_WR) ||
3627 (dirty & CEPH_CAP_ANY_WR)) {
3628 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3629 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3630 CapSnap &capsnap = capsnapem.first->second;
3631 capsnap.context = old_snapc;
3632 capsnap.issued = in->caps_issued();
3633 capsnap.dirty = in->caps_dirty();
3634
3635 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3636
3637 capsnap.uid = in->uid;
3638 capsnap.gid = in->gid;
3639 capsnap.mode = in->mode;
3640 capsnap.btime = in->btime;
3641 capsnap.xattrs = in->xattrs;
3642 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3643 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3644 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
7c673cae
FG
3645
3646 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3647 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3648 capsnap.writing = 1;
3649 } else {
3650 finish_cap_snap(in, capsnap, used);
3651 }
3652 } else {
11fdf7f2 3653 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3654 }
3655}
3656
3657void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3658{
11fdf7f2 3659 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3660 capsnap.size = in->size;
3661 capsnap.mtime = in->mtime;
3662 capsnap.atime = in->atime;
3663 capsnap.ctime = in->ctime;
3664 capsnap.time_warp_seq = in->time_warp_seq;
3665 capsnap.change_attr = in->change_attr;
7c673cae
FG
3666 capsnap.dirty |= in->caps_dirty();
3667
11fdf7f2
TL
3668 /* Only reset it if it wasn't set before */
3669 if (capsnap.cap_dirtier_uid == -1) {
3670 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3671 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3672 }
3673
7c673cae
FG
3674 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3675 capsnap.inline_data = in->inline_data;
3676 capsnap.inline_version = in->inline_version;
3677 }
3678
3679 if (used & CEPH_CAP_FILE_BUFFER) {
11fdf7f2 3680 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
3681 << " WRBUFFER, delaying" << dendl;
3682 } else {
3683 capsnap.dirty_data = 0;
3684 flush_snaps(in);
3685 }
3686}
3687
3688void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3689{
11fdf7f2 3690 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
7c673cae
FG
3691 in->cap_snaps.at(seq).dirty_data = 0;
3692 flush_snaps(in);
3693}
3694
eafe8130
TL
3695void Client::send_flush_snap(Inode *in, MetaSession *session,
3696 snapid_t follows, CapSnap& capsnap)
3697{
3698 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP,
3699 in->ino, in->snaprealm->ino, 0,
3700 in->auth_cap->mseq, cap_epoch_barrier);
3701 m->caller_uid = capsnap.cap_dirtier_uid;
3702 m->caller_gid = capsnap.cap_dirtier_gid;
3703
3704 m->set_client_tid(capsnap.flush_tid);
3705 m->head.snap_follows = follows;
3706
3707 m->head.caps = capsnap.issued;
3708 m->head.dirty = capsnap.dirty;
3709
3710 m->head.uid = capsnap.uid;
3711 m->head.gid = capsnap.gid;
3712 m->head.mode = capsnap.mode;
3713 m->btime = capsnap.btime;
3714
3715 m->size = capsnap.size;
3716
3717 m->head.xattr_version = capsnap.xattr_version;
3718 encode(capsnap.xattrs, m->xattrbl);
3719
3720 m->ctime = capsnap.ctime;
3721 m->btime = capsnap.btime;
3722 m->mtime = capsnap.mtime;
3723 m->atime = capsnap.atime;
3724 m->time_warp_seq = capsnap.time_warp_seq;
3725 m->change_attr = capsnap.change_attr;
3726
3727 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3728 m->inline_version = in->inline_version;
3729 m->inline_data = in->inline_data;
3730 }
3731
3732 ceph_assert(!session->flushing_caps_tids.empty());
3733 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3734
3735 session->con->send_message2(std::move(m));
3736}
3737
3738void Client::flush_snaps(Inode *in)
7c673cae 3739{
eafe8130 3740 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
11fdf7f2 3741 ceph_assert(in->cap_snaps.size());
7c673cae
FG
3742
3743 // pick auth mds
11fdf7f2 3744 ceph_assert(in->auth_cap);
7c673cae 3745 MetaSession *session = in->auth_cap->session;
7c673cae
FG
3746
3747 for (auto &p : in->cap_snaps) {
3748 CapSnap &capsnap = p.second;
eafe8130
TL
3749 // only do new flush
3750 if (capsnap.flush_tid > 0)
3751 continue;
7c673cae
FG
3752
3753 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3754 << " follows " << p.first
3755 << " size " << capsnap.size
3756 << " mtime " << capsnap.mtime
3757 << " dirty_data=" << capsnap.dirty_data
3758 << " writing=" << capsnap.writing
3759 << " on " << *in << dendl;
3760 if (capsnap.dirty_data || capsnap.writing)
eafe8130 3761 break;
7c673cae 3762
eafe8130
TL
3763 capsnap.flush_tid = ++last_flush_tid;
3764 session->flushing_caps_tids.insert(capsnap.flush_tid);
3765 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3766 if (!in->flushing_cap_item.is_on_list())
3767 session->flushing_caps.push_back(&in->flushing_cap_item);
7c673cae 3768
eafe8130 3769 send_flush_snap(in, session, p.first, capsnap);
7c673cae
FG
3770 }
3771}
3772
7c673cae
FG
3773void Client::wait_on_list(list<Cond*>& ls)
3774{
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779}
3780
3781void Client::signal_cond_list(list<Cond*>& ls)
3782{
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785}
3786
3787void Client::wait_on_context_list(list<Context*>& ls)
3788{
3789 Cond cond;
3790 bool done = false;
3791 int r;
3792 ls.push_back(new C_Cond(&cond, &done, &r));
3793 while (!done)
3794 cond.Wait(client_lock);
3795}
3796
3797void Client::signal_context_list(list<Context*>& ls)
3798{
3799 while (!ls.empty()) {
3800 ls.front()->complete(0);
3801 ls.pop_front();
3802 }
3803}
3804
a8e16298 3805void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 3806{
11fdf7f2
TL
3807 for (const auto &cap : s->caps) {
3808 auto &in = cap->inode;
a8e16298 3809 if (reconnect) {
11fdf7f2
TL
3810 in.requested_max_size = 0;
3811 in.wanted_max_size = 0;
a8e16298
TL
3812 } else {
3813 if (cap->gen < s->cap_gen) {
3814 // mds did not re-issue stale cap.
3815 cap->issued = cap->implemented = CEPH_CAP_PIN;
3816 // make sure mds knows what we want.
11fdf7f2
TL
3817 if (in.caps_file_wanted() & ~cap->wanted)
3818 in.flags |= I_CAP_DROPPED;
a8e16298
TL
3819 }
3820 }
11fdf7f2 3821 signal_cond_list(in.waitfor_caps);
7c673cae
FG
3822 }
3823}
3824
3825
3826// flush dirty data (from objectcache)
3827
3828class C_Client_CacheInvalidate : public Context {
3829private:
3830 Client *client;
3831 vinodeno_t ino;
3832 int64_t offset, length;
3833public:
3834 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3835 client(c), offset(off), length(len) {
3836 if (client->use_faked_inos())
3837 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3838 else
3839 ino = in->vino();
3840 }
3841 void finish(int r) override {
3842 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
11fdf7f2 3843 ceph_assert(!client->client_lock.is_locked_by_me());
7c673cae
FG
3844 client->_async_invalidate(ino, offset, length);
3845 }
3846};
3847
3848void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3849{
3850 if (unmounting)
3851 return;
11fdf7f2 3852 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
3853 ino_invalidate_cb(callback_handle, ino, off, len);
3854}
3855
3856void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3857
3858 if (ino_invalidate_cb)
3859 // we queue the invalidate, which calls the callback and decrements the ref
3860 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3861}
3862
3863void Client::_invalidate_inode_cache(Inode *in)
3864{
11fdf7f2 3865 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
3866
3867 // invalidate our userspace inode cache
94b18763 3868 if (cct->_conf->client_oc) {
7c673cae 3869 objectcacher->release_set(&in->oset);
94b18763
FG
3870 if (!objectcacher->set_is_empty(&in->oset))
3871 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3872 }
7c673cae
FG
3873
3874 _schedule_invalidate_callback(in, 0, 0);
3875}
3876
3877void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3878{
11fdf7f2 3879 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
3880
3881 // invalidate our userspace inode cache
3882 if (cct->_conf->client_oc) {
3883 vector<ObjectExtent> ls;
3884 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 3885 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
3886 }
3887
3888 _schedule_invalidate_callback(in, off, len);
3889}
3890
3891bool Client::_release(Inode *in)
3892{
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899}
3900
3901bool Client::_flush(Inode *in, Context *onfinish)
3902{
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921}
3922
// Synchronously write back dirty buffers overlapping [offset, offset+size)
// for *in.  Must be entered with client_lock held; the lock is dropped and
// re-taken while waiting, so callers must revalidate state afterwards.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    // flush did not complete synchronously; wait outside client_lock so the
    // objectcacher completion can make progress
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3941
3942void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3943{
11fdf7f2
TL
3944 // std::lock_guard l(client_lock);
3945 ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
7c673cae 3946 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 3947 ceph_assert(in);
7c673cae
FG
3948 _flushed(in);
3949}
3950
// Buffered data for *in has been written back: drop the cache/buffer cap
// references that were pinning the inode while data was dirty.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3957
3958
3959
3960// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3961void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3962{
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976}
3977
// Install a new cap for *in on mds_session, or update the existing one.
// Handles snap-realm attachment/migration, auth-cap switching on MDS
// migration (mseq), and the export-before-import race.  Wakes cap waiters
// when new cap bits are gained.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  // first cap: attach the inode to its snap realm; otherwise move realms if
  // the auth MDS reports a different one
  if (!in->is_any_caps()) {
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // pre-existing cap for this MDS
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
	ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap to this MDS if its migration seq is newer
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer mseq replaces wanted outright; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4074
// Drop one cap.  Optionally queues a cap-release message to the MDS.
// Erasing from in.caps destroys the Cap object, so the pointer must not be
// used afterwards.  If this was the last cap, detach the inode from its
// snap realm.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    // let the MDS know asynchronously that we are dropping this cap
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // this destroys the Cap; null the local pointer to make that explicit
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4110
4111void Client::remove_all_caps(Inode *in)
4112{
4113 while (!in->caps.empty())
11fdf7f2 4114 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4115}
4116
// Tear down every cap held on session *s (e.g. when the session dies).
// Dirty/flushing state is abandoned (the flushes can never complete now),
// and waiters are woken so they can re-evaluate.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // hold a ref: remove_cap may drop the last cap
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    // we are dropping an active cap without telling the MDS; remember that
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // in-flight flushes will never be acked; discard the bookkeeping and
      // drop the ref that was pinning the inode for the flush
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4149
91327a77 4150int Client::_do_remount(bool retry_on_error)
b32b8144 4151{
11fdf7f2 4152 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4153
b32b8144
FG
4154 errno = 0;
4155 int r = remount_cb(callback_handle);
91327a77
AA
4156 if (r == 0) {
4157 retries_on_invalidate = 0;
4158 } else {
b32b8144
FG
4159 int e = errno;
4160 client_t whoami = get_nodeid();
4161 if (r == -1) {
4162 lderr(cct) <<
4163 "failed to remount (to trim kernel dentries): "
4164 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4165 } else {
4166 lderr(cct) <<
4167 "failed to remount (to trim kernel dentries): "
4168 "return code = " << r << dendl;
4169 }
91327a77 4170 bool should_abort =
11fdf7f2
TL
4171 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4172 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4173 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4174 if (should_abort && !unmounting) {
4175 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4176 ceph_abort();
4177 }
4178 }
4179 return r;
4180}
4181
7c673cae
FG
// Finisher context that triggers a remount (with retry-on-error) from the
// remount_finisher thread; queued by _invalidate_kernel_dcache.
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};
4192
4193void Client::_invalidate_kernel_dcache()
4194{
4195 if (unmounting)
4196 return;
94b18763
FG
4197 if (can_invalidate_dentries) {
4198 if (dentry_invalidate_cb && root->dir) {
4199 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4200 p != root->dir->dentries.end();
4201 ++p) {
4202 if (p->second->inode)
4203 _schedule_invalidate_dentry_callback(p->second, false);
4204 }
7c673cae
FG
4205 }
4206 } else if (remount_cb) {
4207 // Hacky:
4208 // when remounting a file system, linux kernel trims all unused dentries in the fs
4209 remount_finisher.queue(new C_Client_Remount(this));
4210 }
4211}
4212
91327a77
AA
// If every dentry under directory `in` is a null (negative) dentry, unlink
// the expireable ones and close the Dir once empty.  Recurses into the
// snapdir if one is open for this inode.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can invalidate this entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4237
// Try to reduce the number of caps held on session *s to at most `max`
// (requested by the MDS).  Duplicate non-auth caps are dropped directly;
// for other inodes we trim expireable dentries so their caps can be
// released.  If still over budget afterwards, ask the kernel to drop its
// dcache too.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // hold a ref while we examine the inode

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
	Dentry *dn = *q;
	++q;  // advance before the dentry can be queued/unlinked
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
	  to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // only count the inode as trimmed if every dentry could be expired
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4305
4306void Client::force_session_readonly(MetaSession *s)
4307{
4308 s->readonly = true;
4309 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4310 auto &in = (*p)->inode;
4311 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4312 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4313 }
4314}
4315
7c673cae
FG
4316int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4317{
4318 MetaSession *session = in->auth_cap->session;
4319
4320 int flushing = in->dirty_caps;
11fdf7f2 4321 ceph_assert(flushing);
7c673cae
FG
4322
4323 ceph_tid_t flush_tid = ++last_flush_tid;
4324 in->flushing_cap_tids[flush_tid] = flushing;
4325
4326 if (!in->flushing_caps) {
11fdf7f2 4327 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4328 num_flushing_caps++;
4329 } else {
11fdf7f2 4330 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4331 }
4332
4333 in->flushing_caps |= flushing;
28e407b8 4334 in->mark_caps_clean();
7c673cae
FG
4335
4336 if (!in->flushing_cap_item.is_on_list())
4337 session->flushing_caps.push_back(&in->flushing_cap_item);
4338 session->flushing_caps_tids.insert(flush_tid);
4339
4340 *ptid = flush_tid;
4341 return flushing;
4342}
4343
4344void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4345{
4346 for (auto &p : in->cap_snaps) {
4347 CapSnap &capsnap = p.second;
4348 if (capsnap.flush_tid > 0) {
4349 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4350 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4351 }
4352 }
4353 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4354 it != in->flushing_cap_tids.end();
4355 ++it) {
4356 old_s->flushing_caps_tids.erase(it->first);
4357 new_s->flushing_caps_tids.insert(it->first);
4358 }
4359 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4360}
4361
4362/*
4363 * Flush all caps back to the MDS. Because the callers generally wait on the
4364 * result of this function (syncfs and umount cases), we set
4365 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4366 */
4367void Client::flush_caps_sync()
4368{
4369 ldout(cct, 10) << __func__ << dendl;
28e407b8 4370 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4371 while (!p.end()) {
4372 unsigned flags = CHECK_CAPS_NODELAY;
4373 Inode *in = *p;
4374
4375 ++p;
28e407b8
AA
4376 delayed_list.pop_front();
4377 if (p.end() && dirty_list.empty())
7c673cae
FG
4378 flags |= CHECK_CAPS_SYNCHRONOUS;
4379 check_caps(in, flags);
4380 }
4381
4382 // other caps, too
28e407b8 4383 p = dirty_list.begin();
7c673cae
FG
4384 while (!p.end()) {
4385 unsigned flags = CHECK_CAPS_NODELAY;
4386 Inode *in = *p;
4387
4388 ++p;
4389 if (p.end())
4390 flags |= CHECK_CAPS_SYNCHRONOUS;
4391 check_caps(in, flags);
4392 }
4393}
4394
7c673cae
FG
4395void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4396{
4397 while (in->flushing_caps) {
4398 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
11fdf7f2 4399 ceph_assert(it != in->flushing_cap_tids.end());
7c673cae
FG
4400 if (it->first > want)
4401 break;
11fdf7f2 4402 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
7c673cae
FG
4403 << ccap_string(it->second) << " want " << want
4404 << " last " << it->first << dendl;
4405 wait_on_list(in->waitfor_caps);
4406 }
4407}
4408
4409void Client::wait_sync_caps(ceph_tid_t want)
4410{
4411 retry:
11fdf7f2 4412 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
7c673cae 4413 << num_flushing_caps << " total flushing)" << dendl;
11fdf7f2
TL
4414 for (auto &p : mds_sessions) {
4415 MetaSession *s = &p.second;
7c673cae
FG
4416 if (s->flushing_caps_tids.empty())
4417 continue;
4418 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4419 if (oldest_tid <= want) {
11fdf7f2 4420 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
7c673cae
FG
4421 << " (want " << want << ")" << dendl;
4422 sync_cond.Wait(client_lock);
4423 goto retry;
4424 }
4425 }
4426}
4427
eafe8130
TL
// Re-send every pending cap flush and capsnap flush for *in to its auth
// session (e.g. after reconnect).  Entries in flushing_cap_tids with a zero
// caps value mark capsnap flushes; cap flushes older than the newest
// pending capsnap carry FLAG_PENDING_CAPSNAP.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the tid of the newest pending snap flush (value == 0 marks one)
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // capsnap flush; cap_snaps entries appear in the same tid order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4461
7c673cae
FG
4462void Client::kick_flushing_caps(MetaSession *session)
4463{
4464 mds_rank_t mds = session->mds_num;
11fdf7f2 4465 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
7c673cae
FG
4466
4467 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4468 Inode *in = *p;
eafe8130
TL
4469 if (in->flags & I_KICK_FLUSH) {
4470 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4471 kick_flushing_caps(in, session);
4472 }
7c673cae 4473 }
7c673cae
FG
4474}
4475
// Before reconnect: re-send cap flushes whose caps were (partially)
// revoked, so the MDS processes them before issuing those caps to another
// client.  Flushes fully covered by the still-issued caps are deferred to
// kick_flushing_caps() after reconnect via I_KICK_FLUSH.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4503
7c673cae
FG
4504void SnapRealm::build_snap_context()
4505{
4506 set<snapid_t> snaps;
4507 snapid_t max_seq = seq;
4508
4509 // start with prior_parents?
4510 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4511 snaps.insert(prior_parent_snaps[i]);
4512
4513 // current parent's snaps
4514 if (pparent) {
4515 const SnapContext& psnapc = pparent->get_snap_context();
4516 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4517 if (psnapc.snaps[i] >= parent_since)
4518 snaps.insert(psnapc.snaps[i]);
4519 if (psnapc.seq > max_seq)
4520 max_seq = psnapc.seq;
4521 }
4522
4523 // my snaps
4524 for (unsigned i=0; i<my_snaps.size(); i++)
4525 snaps.insert(my_snaps[i]);
4526
4527 // ok!
4528 cached_snap_context.seq = max_seq;
4529 cached_snap_context.snaps.resize(0);
4530 cached_snap_context.snaps.reserve(snaps.size());
4531 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4532 cached_snap_context.snaps.push_back(*p);
4533}
4534
4535void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4536{
4537 list<SnapRealm*> q;
4538 q.push_back(realm);
4539
4540 while (!q.empty()) {
4541 realm = q.front();
4542 q.pop_front();
4543
11fdf7f2 4544 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4545 realm->invalidate_cache();
4546
4547 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4548 p != realm->pchildren.end();
4549 ++p)
4550 q.push_back(*p);
4551 }
4552}
4553
4554SnapRealm *Client::get_snap_realm(inodeno_t r)
4555{
4556 SnapRealm *realm = snap_realms[r];
4557 if (!realm)
4558 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4559 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4560 realm->nref++;
4561 return realm;
4562}
4563
4564SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4565{
4566 if (snap_realms.count(r) == 0) {
11fdf7f2 4567 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4568 return NULL;
4569 }
4570 SnapRealm *realm = snap_realms[r];
11fdf7f2 4571 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4572 realm->nref++;
4573 return realm;
4574}
4575
4576void Client::put_snap_realm(SnapRealm *realm)
4577{
11fdf7f2 4578 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
7c673cae
FG
4579 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4580 if (--realm->nref == 0) {
4581 snap_realms.erase(realm->ino);
4582 if (realm->pparent) {
4583 realm->pparent->pchildren.erase(realm);
4584 put_snap_realm(realm->pparent);
4585 }
4586 delete realm;
4587 }
4588}
4589
4590bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4591{
4592 if (realm->parent != parent) {
11fdf7f2 4593 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4594 << " " << realm->parent << " -> " << parent << dendl;
4595 realm->parent = parent;
4596 if (realm->pparent) {
4597 realm->pparent->pchildren.erase(realm);
4598 put_snap_realm(realm->pparent);
4599 }
4600 realm->pparent = get_snap_realm(parent);
4601 realm->pparent->pchildren.insert(realm);
4602 return true;
4603 }
4604 return false;
4605}
4606
4607static bool has_new_snaps(const SnapContext& old_snapc,
4608 const SnapContext& new_snapc)
4609{
4610 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4611}
4612
4613
11fdf7f2 4614void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4615{
4616 SnapRealm *first_realm = NULL;
11fdf7f2 4617 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4618
4619 map<SnapRealm*, SnapContext> dirty_realms;
4620
11fdf7f2 4621 auto p = bl.cbegin();
7c673cae
FG
4622 while (!p.end()) {
4623 SnapRealmInfo info;
11fdf7f2 4624 decode(info, p);
7c673cae
FG
4625 SnapRealm *realm = get_snap_realm(info.ino());
4626
4627 bool invalidate = false;
4628
4629 if (info.seq() > realm->seq) {
11fdf7f2 4630 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4631 << dendl;
4632
4633 if (flush) {
4634 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4635 // flush me + children
4636 list<SnapRealm*> q;
4637 q.push_back(realm);
4638 while (!q.empty()) {
4639 SnapRealm *realm = q.front();
4640 q.pop_front();
4641
4642 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4643 p != realm->pchildren.end();
4644 ++p)
4645 q.push_back(*p);
4646
4647 if (dirty_realms.count(realm) == 0) {
4648 realm->nref++;
4649 dirty_realms[realm] = realm->get_snap_context();
4650 }
4651 }
4652 }
4653
4654 // update
4655 realm->seq = info.seq();
4656 realm->created = info.created();
4657 realm->parent_since = info.parent_since();
4658 realm->prior_parent_snaps = info.prior_parent_snaps;
4659 realm->my_snaps = info.my_snaps;
4660 invalidate = true;
4661 }
4662
4663 // _always_ verify parent
4664 if (adjust_realm_parent(realm, info.parent()))
4665 invalidate = true;
4666
4667 if (invalidate) {
4668 invalidate_snaprealm_and_children(realm);
11fdf7f2 4669 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4670 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4671 } else {
11fdf7f2 4672 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4673 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4674 }
4675
4676 if (!first_realm)
4677 first_realm = realm;
4678 else
4679 put_snap_realm(realm);
4680 }
4681
4682 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4683 q != dirty_realms.end();
4684 ++q) {
4685 SnapRealm *realm = q->first;
4686 // if there are new snaps ?
4687 if (has_new_snaps(q->second, realm->get_snap_context())) {
4688 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4689 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4690 while (!r.end()) {
4691 Inode *in = *r;
4692 ++r;
4693 queue_cap_snap(in, q->second);
4694 }
4695 } else {
4696 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4697 }
4698 put_snap_realm(realm);
4699 }
4700
4701 if (realm_ret)
4702 *realm_ret = first_realm;
4703 else
4704 put_snap_realm(first_realm);
4705}
4706
// Handle an MClientSnap notification.  For a SPLIT op, move the listed
// inodes (and re-parent child realms) out of their old realm into the newly
// split one; then apply the attached snap trace.  Moved inodes that gained
// snapshots are queued for snap writeback.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	if (in->snaprealm->created > info.created()) {
	  // inode already moved to a realm newer than the split target
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	// detach now; remember the old snap context so we can detect new
	// snaps (and re-attach) after the trace is applied
	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4777
11fdf7f2 4778void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4779{
4780 mds_rank_t mds = mds_rank_t(m->get_source().num());
4781 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4782 if (!session) {
7c673cae
FG
4783 return;
4784 }
4785
4786 got_mds_push(session);
4787
11fdf7f2 4788 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4789
4790 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4791 if (inode_map.count(vino)) {
4792 Inode *in = NULL;
4793 in = inode_map[vino];
4794
4795 if (in) {
4796 in->quota = m->quota;
4797 in->rstat = m->rstat;
4798 }
4799 }
7c673cae
FG
4800}
4801
// Dispatch an incoming MClientCaps message to the per-op handler, after
// applying any OSD epoch barrier it carries.  Unknown inodes get their cap
// released back (IMPORT) or the message dropped.
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    // we don't know this inode
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // IMPORT deliberately falls through: handle_cap_import installs the cap,
  // then the GRANT case below processes the issued caps
  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4865
/**
 * Handle CEPH_CAP_OP_IMPORT: an MDS is taking over a cap previously held
 * via another (peer) MDS.  Install/refresh the cap for the importing
 * session, then drop the old peer cap, and restart any cap flushes if we
 * are now flushing via the new auth cap.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // Look up the cap we hold from the exporting (peer) MDS, if any; its
  // recorded perms are carried over to the new cap.
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // Retire the peer's cap only after the new one is installed, and only
  // if it still matches the id named in the message.
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);  // drop the ref taken by update_snap_trace

  if (in->auth_cap && in->auth_cap->session == session) {
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
4901
/**
 * Handle CEPH_CAP_OP_EXPORT: the sending MDS is giving up its cap on this
 * inode.  If the message names a peer MDS, fold the exported cap into the
 * cap we hold (or will hold) from that peer, migrating auth-cap status and
 * any in-progress flushes along with it.  With no peer, the cap is simply
 * dropped (and I_CAP_DROPPED recorded so we know state was lost).
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Ignore the message if the cap id doesn't match what we hold (stale
    // export racing with a newer cap).
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// Cap is migrating to a specific peer MDS.
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // Merge into the existing peer cap, but only if the message is
	  // newer than what that cap already reflects.
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // No cap from the peer yet: create one carrying the issued bits.
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// No peer: cap state is simply being dropped by the MDS.
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4946
11fdf7f2 4947void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4948{
4949 mds_rank_t mds = session->mds_num;
11fdf7f2 4950 ceph_assert(in->caps.count(mds));
7c673cae 4951
11fdf7f2 4952 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
4953 << " size " << in->size << " -> " << m->get_size()
4954 << dendl;
4955
1adf2230
AA
4956 int issued;
4957 in->caps_issued(&issued);
4958 issued |= in->caps_dirty();
4959 update_inode_file_size(in, issued, m->get_size(),
4960 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4961}
4962
11fdf7f2 4963void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
4964{
4965 ceph_tid_t flush_ack_tid = m->get_client_tid();
4966 int dirty = m->get_dirty();
4967 int cleaned = 0;
4968 int flushed = 0;
4969
11fdf7f2
TL
4970 auto it = in->flushing_cap_tids.begin();
4971 if (it->first < flush_ack_tid) {
4972 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4973 << " got unexpected flush ack tid " << flush_ack_tid
4974 << " expected is " << it->first << dendl;
4975 }
4976 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
4977 if (!it->second) {
4978 // cap snap
4979 ++it;
4980 continue;
4981 }
7c673cae
FG
4982 if (it->first == flush_ack_tid)
4983 cleaned = it->second;
4984 if (it->first <= flush_ack_tid) {
4985 session->flushing_caps_tids.erase(it->first);
4986 in->flushing_cap_tids.erase(it++);
4987 ++flushed;
4988 continue;
4989 }
4990 cleaned &= ~it->second;
4991 if (!cleaned)
4992 break;
4993 ++it;
4994 }
4995
11fdf7f2 4996 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
4997 << " cleaned " << ccap_string(cleaned) << " on " << *in
4998 << " with " << ccap_string(dirty) << dendl;
4999
5000 if (flushed) {
5001 signal_cond_list(in->waitfor_caps);
5002 if (session->flushing_caps_tids.empty() ||
5003 *session->flushing_caps_tids.begin() > flush_ack_tid)
5004 sync_cond.Signal();
5005 }
5006
5007 if (!dirty) {
5008 in->cap_dirtier_uid = -1;
5009 in->cap_dirtier_gid = -1;
5010 }
5011
5012 if (!cleaned) {
5013 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5014 } else {
5015 if (in->flushing_caps) {
5016 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5017 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5018 in->flushing_caps &= ~cleaned;
5019 if (in->flushing_caps == 0) {
5020 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5021 num_flushing_caps--;
eafe8130 5022 if (in->flushing_cap_tids.empty())
7c673cae
FG
5023 in->flushing_cap_item.remove_myself();
5024 }
5025 if (!in->caps_dirty())
5026 put_inode(in);
5027 }
5028 }
7c673cae
FG
5029}
5030
5031
/**
 * Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS acknowledges a snapped-cap
 * flush for the snapshot preceding 'follows'.  Retire the matching
 * CapSnap and its flush tid, and wake waiters.  Duplicate acks (no
 * matching CapSnap) are logged and ignored.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for a different (older) flush attempt; ignore
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // hold a local ref: erasing the cap_snap below may drop the
      // inode's last reference otherwise
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake sync() waiters once no older flush remains outstanding
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.Signal();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5064
// Context that delivers a dentry invalidation to the registered callback
// on the async invalidator thread, outside the client lock.  The
// (possibly faked) inode numbers and the name are copied at construction
// time, so no Dentry pointer is retained by the time finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory's vino (or faked ino)
  vinodeno_t ino;     // child inode's vino; null ino when not deleting
  string name;        // dentry name being invalidated
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // not a deletion: signal with a null ino
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5092
5093void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5094{
5095 if (unmounting)
5096 return;
11fdf7f2 5097 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae
FG
5098 << " in dir " << dirino << dendl;
5099 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5100}
5101
5102void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5103{
5104 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5105 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5106}
5107
/**
 * Try to shed cached state referencing 'in' so its refcount can drop:
 * expire child dentries (recursing into snapshot subtrees), close the
 * Dir if it empties, trim an open snapdir, and finally unlink the
 * inode's own dentries — scheduling kernel dcache invalidation for them
 * when 'sched_inval' is set.
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released one reference on 'in'
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // trim the snapdir too; it pins its parent
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // drop the dentries that still link to this inode
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() removes dn from the set
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5154
/**
 * Handle CEPH_CAP_OP_GRANT / REVOKE (and the tail of IMPORT): the MDS has
 * changed the set of caps issued to us on 'in' via 'cap'.
 *
 * Updates the cap's seq/gen, refreshes inode metadata fields we are now
 * allowed to see (mode/owner, nlink, xattrs, dirstat, times, size/layout,
 * inline data, max_size), then applies the cap delta: on revocation,
 * flush or release cached data as needed and answer the MDS via
 * check_caps(); on grant, detect cross-MDS revocation races.  Finally
 * wakes cap waiters and trims the inode if its last link disappeared.
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // cap went stale if the session's generation moved past the cap's
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;  // stale caps count as PIN only
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // For each metadata family: accept the MDS's values only if we do not
  // hold (or have dirtied under) the corresponding EXCL cap, since in
  // that case our local copy is authoritative.
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;  // last link gone; try trimming below
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      // grant covers what we asked for; clear the outstanding request
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // losing the cache cap: drop cached data, then acknowledge
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5310
7c673cae
FG
/**
 * Core permission check for 'in' against the MAY_* bits in 'want'.
 *
 * @return 0 if permitted, -EACCES (or an ACL error) otherwise.
 */
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  // root bypasses all permission checks
  if (perms.uid() == 0)
    return 0;

  // Non-owner with group bits set: consult POSIX ACLs first.  A return
  // of -EAGAIN apparently means the ACLs did not decide — TODO confirm —
  // in which case we fall through to the classic mode-bit check.
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5327
5328int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5329 const UserPerm& perms)
5330{
5331 int r = _getattr_for_perm(in, perms);
5332 if (r < 0)
5333 goto out;
5334
5335 r = 0;
5336 if (strncmp(name, "system.", 7) == 0) {
5337 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5338 r = -EPERM;
5339 } else {
5340 r = inode_permission(in, perms, want);
5341 }
5342out:
1adf2230 5343 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5344 return r;
5345}
5346
5347ostream& operator<<(ostream &out, const UserPerm& perm) {
5348 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5349 return out;
5350}
5351
/**
 * Check whether the caller may apply the setattr described by (stx, mask)
 * to 'in', following the usual POSIX ownership rules.  May clear S_ISGID
 * in stx->stx_mode as a side effect (chmod(2) semantics for unprivileged
 * callers outside the file's group).
 *
 * @return 0 if permitted, -EPERM/-EACCES (or a getattr error) otherwise.
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // refresh mode/ownership (and ACL xattrs) before deciding
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncation requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only set the
    // uid to its current value
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root: must own the file, and the target gid must be the current
    // gid or one of the caller's supplementary groups
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // unprivileged caller outside the file's (possibly new) group loses
    // the setgid bit, matching chmod(2)
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamps require ownership; "set to now" (utimes with
      // NULL) only requires write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5408
5409int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5410{
181888fb 5411 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5412 unsigned want = 0;
5413
5414 if ((flags & O_ACCMODE) == O_WRONLY)
5415 want = MAY_WRITE;
5416 else if ((flags & O_ACCMODE) == O_RDWR)
5417 want = MAY_READ | MAY_WRITE;
5418 else if ((flags & O_ACCMODE) == O_RDONLY)
5419 want = MAY_READ;
5420 if (flags & O_TRUNC)
5421 want |= MAY_WRITE;
5422
5423 int r = 0;
5424 switch (in->mode & S_IFMT) {
5425 case S_IFLNK:
5426 r = -ELOOP;
5427 goto out;
5428 case S_IFDIR:
5429 if (want & MAY_WRITE) {
5430 r = -EISDIR;
5431 goto out;
5432 }
5433 break;
5434 }
5435
5436 r = _getattr_for_perm(in, perms);
5437 if (r < 0)
5438 goto out;
5439
5440 r = inode_permission(in, perms, want);
5441out:
5442 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5443 return r;
5444}
5445
5446int Client::may_lookup(Inode *dir, const UserPerm& perms)
5447{
181888fb 5448 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5449 int r = _getattr_for_perm(dir, perms);
5450 if (r < 0)
5451 goto out;
5452
5453 r = inode_permission(dir, perms, MAY_EXEC);
5454out:
5455 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5456 return r;
5457}
5458
5459int Client::may_create(Inode *dir, const UserPerm& perms)
5460{
181888fb 5461 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5462 int r = _getattr_for_perm(dir, perms);
5463 if (r < 0)
5464 goto out;
5465
5466 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5467out:
5468 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5469 return r;
5470}
5471
5472int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5473{
181888fb 5474 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5475 int r = _getattr_for_perm(dir, perms);
5476 if (r < 0)
5477 goto out;
5478
5479 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5480 if (r < 0)
5481 goto out;
5482
5483 /* 'name == NULL' means rmsnap */
5484 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5485 InodeRef otherin;
5486 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5487 if (r < 0)
5488 goto out;
5489 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5490 r = -EPERM;
5491 }
5492out:
5493 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5494 return r;
5495}
5496
5497int Client::may_hardlink(Inode *in, const UserPerm& perms)
5498{
181888fb 5499 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5500 int r = _getattr_for_perm(in, perms);
5501 if (r < 0)
5502 goto out;
5503
5504 if (perms.uid() == 0 || perms.uid() == in->uid) {
5505 r = 0;
5506 goto out;
5507 }
5508
5509 r = -EPERM;
5510 if (!S_ISREG(in->mode))
5511 goto out;
5512
5513 if (in->mode & S_ISUID)
5514 goto out;
5515
5516 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5517 goto out;
5518
5519 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5520out:
5521 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5522 return r;
5523}
5524
5525int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5526{
5527 int mask = CEPH_STAT_CAP_MODE;
5528 bool force = false;
5529 if (acl_type != NO_ACL) {
5530 mask |= CEPH_STAT_CAP_XATTR;
5531 force = in->xattr_version == 0;
5532 }
5533 return _getattr(in, mask, perms, force);
5534}
5535
5536vinodeno_t Client::_get_vino(Inode *in)
5537{
5538 /* The caller must hold the client lock */
5539 return vinodeno_t(in->ino, in->snapid);
5540}
5541
7c673cae
FG
5542/**
5543 * Resolve an MDS spec to a list of MDS daemon GIDs.
5544 *
5545 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5546 * It may be '*' in which case it matches all GIDs.
5547 *
5548 * If no error is returned, the `targets` vector will be populated with at least
5549 * one MDS.
5550 */
5551int Client::resolve_mds(
5552 const std::string &mds_spec,
5553 std::vector<mds_gid_t> *targets)
5554{
11fdf7f2
TL
5555 ceph_assert(fsmap);
5556 ceph_assert(targets != nullptr);
7c673cae
FG
5557
5558 mds_role_t role;
5559 std::stringstream ss;
5560 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5561 if (role_r == 0) {
5562 // We got a role, resolve it to a GID
5563 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5564 << role << "'" << dendl;
5565 targets->push_back(
5566 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5567 return 0;
5568 }
5569
5570 std::string strtol_err;
5571 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5572 if (strtol_err.empty()) {
5573 // It is a possible GID
5574 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5575 if (fsmap->gid_exists(mds_gid)) {
5576 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5577 targets->push_back(mds_gid);
5578 } else {
5579 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5580 << dendl;
5581 return -ENOENT;
5582 }
5583 } else if (mds_spec == "*") {
5584 // It is a wildcard: use all MDSs
5585 const auto mds_info = fsmap->get_mds_info();
5586
5587 if (mds_info.empty()) {
5588 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5589 return -ENOENT;
5590 }
5591
5592 for (const auto i : mds_info) {
5593 targets->push_back(i.first);
5594 }
5595 } else {
5596 // It did not parse as an integer, it is not a wildcard, it must be a name
5597 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5598 if (mds_gid == 0) {
5599 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5600
5601 lderr(cct) << "FSMap: " << *fsmap << dendl;
5602
5603 return -ENOENT;
5604 } else {
5605 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5606 << "' to GID " << mds_gid << dendl;
5607 targets->push_back(mds_gid);
5608 }
5609 }
5610
5611 return 0;
5612}
5613
5614
/**
 * Authenticate with mon and establish global ID
 *
 * Must be called with client_lock held.  Returns 0 on success (including
 * when already authenticated) or a negative error from the mon client.
 */
int Client::authenticate()
{
  ceph_assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks; drop the client lock across the
  // call so dispatch can make progress, then re-take it.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // adopt the mon-assigned global id as our entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5638
/**
 * Fetch an up-to-date FSMap (or FSMapUser when 'user' is true) from the
 * monitors, subscribing and blocking until our cached copy is at least as
 * new as the latest version the monitors report.
 *
 * Called with client_lock held; the lock is dropped while waiting for the
 * version query.  Returns 0 on success or a negative error.
 */
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the lock while the mon round-trip completes
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe to the user-visible map and wait until it catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // subscribe to the full map and wait until it catches up
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5682
/**
 * Send an administrative command to one or more MDS daemons.
 *
 * @param mds_spec one of ID, rank, GID, "*"
 * @param cmd      command vector (as for `ceph tell`)
 * @param inbl     input payload
 * @param outbl    where to store reply data (may be shared across targets)
 * @param outs     where to store reply status string
 * @param onfinish completed once every targeted daemon has replied
 * @return 0 if the command was dispatched, negative errno otherwise
 *         (-ENOTCONN if not initialized, -ENOENT if no usable target).
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need the FSMap to translate the spec into daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state; replies are matched up by tid in
    // handle_command_reply()
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5769
11fdf7f2 5770void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5771{
5772 ceph_tid_t const tid = m->get_tid();
5773
5774 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5775
5776 if (!command_table.exists(tid)) {
5777 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5778 return;
5779 }
5780
5781 auto &op = command_table.get_command(tid);
5782 if (op.outbl) {
11fdf7f2 5783 *op.outbl = m->get_data();
7c673cae
FG
5784 }
5785 if (op.outs) {
5786 *op.outs = m->rs;
5787 }
5788
5789 if (op.on_finish) {
5790 op.on_finish->complete(m->r);
5791 }
5792
5793 command_table.erase(tid);
7c673cae
FG
5794}
5795
5796// -------------------
5797// MOUNT
5798
/**
 * Authenticate and subscribe to the MDSMap for the target filesystem.
 *
 * @param fs_name filesystem to mount; when empty, falls back to the
 *                'client_mds_namespace' config option, and when that is
 *                also empty, subscribes to the plain (legacy) "mdsmap".
 * @return 0 on success, -ENOENT if the named filesystem does not exist,
 *         or a negative auth/fsmap error.
 */
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // explicit argument wins over the config option
  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // need the user fsmap to translate the name into a cluster id
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    // subscribe to the per-filesystem map, e.g. "mdsmap.<fscid>"
    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5835
/**
 * Mount the filesystem.
 *
 * @param mount_root  path within the fs to use as the local root ("" => "/")
 * @param perms       credentials used for the initial getattr walk
 * @param require_mds if true, wait for the MDS cluster to become
 *                    available (or fail with CEPH_FUSE_NO_MDS_UP if it
 *                    is stuck unavailable) before proceeding
 * @param fs_name     filesystem name, forwarded to subscribe_mdsmap()
 * @return 0 on success (idempotent if already mounted), negative errno
 *         or CEPH_FUSE_NO_MDS_UP on failure.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // Block until the MDS cluster reaches a usable state; the mdsmap
    // waiters are signalled whenever a new map arrives.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point and then each of its ancestors in turn
  // (popping one dentry per iteration) so that quota/ancestor info is
  // cached.  EACCES on an ancestor is tolerated once the root inode is
  // known, at the cost of possibly broken quota accounting.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5929
5930// UNMOUNT
5931
/**
 * Close every MDS session and wait until they are all gone.
 *
 * Repeatedly requests closure of any session not already in
 * STATE_CLOSING, then blocks on mount_cond (which releases client_lock
 * while waiting); the loop re-checks because new sessions may appear or
 * close notifications may arrive between passes.  Caller must hold
 * client_lock.
 */
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
        _close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5947
31f18b77
FG
/**
 * Ask every MDS to flush its metadata log, but only if there are
 * outstanding client requests whose safe-reply could be unblocked by
 * the flush.  No-op when mds_requests is empty.
 */
void Client::flush_mdlog_sync()
{
  if (mds_requests.empty())
    return;
  for (auto &p : mds_sessions) {
    flush_mdlog(&p.second);
  }
}
5956
/**
 * Send a CEPH_SESSION_REQUEST_FLUSH_MDLOG message on one MDS session.
 *
 * @param session  target session; its connection's negotiated features
 *                 gate whether the message is sent at all.
 */
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
5967
5968
11fdf7f2
TL
/**
 * Abort all in-flight MDS requests with the given error and force-close
 * every MDS session (used when tearing down after blacklisting or an
 * explicit abort).
 *
 * @param err  negative errno delivered to each aborted request's waiter
 *             (e.g. -ENOTCONN).
 */
void Client::_abort_mds_sessions(int err)
{
  // Advance the iterator before touching the request: aborting may lead
  // to the entry being removed from mds_requests.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  // (_closed_mds_session erases the entry, so always take begin()).
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
5996
/**
 * Core unmount path, shared by unmount() and abort_conn().
 *
 * @param abort  if true (or if the client is blacklisted), skip the
 *               orderly flush of dirty data/caps: abort requests,
 *               cancel in-flight writes and purge cached data instead.
 *
 * The teardown order matters: requests are drained first, then open
 * files/dirs are forcibly released, then buffered data is flushed (or
 * purged), then the inode cache is emptied, and only finally are the
 * MDS sessions closed.  Caller must hold client_lock; several steps
 * wait on mount_cond (releasing the lock while blocked).
 */
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for requests to finish (signalled via mount_cond)
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for handles opened through the low-level (ll_) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        // data cannot be written back safely; just drop it
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // dirty caps can never be flushed now; mark clean and drop the pins
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
        ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
        in->mark_caps_clean();
        put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for remaining inodes to drain; dump the cache for debugging if
  // a 5s wait times out (something is still holding caps/refs)
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6128
b32b8144
FG
/// Orderly unmount: flush dirty data and caps before tearing down.
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}
6134
/// Abortive unmount: drop dirty state and cancel in-flight I/O
/// (used e.g. when the connection to the cluster is known dead).
void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
6140
7c673cae
FG
/**
 * Send any batched cap-release messages to their MDSs.
 *
 * A session's pending release message is only sent while its MDS is in
 * clientreplay/active/stopping; the client_inject_release_failure conf
 * option simulates a lost message for testing.  In both cases the
 * pending message is cleared afterwards.
 */
void Client::flush_cap_releases()
{
  // send any cap releases
  for (auto &p : mds_sessions) {
    auto &session = p.second;
    if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      if (cct->_conf->client_inject_release_failure) {
        ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
        session.con->send_message2(std::move(session.release));
      }
      // reset() is still needed: the injected-failure path never moved
      // the message out, and a moved-from ref is cleared either way.
      session.release.reset();
    }
  }
}
6157
/**
 * Periodic housekeeping, re-armed every client_tick_interval seconds.
 *
 * Responsibilities per tick:
 *  - abort the oldest pending request with -ETIMEDOUT if we still are
 *    not mounted after client_mount_timeout;
 *  - renew MDS capabilities when a third of the session timeout has
 *    elapsed, and push out batched cap releases;
 *  - upgrade delayed caps whose hold period has expired;
 *  - trim the inode/dentry cache.
 *
 * Runs under client_lock (the Timer callback acquires it for us).
 */
void Client::tick()
{
  // test hook: one-shot artificial delay, then self-clearing the option
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm ourselves before doing any work
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // mount still in progress: time out the oldest request if it has
    // exceeded client_mount_timeout, and wake everyone waiting on it
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  // delayed_list is ordered by hold_caps_until, so stop at the first
  // entry that is still being held
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6213
/**
 * Request capability renewal from every MDS whose session is with a
 * daemon that has reached at least STATE_REJOIN, and record the time of
 * this renewal round in last_cap_renew.
 */
void Client::renew_caps()
{
  ldout(cct, 10) << "renew_caps()" << dendl;
  last_cap_renew = ceph_clock_now();

  for (auto &p : mds_sessions) {
    ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
    if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
      renew_caps(&p.second);
  }
}
6225
/**
 * Send a RENEWCAPS request on one session, stamping the request time
 * and bumping the session's renew sequence number so the reply can be
 * matched to this request.
 */
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
6233
6234
6235// ===============================================================
6236// high level (POSIXy) interface
6237
/**
 * Issue a LOOKUP (or LOOKUPSNAP for the snapdir) request to the MDS for
 * one name in a directory.
 *
 * @param dir    parent directory inode
 * @param name   dentry name to look up
 * @param mask   getattr caps mask to request on the target
 * @param target out: resolved inode on success
 * @param perms  credentials for the request
 * @return result of make_request() (0 or negative errno)
 */
int Client::_do_lookup(Inode *dir, const string& name, int mask,
		       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // debug option: ask for extra caps so replies can be cross-checked
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
6258
/**
 * Look up one name in a directory, using the local dcache when the
 * cached dentry is provably still valid, and falling back to an MDS
 * round trip (_do_lookup) otherwise.
 *
 * Local fast paths:
 *  - ".." resolves via the cached parent link, or a LOOKUPPARENT
 *    request when the inode has no cached dentries;
 *  - "." resolves to the directory itself;
 *  - the configured snapdir name resolves to the synthetic snapdir;
 *  - a cached dentry is trusted if either its MDS lease is valid or the
 *    directory holds FILE_SHARED caps with a matching shared_gen;
 *  - with FILE_SHARED + I_COMPLETE, a missing dentry proves ENOENT
 *    without contacting the MDS.
 *
 * @param dir    parent directory inode
 * @param dname  name to resolve
 * @param mask   caps mask the caller needs on the target inode
 * @param target out: resolved inode
 * @param perms  credentials
 * @return 0 on success; -ENOTDIR, -ENAMETOOLONG, -ENOENT, or the MDS
 *         request result on failure.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no cached parent link: ask an (arbitrary) MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// lookup failed; fall back to the directory itself
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    // only trust the cached dentry if the caller's cap mask is already
    // satisfied on the target inode (or the dentry is negative)
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  // complete dir + negative dentry == authoritative ENOENT
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or unverifiable cache entry: go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6375
/**
 * Find the dentry for `name` in `dir`, creating a fresh (null) dentry
 * if none is cached.
 *
 * @param dir         parent directory inode (its Dir is opened if needed)
 * @param name        dentry name
 * @param pdn         out: the found or newly linked dentry
 * @param expect_null if true, fail with -EEXIST when a cached dentry
 *                    with a valid MDS lease proves the name exists
 * @return 0 on success, -EEXIST as described above.
 */
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    // (a valid lease on a non-null dentry proves existence without an
    // MDS round trip)
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      if (s.cap_ttl > now &&
	  s.cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6407
/**
 * Resolve a path, component by component, to an inode.
 *
 * @param origpath  path to resolve (absolute paths start at root,
 *                  relative ones at cwd)
 * @param end       out: resolved inode (may be null if caller only
 *                  wants the walk's result code)
 * @param perms     credentials; may_lookup() is checked per component
 *                  when client_permissions is enabled
 * @param followsym whether to follow a symlink in the *last* component
 *                  (intermediate "directory" symlinks are always
 *                  followed)
 * @param mask      extra caps requested on the final component only
 * @return 0 on success; -ELOOP after MAXSYMLINKS expansions, -ENOENT if
 *         resolution ends on nothing, or the first component error.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;  // absolute target: restart from root
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6489
6490
6491// namespace ops
6492
/**
 * Create a hard link `relpath` pointing at the inode of `relexisting`.
 *
 * @param relexisting  existing file (trailing symlinks are followed)
 * @param relpath      path of the new link
 * @param perm         credentials; with client_permissions, directories
 *                     are rejected (-EPERM) and hardlink/create perms
 *                     are checked
 * @return 0 on success; -ENOTCONN while unmounting, -EEXIST for "/",
 *         or a path-walk/permission/_link error.
 */
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  // split the new path into parent dir + final dentry name
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      // hard links to directories are not allowed
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
6535
6536int Client::unlink(const char *relpath, const UserPerm& perm)
6537{
11fdf7f2
TL
6538 std::lock_guard lock(client_lock);
6539 tout(cct) << __func__ << std::endl;
7c673cae
FG
6540 tout(cct) << relpath << std::endl;
6541
181888fb
FG
6542 if (unmounting)
6543 return -ENOTCONN;
6544
7c673cae
FG
6545 if (std::string(relpath) == "/")
6546 return -EISDIR;
6547
6548 filepath path(relpath);
6549 string name = path.last_dentry();
6550 path.pop_dentry();
6551 InodeRef dir;
6552 int r = path_walk(path, &dir, perm);
6553 if (r < 0)
6554 return r;
6555 if (cct->_conf->client_permissions) {
6556 r = may_delete(dir.get(), name.c_str(), perm);
6557 if (r < 0)
6558 return r;
6559 }
6560 return _unlink(dir.get(), name.c_str(), perm);
6561}
6562
6563int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6564{
11fdf7f2
TL
6565 std::lock_guard lock(client_lock);
6566 tout(cct) << __func__ << std::endl;
7c673cae
FG
6567 tout(cct) << relfrom << std::endl;
6568 tout(cct) << relto << std::endl;
6569
181888fb
FG
6570 if (unmounting)
6571 return -ENOTCONN;
6572
7c673cae
FG
6573 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6574 return -EBUSY;
6575
6576 filepath from(relfrom);
6577 filepath to(relto);
6578 string fromname = from.last_dentry();
6579 from.pop_dentry();
6580 string toname = to.last_dentry();
6581 to.pop_dentry();
6582
6583 InodeRef fromdir, todir;
6584 int r = path_walk(from, &fromdir, perm);
6585 if (r < 0)
6586 goto out;
6587 r = path_walk(to, &todir, perm);
6588 if (r < 0)
6589 goto out;
6590
6591 if (cct->_conf->client_permissions) {
6592 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6593 if (r < 0)
6594 return r;
6595 r = may_delete(todir.get(), toname.c_str(), perm);
6596 if (r < 0 && r != -ENOENT)
6597 return r;
6598 }
6599 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6600out:
6601 return r;
6602}
6603
6604// dirs
6605
6606int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6607{
11fdf7f2
TL
6608 std::lock_guard lock(client_lock);
6609 tout(cct) << __func__ << std::endl;
7c673cae
FG
6610 tout(cct) << relpath << std::endl;
6611 tout(cct) << mode << std::endl;
11fdf7f2 6612 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6613
181888fb
FG
6614 if (unmounting)
6615 return -ENOTCONN;
6616
7c673cae
FG
6617 if (std::string(relpath) == "/")
6618 return -EEXIST;
6619
6620 filepath path(relpath);
6621 string name = path.last_dentry();
6622 path.pop_dentry();
6623 InodeRef dir;
6624 int r = path_walk(path, &dir, perm);
6625 if (r < 0)
6626 return r;
6627 if (cct->_conf->client_permissions) {
6628 r = may_create(dir.get(), perm);
6629 if (r < 0)
6630 return r;
6631 }
6632 return _mkdir(dir.get(), name.c_str(), mode, perm);
6633}
6634
6635int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6636{
11fdf7f2 6637 std::lock_guard lock(client_lock);
7c673cae 6638 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
11fdf7f2 6639 tout(cct) << __func__ << std::endl;
7c673cae
FG
6640 tout(cct) << relpath << std::endl;
6641 tout(cct) << mode << std::endl;
6642
181888fb
FG
6643 if (unmounting)
6644 return -ENOTCONN;
6645
7c673cae
FG
6646 //get through existing parts of path
6647 filepath path(relpath);
6648 unsigned int i;
6649 int r = 0, caps = 0;
6650 InodeRef cur, next;
6651 cur = cwd;
6652 for (i=0; i<path.depth(); ++i) {
6653 if (cct->_conf->client_permissions) {
6654 r = may_lookup(cur.get(), perms);
6655 if (r < 0)
6656 break;
6657 caps = CEPH_CAP_AUTH_SHARED;
6658 }
6659 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6660 if (r < 0)
6661 break;
6662 cur.swap(next);
6663 }
7c673cae 6664 if (r!=-ENOENT) return r;
11fdf7f2 6665 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7c673cae
FG
6666 //make new directory at each level
6667 for (; i<path.depth(); ++i) {
6668 if (cct->_conf->client_permissions) {
6669 r = may_create(cur.get(), perms);
6670 if (r < 0)
6671 return r;
6672 }
6673 //make new dir
6674 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
c07f9fc5 6675
7c673cae 6676 //check proper creation/existence
c07f9fc5
FG
6677 if(-EEXIST == r && i < path.depth() - 1) {
6678 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6679 }
6680 if (r < 0)
6681 return r;
7c673cae
FG
6682 //move to new dir and continue
6683 cur.swap(next);
11fdf7f2 6684 ldout(cct, 20) << __func__ << ": successfully created directory "
7c673cae
FG
6685 << filepath(cur->ino).get_path() << dendl;
6686 }
6687 return 0;
6688}
6689
6690int Client::rmdir(const char *relpath, const UserPerm& perms)
6691{
11fdf7f2
TL
6692 std::lock_guard lock(client_lock);
6693 tout(cct) << __func__ << std::endl;
7c673cae
FG
6694 tout(cct) << relpath << std::endl;
6695
181888fb
FG
6696 if (unmounting)
6697 return -ENOTCONN;
6698
7c673cae
FG
6699 if (std::string(relpath) == "/")
6700 return -EBUSY;
6701
6702 filepath path(relpath);
6703 string name = path.last_dentry();
6704 path.pop_dentry();
6705 InodeRef dir;
6706 int r = path_walk(path, &dir, perms);
6707 if (r < 0)
6708 return r;
6709 if (cct->_conf->client_permissions) {
6710 int r = may_delete(dir.get(), name.c_str(), perms);
6711 if (r < 0)
6712 return r;
6713 }
6714 return _rmdir(dir.get(), name.c_str(), perms);
6715}
6716
6717int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6718{
11fdf7f2
TL
6719 std::lock_guard lock(client_lock);
6720 tout(cct) << __func__ << std::endl;
7c673cae
FG
6721 tout(cct) << relpath << std::endl;
6722 tout(cct) << mode << std::endl;
6723 tout(cct) << rdev << std::endl;
6724
181888fb
FG
6725 if (unmounting)
6726 return -ENOTCONN;
6727
7c673cae
FG
6728 if (std::string(relpath) == "/")
6729 return -EEXIST;
6730
6731 filepath path(relpath);
6732 string name = path.last_dentry();
6733 path.pop_dentry();
6734 InodeRef dir;
6735 int r = path_walk(path, &dir, perms);
6736 if (r < 0)
6737 return r;
6738 if (cct->_conf->client_permissions) {
6739 int r = may_create(dir.get(), perms);
6740 if (r < 0)
6741 return r;
6742 }
6743 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6744}
6745
6746// symlinks
6747
6748int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6749{
11fdf7f2
TL
6750 std::lock_guard lock(client_lock);
6751 tout(cct) << __func__ << std::endl;
7c673cae
FG
6752 tout(cct) << target << std::endl;
6753 tout(cct) << relpath << std::endl;
6754
181888fb
FG
6755 if (unmounting)
6756 return -ENOTCONN;
6757
7c673cae
FG
6758 if (std::string(relpath) == "/")
6759 return -EEXIST;
6760
6761 filepath path(relpath);
6762 string name = path.last_dentry();
6763 path.pop_dentry();
6764 InodeRef dir;
6765 int r = path_walk(path, &dir, perms);
6766 if (r < 0)
6767 return r;
6768 if (cct->_conf->client_permissions) {
6769 int r = may_create(dir.get(), perms);
6770 if (r < 0)
6771 return r;
6772 }
6773 return _symlink(dir.get(), name.c_str(), target, perms);
6774}
6775
6776int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6777{
11fdf7f2
TL
6778 std::lock_guard lock(client_lock);
6779 tout(cct) << __func__ << std::endl;
7c673cae
FG
6780 tout(cct) << relpath << std::endl;
6781
181888fb
FG
6782 if (unmounting)
6783 return -ENOTCONN;
6784
7c673cae
FG
6785 filepath path(relpath);
6786 InodeRef in;
6787 int r = path_walk(path, &in, perms, false);
6788 if (r < 0)
6789 return r;
6790
6791 return _readlink(in.get(), buf, size);
6792}
6793
6794int Client::_readlink(Inode *in, char *buf, size_t size)
6795{
6796 if (!in->is_symlink())
6797 return -EINVAL;
6798
6799 // copy into buf (at most size bytes)
6800 int r = in->symlink.length();
6801 if (r > (int)size)
6802 r = size;
6803 memcpy(buf, in->symlink.c_str(), r);
6804 return r;
6805}
6806
6807
6808// inode stuff
6809
6810int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6811{
94b18763 6812 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6813
11fdf7f2 6814 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6815 if (yes && !force)
6816 return 0;
6817
6818 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6819 filepath path;
6820 in->make_nosnap_relative_path(path);
6821 req->set_filepath(path);
6822 req->set_inode(in);
6823 req->head.args.getattr.mask = mask;
6824
6825 int res = make_request(req, perms);
11fdf7f2 6826 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6827 return res;
6828}
6829
6830int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6831 const UserPerm& perms, InodeRef *inp)
6832{
6833 int issued = in->caps_issued();
6834
11fdf7f2 6835 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6836 ccap_string(issued) << dendl;
6837
6838 if (in->snapid != CEPH_NOSNAP) {
6839 return -EROFS;
6840 }
6841 if ((mask & CEPH_SETATTR_SIZE) &&
6842 (unsigned long)stx->stx_size > in->size &&
6843 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6844 perms)) {
6845 return -EDQUOT;
6846 }
6847
6848 // make the change locally?
6849 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6850 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6851 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6852 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6853 << in->cap_dirtier_gid << ", forcing sync setattr"
6854 << dendl;
6855 /*
6856 * This works because we implicitly flush the caps as part of the
6857 * request, so the cap update check will happen with the writeback
6858 * cap context, and then the setattr check will happen with the
6859 * caller's context.
6860 *
6861 * In reality this pattern is likely pretty rare (different users
6862 * setattr'ing the same file). If that turns out not to be the
6863 * case later, we can build a more complex pipelined cap writeback
6864 * infrastructure...
6865 */
6866 if (!mask)
6867 mask |= CEPH_SETATTR_CTIME;
6868 goto force_request;
6869 }
6870
6871 if (!mask) {
6872 // caller just needs us to bump the ctime
6873 in->ctime = ceph_clock_now();
6874 in->cap_dirtier_uid = perms.uid();
6875 in->cap_dirtier_gid = perms.gid();
6876 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 6877 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 6878 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 6879 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 6880 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 6881 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
6882 else
6883 mask |= CEPH_SETATTR_CTIME;
6884 }
6885
6886 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6887 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6888
6889 mask &= ~CEPH_SETATTR_KILL_SGUID;
6890
6891 if (mask & CEPH_SETATTR_UID) {
6892 in->ctime = ceph_clock_now();
6893 in->cap_dirtier_uid = perms.uid();
6894 in->cap_dirtier_gid = perms.gid();
6895 in->uid = stx->stx_uid;
28e407b8 6896 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6897 mask &= ~CEPH_SETATTR_UID;
6898 kill_sguid = true;
6899 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6900 }
6901 if (mask & CEPH_SETATTR_GID) {
6902 in->ctime = ceph_clock_now();
6903 in->cap_dirtier_uid = perms.uid();
6904 in->cap_dirtier_gid = perms.gid();
6905 in->gid = stx->stx_gid;
28e407b8 6906 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6907 mask &= ~CEPH_SETATTR_GID;
6908 kill_sguid = true;
6909 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6910 }
6911
6912 if (mask & CEPH_SETATTR_MODE) {
6913 in->ctime = ceph_clock_now();
6914 in->cap_dirtier_uid = perms.uid();
6915 in->cap_dirtier_gid = perms.gid();
6916 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 6917 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6918 mask &= ~CEPH_SETATTR_MODE;
6919 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6920 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6921 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6922 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 6923 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6924 }
6925
6926 if (mask & CEPH_SETATTR_BTIME) {
6927 in->ctime = ceph_clock_now();
6928 in->cap_dirtier_uid = perms.uid();
6929 in->cap_dirtier_gid = perms.gid();
6930 in->btime = utime_t(stx->stx_btime);
28e407b8 6931 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6932 mask &= ~CEPH_SETATTR_BTIME;
6933 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6934 }
6935 } else if (mask & CEPH_SETATTR_SIZE) {
6936 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6937 mask |= CEPH_SETATTR_KILL_SGUID;
6938 }
6939
6940 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6941 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6942 if (mask & CEPH_SETATTR_MTIME)
6943 in->mtime = utime_t(stx->stx_mtime);
6944 if (mask & CEPH_SETATTR_ATIME)
6945 in->atime = utime_t(stx->stx_atime);
6946 in->ctime = ceph_clock_now();
6947 in->cap_dirtier_uid = perms.uid();
6948 in->cap_dirtier_gid = perms.gid();
6949 in->time_warp_seq++;
28e407b8 6950 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
6951 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6952 }
6953 }
6954 if (!mask) {
6955 in->change_attr++;
6956 return 0;
6957 }
6958
6959force_request:
6960 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6961
6962 filepath path;
6963
6964 in->make_nosnap_relative_path(path);
6965 req->set_filepath(path);
6966 req->set_inode(in);
6967
6968 if (mask & CEPH_SETATTR_KILL_SGUID) {
6969 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6970 }
6971 if (mask & CEPH_SETATTR_MODE) {
6972 req->head.args.setattr.mode = stx->stx_mode;
6973 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6974 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6975 }
6976 if (mask & CEPH_SETATTR_UID) {
6977 req->head.args.setattr.uid = stx->stx_uid;
6978 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6979 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6980 }
6981 if (mask & CEPH_SETATTR_GID) {
6982 req->head.args.setattr.gid = stx->stx_gid;
6983 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6984 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6985 }
6986 if (mask & CEPH_SETATTR_BTIME) {
6987 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6988 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6989 }
6990 if (mask & CEPH_SETATTR_MTIME) {
6991 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 6992 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
6993 CEPH_CAP_FILE_WR;
6994 }
6995 if (mask & CEPH_SETATTR_ATIME) {
6996 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6997 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6998 CEPH_CAP_FILE_WR;
6999 }
7000 if (mask & CEPH_SETATTR_SIZE) {
7001 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7002 req->head.args.setattr.size = stx->stx_size;
7003 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7004 } else { //too big!
7005 put_request(req);
7006 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7007 return -EFBIG;
7008 }
94b18763 7009 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7010 CEPH_CAP_FILE_WR;
7011 }
7012 req->head.args.setattr.mask = mask;
7013
7014 req->regetattr_mask = mask;
7015
7016 int res = make_request(req, perms, inp);
7017 ldout(cct, 10) << "_setattr result=" << res << dendl;
7018 return res;
7019}
7020
/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  // Copy only the fields _setattrx() consumes; other statx fields are
  // deliberately left untouched.
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS names the timespec members st_mtimespec/st_atimespec.
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7036
// Apply an attribute change to an inode; if the mode changed, also rewrite
// any POSIX ACLs so their permission bits stay consistent with the new mode.
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
		       const UserPerm& perms, InodeRef *inp)
{
  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
7047
// Sanitize the setattr mask, optionally enforce client-side permission
// checks (client_permissions conf), then perform the attribute change.
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  // Restrict the mask to attributes setattr understands.
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
7062
7063int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7064 const UserPerm& perms)
7065{
7066 struct ceph_statx stx;
7067
7068 stat_to_statx(attr, &stx);
7069 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7070
7071 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7072 mask &= ~CEPH_SETATTR_UID;
7073 }
7074 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7075 mask &= ~CEPH_SETATTR_GID;
7076 }
7077
7c673cae
FG
7078 return _setattrx(in, &stx, mask, perms);
7079}
7080
// Public setattr entry point: resolve relpath (following symlinks) and
// apply attribute changes, all under the client lock.
int Client::setattr(const char *relpath, struct stat *attr, int mask,
		    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
7099
// statx-flavored setattr entry point. AT_SYMLINK_NOFOLLOW in flags makes
// the path walk stop at a trailing symlink instead of following it.
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
		     const UserPerm& perms, int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
7118
// fd-based setattr. O_PATH handles (Linux) cannot modify attributes,
// mirroring kernel behavior.
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
7138
// fd-based statx-flavored setattr; same O_PATH restriction as fsetattr().
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
7158
// stat(2) analogue: walk the path (following symlinks), refresh attributes
// from the MDS as required by mask, and fill the caller's struct stat.
// Optionally returns the directory fragstat via dirstat.
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7184
7185unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7186{
7187 unsigned mask = 0;
7188
7189 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7190 if (flags & AT_NO_ATTR_SYNC)
7191 goto out;
7192
7193 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7194 mask |= CEPH_CAP_PIN;
7195 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7196 mask |= CEPH_CAP_AUTH_SHARED;
7197 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7198 mask |= CEPH_CAP_LINK_SHARED;
7199 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7200 mask |= CEPH_CAP_FILE_SHARED;
7201 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7202 mask |= CEPH_CAP_XATTR_SHARED;
7203out:
7204 return mask;
7205}
7206
// statx(2) analogue: honor AT_SYMLINK_NOFOLLOW and AT_NO_ATTR_SYNC,
// fetch only the caps implied by 'want', and fill the caller's ceph_statx.
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
7238
// lstat(2) analogue: identical to stat() except the final path component
// is never followed if it is a symlink.
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7265
// Populate a struct stat from cached inode state. Optionally copies out the
// directory frag/nest stats. Returns the caps currently issued on the inode
// so callers can judge how fresh these values are.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapid doubles as the device number so snapshots appear as
  // distinct filesystems.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directory link count is synthesized: CephFS doesn't track it directly.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
          + 1 /* <dir>/. */
          + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime is reported as the later of ctime and mtime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory "size": recursive bytes or entry count, per conf.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7327
// Populate a ceph_statx from cached inode state. 'mask' is the set of caps
// that back the request; stx_mask is set to reflect which fields were
// actually filled.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    // Directory link count is synthesized; see fill_stat() for rationale.
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
            + 1 /* <dir>/. */
            + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // ctime is reported as the later of ctime and mtime.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7410
// Move a dentry to the hot end of the LRU so it is evicted last.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7415
// chmod(2) analogue: resolve relpath (following symlinks) and set the mode.
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  // Only CEPH_SETATTR_MODE is requested, so the other stat fields may
  // stay uninitialized.
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7435
// fchmod(2) analogue; O_PATH handles (Linux) are rejected.
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
7457
// lchmod analogue: like chmod() but operates on a trailing symlink itself
// rather than its target.
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7478
7479int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7480 const UserPerm& perms)
7481{
11fdf7f2
TL
7482 std::lock_guard lock(client_lock);
7483 tout(cct) << __func__ << std::endl;
7c673cae
FG
7484 tout(cct) << relpath << std::endl;
7485 tout(cct) << new_uid << std::endl;
7486 tout(cct) << new_gid << std::endl;
181888fb
FG
7487
7488 if (unmounting)
7489 return -ENOTCONN;
7490
7c673cae
FG
7491 filepath path(relpath);
7492 InodeRef in;
7493 int r = path_walk(path, &in, perms);
7494 if (r < 0)
7495 return r;
7496 struct stat attr;
7497 attr.st_uid = new_uid;
7498 attr.st_gid = new_gid;
181888fb 7499 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7500}
7501
// fchown(2) analogue; -1 ids are excluded from the setattr mask, and
// O_PATH handles (Linux) are rejected.
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
7528
// lchown(2) analogue: like chown() but does not follow a trailing symlink;
// -1 ids are excluded from the setattr mask.
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
7555
11fdf7f2
TL
// Helper: store atime/mtime into a struct stat via the portable stat_set_*
// accessors (the nanosecond member names differ across platforms).
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7565
7566// for [l]utime() invoke the timeval variant as the timespec
7567// variant are not yet implemented. for futime[s](), invoke
7568// the timespec variant.
7c673cae
FG
7569int Client::utime(const char *relpath, struct utimbuf *buf,
7570 const UserPerm& perms)
7571{
11fdf7f2
TL
7572 struct timeval tv[2];
7573 tv[0].tv_sec = buf->actime;
7574 tv[0].tv_usec = 0;
7575 tv[1].tv_sec = buf->modtime;
7576 tv[1].tv_usec = 0;
7577
7578 return utimes(relpath, tv, perms);
7579}
7580
7581int Client::lutime(const char *relpath, struct utimbuf *buf,
7582 const UserPerm& perms)
7583{
7584 struct timeval tv[2];
7585 tv[0].tv_sec = buf->actime;
7586 tv[0].tv_usec = 0;
7587 tv[1].tv_sec = buf->modtime;
7588 tv[1].tv_usec = 0;
7589
7590 return lutimes(relpath, tv, perms);
7591}
7592
7593int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7594{
7595 struct timespec ts[2];
7596 ts[0].tv_sec = buf->actime;
7597 ts[0].tv_nsec = 0;
7598 ts[1].tv_sec = buf->modtime;
7599 ts[1].tv_nsec = 0;
7600
7601 return futimens(fd, ts, perms);
7602}
7603
// utimes(2) analogue: resolve relpath (following symlinks) and set
// atime/mtime from the given timevals.
int Client::utimes(const char *relpath, struct timeval times[2],
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  // Only MTIME|ATIME are requested, so the rest of attr may stay
  // uninitialized.
  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7630
11fdf7f2
TL
// lutimes(2) analogue: like utimes() but does not follow a trailing symlink.
int Client::lutimes(const char *relpath, struct timeval times[2],
		    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow a trailing symlink
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7657
11fdf7f2
TL
7658int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7659{
7660 struct timespec ts[2];
7661 ts[0].tv_sec = times[0].tv_sec;
7662 ts[0].tv_nsec = times[0].tv_usec * 1000;
7663 ts[1].tv_sec = times[1].tv_sec;
7664 ts[1].tv_nsec = times[1].tv_usec * 1000;
7665
7666 return futimens(fd, ts, perms);
7667}
7668
// futimens(2) analogue: set atime/mtime on an open fd; O_PATH handles
// (Linux) are rejected.
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7696
7c673cae
FG
// flock(2) analogue: acquire/release an advisory lock on an open fd.
// 'owner' identifies the lock owner for conflict detection.
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7714
// opendir(3) analogue: resolve relpath, check read permission if
// client_permissions is enabled, and create a dir_result_t handle.
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7740
// Create a dir_result_t for an already-resolved inode and register it in
// opened_dirs. Fails with -ENOTDIR for non-directories, in which case
// *dirpp is left untouched.
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
7750
7751
// closedir(3) analogue: tear down a dir_result_t handle. Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7762
// Release everything owned by a dir_result_t: its inode reference, its
// readdir buffer, and its registration in opened_dirs; then delete it.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7774
// rewinddir(3) analogue: drop any buffered entries and reset the stream
// to the beginning.
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7787
// telldir(3) analogue: report the current stream offset.
// NOTE(review): unlike seekdir(), this does not take client_lock — it only
// reads a single field; confirm that is intentional.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7794
// seekdir(3) analogue: reposition the directory stream. Seeking may
// invalidate the buffered fragment, in which case the buffer is dropped
// and the stream state reset so the next readdir refetches.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered streams only need a refetch when seeking backwards.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered: refetch when leaving the buffered fragment or seeking
    // backwards within it.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7828
7829
7830//struct dirent {
7831// ino_t d_ino; /* inode number */
7832// off_t d_off; /* offset to the next dirent */
7833// unsigned short d_reclen; /* length of this record */
7834// unsigned char d_type; /* type of file */
7835// char d_name[256]; /* filename */
7836//};
// Populate a struct dirent for readdir. The name is truncated to 255 bytes
// and always NUL-terminated; d_off carries the next entry's offset where
// the platform supports it.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);  // convert S_IF* file type to DT_* dirent type
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7852
// Advance the directory stream to the next fragment, or mark it at END if
// the current fragment is the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; offset must never move backwards.
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7878
// Re-map the stream's current fragment through the (possibly updated)
// dirfragtree; if the tree now maps it elsewhere, restart the stream at
// the new fragment's beginning. No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7895
// Discard any buffered dentries so the next read refetches from the MDS.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7901
// Fetch one directory fragment's worth of entries from the MDS into
// dirp->buffer (via the READDIR/LSSNAP reply handling attached to the
// request).  On -EAGAIN the dirfragtree was stale: re-choose the frag
// and retry; any other error terminates the readdir stream.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  // the snapdir is listed with LSSNAP instead of READDIR
  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue the listing after the last returned name
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // stale dirfragtree: re-map the frag and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7956
// Comparator for binary-searching readdir_cache (sorted by dentry
// offset) against a raw fpos value with std::lower_bound.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7962
// Serve a readdir from the locally cached dentries.
//
// Walks dir->readdir_cache starting at dirp->offset, stats each entry
// with @caps and hands it to @cb.  Returns 0 at end of directory, a
// positive value if the callback asked to stop early, or a negative
// error; -EAGAIN means the cache became unusable mid-walk and the
// caller must fall back to fetching fragments from the MDS.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(client_lock.is_locked());
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at or after the current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // the cache is only authoritative while the dir stays complete+ordered
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the client lock around the callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8052
// Core readdir loop: feed directory entries to @cb starting at
// dirp->offset.
//
// Synthesizes "." (offset 0) and ".." (offset 1), then serves the
// remaining entries either from the local dentry cache (when the dir
// is known complete+ordered and FILE_SHARED caps are held) or by
// fetching fragments from the MDS.  Stops early when @cb returns
// non-zero.  When a full MDS sweep completes with the cache still
// valid, marks the directory I_COMPLETE (and I_DIR_ORDERED) so later
// readdirs can be served locally.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the client lock around the callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".."
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    // with no parent dentry (root), ".." refers to the dir itself
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache invalidated mid-walk; fall back to the MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;  // reply is fresh; skip the per-entry getattr
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // current frag has more entries than fit in one reply
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // full sweep finished; if nothing changed under us, the local cache
    // now covers the whole directory
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached
  return 0;
}
8245
8246
// Legacy readdir_r-style wrapper: fill @de only, no statx/inode ref.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8251
8252/*
8253 * readdirplus_r
8254 *
8255 * returns
8256 * 1 if we got a dirent
8257 * 0 for end of directory
8258 * <0 on error
8259 */
8260
// Result holder for the "capture exactly one entry" callback used by
// readdir()/readdirplus_r().
struct single_readdir {
  struct dirent *de;       // caller's dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // entry's inode when a reference was requested
  bool full;               // set once an entry has been captured
};
8267
8268static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8269 struct ceph_statx *stx, off_t off,
8270 Inode *in)
8271{
8272 single_readdir *c = static_cast<single_readdir *>(p);
8273
8274 if (c->full)
8275 return -1; // already filled this dirent
8276
8277 *c->de = *de;
8278 if (c->stx)
8279 *c->stx = *stx;
8280 c->inode = in;
8281 c->full = true;
8282 return 1;
8283}
8284
8285struct dirent *Client::readdir(dir_result_t *d)
8286{
8287 int ret;
8288 static struct dirent de;
8289 single_readdir sr;
8290 sr.de = &de;
8291 sr.stx = NULL;
8292 sr.inode = NULL;
8293 sr.full = false;
8294
8295 // our callback fills the dirent and sets sr.full=true on first
8296 // call, and returns -1 the second time around.
8297 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8298 if (ret < -1) {
8299 errno = -ret; // this sucks.
8300 return (dirent *) NULL;
8301 }
8302 if (sr.full) {
8303 return &de;
8304 }
8305 return (dirent *) NULL;
8306}
8307
8308int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8309 struct ceph_statx *stx, unsigned want,
8310 unsigned flags, Inode **out)
8311{
8312 single_readdir sr;
8313 sr.de = de;
8314 sr.stx = stx;
8315 sr.inode = NULL;
8316 sr.full = false;
8317
8318 // our callback fills the dirent and sets sr.full=true on first
8319 // call, and returns -1 the second time around.
8320 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8321 if (r < -1)
8322 return r;
8323 if (out)
8324 *out = sr.inode;
8325 if (sr.full)
8326 return 1;
8327 return 0;
8328}
8329
8330
8331/* getdents */
// Accumulator for _getdents(): packs entries into a flat byte buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: just NUL-terminated names
};
8338
8339static int _readdir_getdent_cb(void *p, struct dirent *de,
8340 struct ceph_statx *stx, off_t off, Inode *in)
8341{
8342 struct getdents_result *c = static_cast<getdents_result *>(p);
8343
8344 int dlen;
8345 if (c->fullent)
8346 dlen = sizeof(*de);
8347 else
8348 dlen = strlen(de->d_name) + 1;
8349
8350 if (c->pos + dlen > c->buflen)
8351 return -1; // doesn't fit
8352
8353 if (c->fullent) {
8354 memcpy(c->buf + c->pos, de, sizeof(*de));
8355 } else {
8356 memcpy(c->buf + c->pos, de->d_name, dlen);
8357 }
8358 c->pos += dlen;
8359 return 0;
8360}
8361
8362int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8363{
8364 getdents_result gr;
8365 gr.buf = buf;
8366 gr.buflen = buflen;
8367 gr.fullent = fullent;
8368 gr.pos = 0;
8369
8370 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8371
8372 if (r < 0) { // some error
8373 if (r == -1) { // buffer ran out of space
8374 if (gr.pos) { // but we got some entries already!
8375 return gr.pos;
8376 } // or we need a larger buffer
8377 return -ERANGE;
8378 } else { // actual error, return it
8379 return r;
8380 }
8381 }
8382 return gr.pos;
8383}
8384
8385
8386/* getdir */
// Accumulator for getdir(): collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // caller-owned list receiving the names
  int num;                 // number of entries collected
};
8391
// readdir_r_cb callback for getdir(): push every entry name onto the
// caller-supplied list and count the entries seen.
static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
{
  getdir_result *r = static_cast<getdir_result *>(p);

  r->contents->push_back(de->d_name);
  r->num++;
  return 0;
}
8400
8401int Client::getdir(const char *relpath, list<string>& contents,
8402 const UserPerm& perms)
8403{
8404 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8405 {
11fdf7f2 8406 std::lock_guard lock(client_lock);
7c673cae
FG
8407 tout(cct) << "getdir" << std::endl;
8408 tout(cct) << relpath << std::endl;
8409 }
8410
8411 dir_result_t *d;
8412 int r = opendir(relpath, &d, perms);
8413 if (r < 0)
8414 return r;
8415
8416 getdir_result gr;
8417 gr.contents = &contents;
8418 gr.num = 0;
8419 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8420
8421 closedir(d);
8422
8423 if (r < 0)
8424 return r;
8425 return gr.num;
8426}
8427
8428
8429/****** file i/o **********/
/*
 * Open (and optionally create) @relpath and allocate an integer file
 * descriptor for it.
 *
 * @mode and the striping parameters are only used when O_CREAT
 * actually creates the file.  Returns the new fd, or a negative error
 * (-EEXIST for O_CREAT|O_EXCL on an existing target, -ELOOP for
 * O_NOFOLLOW on a symlink, -ENOTCONN while unmounting, ...).
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing: walk to the parent dir and create the entry there
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8514
// Convenience overload: open without custom striping.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8520
8521int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8522 const UserPerm& perms)
8523{
11fdf7f2
TL
8524 std::lock_guard lock(client_lock);
8525 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8526
181888fb
FG
8527 if (unmounting)
8528 return -ENOTCONN;
8529
7c673cae
FG
8530 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8531 filepath path(ino);
8532 req->set_filepath(path);
8533
8534 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8535 char f[30];
8536 sprintf(f, "%u", h);
8537 filepath path2(dirino);
8538 path2.push_dentry(string(f));
8539 req->set_filepath2(path2);
8540
8541 int r = make_request(req, perms, NULL, NULL,
8542 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8543 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8544 return r;
8545}
8546
8547
8548/**
8549 * Load inode into local cache.
8550 *
8551 * If inode pointer is non-NULL, and take a reference on
8552 * the resulting Inode object in one operation, so that caller
8553 * can safely assume inode will still be there after return.
8554 */
1adf2230 8555int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8556{
11fdf7f2 8557 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
7c673cae 8558
181888fb
FG
8559 if (unmounting)
8560 return -ENOTCONN;
8561
7c673cae
FG
8562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8563 filepath path(ino);
8564 req->set_filepath(path);
8565
8566 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8567 if (r == 0 && inode != NULL) {
8568 vinodeno_t vino(ino, CEPH_NOSNAP);
8569 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 8570 ceph_assert(p != inode_map.end());
7c673cae
FG
8571 *inode = p->second;
8572 _ll_get(*inode);
8573 }
11fdf7f2 8574 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8575 return r;
8576}
8577
1adf2230
AA
// Public wrapper: take client_lock, then perform the lookup.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8583
8584/**
8585 * Find the parent inode of `ino` and insert it into
8586 * our cache. Conditionally also set `parent` to a referenced
8587 * Inode* if caller provides non-NULL value.
8588 */
1adf2230 8589int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8590{
11fdf7f2 8591 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8592
7c673cae
FG
8593 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8594 filepath path(ino->ino);
8595 req->set_filepath(path);
8596
8597 InodeRef target;
8598 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8599 // Give caller a reference to the parent ino if they provided a pointer.
8600 if (parent != NULL) {
8601 if (r == 0) {
8602 *parent = target.get();
8603 _ll_get(*parent);
11fdf7f2 8604 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
8605 } else {
8606 *parent = NULL;
8607 }
8608 }
11fdf7f2 8609 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8610 return r;
8611}
8612
7c673cae
FG
8613/**
8614 * Populate the parent dentry for `ino`, provided it is
8615 * a child of `parent`.
8616 */
1adf2230 8617int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 8618{
11fdf7f2
TL
8619 ceph_assert(parent->is_dir());
8620 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8621
181888fb
FG
8622 if (unmounting)
8623 return -ENOTCONN;
8624
7c673cae
FG
8625 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8626 req->set_filepath2(filepath(parent->ino));
8627 req->set_filepath(filepath(ino->ino));
8628 req->set_inode(ino);
8629
8630 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 8631 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8632 return r;
8633}
8634
1adf2230
AA
// Public wrapper: take client_lock, then populate the parent dentry.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae 8640
11fdf7f2 8641Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 8642{
11fdf7f2
TL
8643 ceph_assert(in);
8644 Fh *f = new Fh(in, flags, cmode, perms);
7c673cae 8645
11fdf7f2 8646 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
8647
8648 if (in->snapid != CEPH_NOSNAP) {
8649 in->snap_cap_refs++;
8650 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8651 << ccap_string(in->caps_issued()) << dendl;
8652 }
8653
11fdf7f2 8654 const auto& conf = cct->_conf;
7c673cae
FG
8655 f->readahead.set_trigger_requests(1);
8656 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8657 uint64_t max_readahead = Readahead::NO_LIMIT;
8658 if (conf->client_readahead_max_bytes) {
11fdf7f2 8659 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
8660 }
8661 if (conf->client_readahead_max_periods) {
11fdf7f2 8662 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
8663 }
8664 f->readahead.set_max_readahead_size(max_readahead);
8665 vector<uint64_t> alignments;
8666 alignments.push_back(in->layout.get_period());
8667 alignments.push_back(in->layout.stripe_unit);
8668 f->readahead.set_alignments(alignments);
8669
8670 return f;
8671}
8672
/*
 * Release a file handle: drop its delegation, flush dirty data and
 * re-evaluate caps on the last open ref, release file locks, and
 * surface any async error (e.g. from flushes) recorded on the handle.
 * Returns 0, or the captured async error.
 */
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush and re-check what caps we want
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: just drop the snap cap reference
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8707
8708void Client::_put_fh(Fh *f)
8709{
8710 int left = f->put();
8711 if (!left) {
8712 delete f;
8713 }
8714}
8715
8716int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8717 const UserPerm& perms)
8718{
8719 if (in->snapid != CEPH_NOSNAP &&
8720 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8721 return -EROFS;
8722 }
8723
8724 // use normalized flags to generate cmode
11fdf7f2
TL
8725 int cflags = ceph_flags_sys2wire(flags);
8726 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8727 cflags |= CEPH_O_LAZY;
8728
8729 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
8730 int want = ceph_caps_for_mode(cmode);
8731 int result = 0;
8732
8733 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
8734
b32b8144 8735 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
8736 // update wanted?
8737 check_caps(in, CHECK_CAPS_NODELAY);
8738 } else {
b32b8144 8739
7c673cae
FG
8740 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8741 filepath path;
8742 in->make_nosnap_relative_path(path);
8743 req->set_filepath(path);
11fdf7f2 8744 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
8745 req->head.args.open.mode = mode;
8746 req->head.args.open.pool = -1;
8747 if (cct->_conf->client_debug_getattr_caps)
8748 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8749 else
8750 req->head.args.open.mask = 0;
8751 req->head.args.open.old_size = in->size; // for O_TRUNC
8752 req->set_inode(in);
8753 result = make_request(req, perms);
b32b8144
FG
8754
8755 /*
8756 * NFS expects that delegations will be broken on a conflicting open,
8757 * not just when there is actual conflicting access to the file. SMB leases
8758 * and oplocks also have similar semantics.
8759 *
8760 * Ensure that clients that have delegations enabled will wait on minimal
8761 * caps during open, just to ensure that other clients holding delegations
8762 * return theirs first.
8763 */
8764 if (deleg_timeout && result == 0) {
8765 int need = 0, have;
8766
8767 if (cmode & CEPH_FILE_MODE_WR)
8768 need |= CEPH_CAP_FILE_WR;
8769 if (cmode & CEPH_FILE_MODE_RD)
8770 need |= CEPH_CAP_FILE_RD;
8771
8772 result = get_caps(in, need, want, &have, -1);
8773 if (result < 0) {
1adf2230 8774 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
8775 " . Denying open: " <<
8776 cpp_strerror(result) << dendl;
8777 in->put_open_ref(cmode);
8778 } else {
8779 put_cap_ref(in, need);
8780 }
8781 }
7c673cae
FG
8782 }
8783
8784 // success?
8785 if (result >= 0) {
8786 if (fhp)
8787 *fhp = _create_fh(in, flags, cmode, perms);
8788 } else {
8789 in->put_open_ref(cmode);
8790 }
8791
8792 trim_cache();
8793
8794 return result;
8795}
8796
/*
 * Re-request caps for @in (e.g. after an MDS session comes back).
 * If we already hold some caps and either want no write caps or still
 * have the auth cap, a check_caps pass suffices; otherwise replay an
 * OPEN whose flags match the currently wanted access.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted caps back into open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8834
/*
 * Close file descriptor @fd: release its handle, then return the fd
 * number to the allocator.  Returns any async error surfaced by the
 * final release (see _release_fh).
 */
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
8854
8855
8856// ------------
8857// read, write
8858
// Public lseek entry point: validate the fd, then delegate to _lseek.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles do not support seeking
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
8879
8880loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8881{
8882 Inode *in = f->inode.get();
8883 int r;
11fdf7f2 8884 loff_t pos = -1;
7c673cae 8885
92f5a8d4
TL
8886 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
8887 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8888 if (r < 0) {
8889 return r;
8890 }
8891 }
8892
7c673cae
FG
8893 switch (whence) {
8894 case SEEK_SET:
11fdf7f2 8895 pos = offset;
7c673cae
FG
8896 break;
8897
8898 case SEEK_CUR:
92f5a8d4 8899 pos = f->pos + offset;
7c673cae
FG
8900 break;
8901
8902 case SEEK_END:
11fdf7f2 8903 pos = in->size + offset;
7c673cae
FG
8904 break;
8905
92f5a8d4
TL
8906 case SEEK_DATA:
8907 if (offset < 0 || offset >= in->size) {
8908 r = -ENXIO;
8909 return offset;
8910 }
8911 pos = offset;
8912 break;
8913
8914 case SEEK_HOLE:
8915 if (offset < 0 || offset >= in->size) {
8916 r = -ENXIO;
8917 pos = offset;
8918 } else {
8919 pos = in->size;
8920 }
8921 break;
8922
7c673cae 8923 default:
92f5a8d4
TL
8924 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
8925 return -EINVAL;
7c673cae
FG
8926 }
8927
11fdf7f2
TL
8928 if (pos < 0) {
8929 return -EINVAL;
8930 } else {
8931 f->pos = pos;
8932 }
8933
1adf2230 8934 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8935 return f->pos;
8936}
8937
8938
/*
 * Serialize access to f->pos.  If another thread holds the position
 * lock (or is already queued for it), queue ourselves and wait until
 * we reach the front of the FIFO.  Called with client_lock held (the
 * Cond waits on it).
 *
 * NOTE(review): unlock_fh_pos() does not appear to signal the Conds
 * queued here — confirm how contended waiters make progress before
 * relying on these semantics.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8956
8957void Client::unlock_fh_pos(Fh *f)
8958{
11fdf7f2 8959 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8960 f->pos_locked = false;
8961}
8962
8963int Client::uninline_data(Inode *in, Context *onfinish)
8964{
8965 if (!in->inline_data.length()) {
8966 onfinish->complete(0);
8967 return 0;
8968 }
8969
8970 char oid_buf[32];
8971 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8972 object_t oid = oid_buf;
8973
8974 ObjectOperation create_ops;
8975 create_ops.create(false);
8976
8977 objecter->mutate(oid,
8978 OSDMap::file_to_object_locator(in->layout),
8979 create_ops,
8980 in->snaprealm->get_snap_context(),
8981 ceph::real_clock::now(),
8982 0,
8983 NULL);
8984
8985 bufferlist inline_version_bl;
11fdf7f2 8986 encode(in->inline_version, inline_version_bl);
7c673cae
FG
8987
8988 ObjectOperation uninline_ops;
8989 uninline_ops.cmpxattr("inline_version",
8990 CEPH_OSD_CMPXATTR_OP_GT,
8991 CEPH_OSD_CMPXATTR_MODE_U64,
8992 inline_version_bl);
8993 bufferlist inline_data = in->inline_data;
8994 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8995 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8996
8997 objecter->mutate(oid,
8998 OSDMap::file_to_object_locator(in->layout),
8999 uninline_ops,
9000 in->snaprealm->get_snap_context(),
9001 ceph::real_clock::now(),
9002 0,
9003 onfinish);
9004
9005 return 0;
9006}
9007
9008//
9009
9010// blocking osd interface
9011
9012int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9013{
11fdf7f2 9014 std::lock_guard lock(client_lock);
7c673cae
FG
9015 tout(cct) << "read" << std::endl;
9016 tout(cct) << fd << std::endl;
9017 tout(cct) << size << std::endl;
9018 tout(cct) << offset << std::endl;
9019
181888fb
FG
9020 if (unmounting)
9021 return -ENOTCONN;
9022
7c673cae
FG
9023 Fh *f = get_filehandle(fd);
9024 if (!f)
9025 return -EBADF;
9026#if defined(__linux__) && defined(O_PATH)
9027 if (f->flags & O_PATH)
9028 return -EBADF;
9029#endif
9030 bufferlist bl;
11fdf7f2
TL
9031 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9032 size = std::min(size, (loff_t)INT_MAX);
7c673cae
FG
9033 int r = _read(f, offset, size, &bl);
9034 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9035 if (r >= 0) {
9036 bl.copy(0, bl.length(), buf);
9037 r = bl.length();
9038 }
9039 return r;
9040}
9041
9042int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9043{
9044 if (iovcnt < 0)
9045 return -EINVAL;
9046 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9047}
9048
/*
 * Core read path.  Reads up to 'size' bytes at 'offset' into *bl; when
 * offset < 0 the current file position is used (under the fh position
 * lock) and advanced afterwards.  Returns bytes read or a negative
 * errno.
 *
 * Takes a CEPH_CAP_FILE_RD cap reference for the duration of the I/O
 * and dispatches to the inline-data, object-cacher (async) or
 * synchronous path depending on issued caps and configuration.
 * Called with client_lock held; the lock is dropped while waiting for
 * uninline I/O to complete.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "read at the current position and advance it".
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know the inline state;
  // fetch it from the MDS before deciding which path to take.
  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the cache regardless of which caps were issued.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without the cache cap: kick off
      // uninlining and fall through to a normal (object) read.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read straight from the inline blob, zero-filling the
      // gap between the blob's end and the file size.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        r = endoff - offset;
      } else {
        r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // O_RSYNC: flush dirty cached data over the range before reading.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read near EOF: drop caps, re-verify the size with the
      // MDS, and retry if the file turned out to be longer.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait (lock dropped) for the uninline mutation; on success or
    // ECANCELED (someone else won the race) the inline state is gone.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9188
// Pin the file handle and count this readahead as pending for the
// lifetime of the completion object.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9194
// Balance the ctor: drop the pending-readahead count and the Fh ref.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9199
// Readahead completed: release the cap references taken when the
// speculative read was issued (see _read_async).
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9204
/*
 * Read [off, off+len) through the object cacher, then kick off
 * readahead for the extent the readahead engine suggests.  Returns
 * bytes read or a negative errno.  Caller is expected to hold the
 * FILE_CACHE/LAZYIO caps; client_lock is dropped while a cache miss is
 * being filled.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: hold the CACHE cap ref while the lock is dropped and
    // the object cacher fills the range.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh and releases the cap refs on completion.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// r2 != 0: data already cached, completion will never fire.
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9259
9260int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9261 bool *checkeof)
9262{
9263 Inode *in = f->inode.get();
9264 uint64_t pos = off;
9265 int left = len;
9266 int read = 0;
9267
11fdf7f2 9268 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
9269
9270 Mutex flock("Client::_read_sync flock");
9271 Cond cond;
9272 while (left > 0) {
11fdf7f2 9273 C_SaferCond onfinish("Client::_read_sync flock");
7c673cae
FG
9274 bufferlist tbl;
9275
9276 int wanted = left;
9277 filer->read_trunc(in->ino, &in->layout, in->snapid,
9278 pos, left, &tbl, 0,
9279 in->truncate_size, in->truncate_seq,
11fdf7f2 9280 &onfinish);
7c673cae 9281 client_lock.Unlock();
11fdf7f2 9282 int r = onfinish.wait();
7c673cae
FG
9283 client_lock.Lock();
9284
9285 // if we get ENOENT from OSD, assume 0 bytes returned
9286 if (r == -ENOENT)
9287 r = 0;
9288 if (r < 0)
9289 return r;
9290 if (tbl.length()) {
9291 r = tbl.length();
9292
9293 read += r;
9294 pos += r;
9295 left -= r;
9296 bl->claim_append(tbl);
9297 }
9298 // short read?
9299 if (r >= 0 && r < wanted) {
9300 if (pos < in->size) {
9301 // zero up to known EOF
9302 int64_t some = in->size - pos;
9303 if (some > left)
9304 some = left;
11fdf7f2
TL
9305 auto z = buffer::ptr_node::create(some);
9306 z->zero();
9307 bl->push_back(std::move(z));
7c673cae
FG
9308 read += some;
9309 pos += some;
9310 left -= some;
9311 if (left == 0)
9312 return read;
9313 }
9314
9315 *checkeof = true;
9316 return read;
9317 }
9318 }
9319 return read;
9320}
9321
9322
9323/*
9324 * we keep count of uncommitted sync writes on the inode, so that
9325 * fsync can DDRT.
9326 */
/*
 * Called when a synchronous write has been committed by the OSDs:
 * drop one unit of the in-flight sync-write count and release the
 * FILE_BUFFER cap reference taken when the write was issued.
 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // Unmount waits for all sync writes to drain; wake it up.
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9340
9341int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9342{
11fdf7f2 9343 std::lock_guard lock(client_lock);
7c673cae
FG
9344 tout(cct) << "write" << std::endl;
9345 tout(cct) << fd << std::endl;
9346 tout(cct) << size << std::endl;
9347 tout(cct) << offset << std::endl;
9348
181888fb
FG
9349 if (unmounting)
9350 return -ENOTCONN;
9351
7c673cae
FG
9352 Fh *fh = get_filehandle(fd);
9353 if (!fh)
9354 return -EBADF;
9355#if defined(__linux__) && defined(O_PATH)
9356 if (fh->flags & O_PATH)
9357 return -EBADF;
9358#endif
11fdf7f2
TL
9359 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9360 size = std::min(size, (loff_t)INT_MAX);
9361 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
9362 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9363 return r;
9364}
9365
9366int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9367{
9368 if (iovcnt < 0)
9369 return -EINVAL;
9370 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9371}
9372
11fdf7f2
TL
/*
 * Vectored read/write on an already-resolved Fh.  Sums the iovec
 * lengths, optionally clamps the total to INT_MAX (for int-returning
 * callers), and dispatches to _write or _read.  On the read side the
 * returned bufferlist is scattered back into the caller's iovecs.
 * Returns bytes transferred or a negative errno.  Called with
 * client_lock held.
 */
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset, bool write,
				       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write gathers directly from the iovecs (buf == NULL).
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the read data into the caller's iovecs.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9423
11fdf7f2
TL
9424int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9425{
9426 std::lock_guard lock(client_lock);
9427 tout(cct) << fd << std::endl;
9428 tout(cct) << offset << std::endl;
9429
9430 if (unmounting)
9431 return -ENOTCONN;
9432
9433 Fh *fh = get_filehandle(fd);
9434 if (!fh)
9435 return -EBADF;
9436 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9437}
9438
/*
 * Core write path.  Writes 'size' bytes at 'offset' from either 'buf'
 * or (when buf is NULL) from 'iov'/'iovcnt'.  offset < 0 means "write
 * at the current file position" (honoring O_APPEND) and advance it.
 * Returns bytes written or a negative errno.
 *
 * Handles quota and pool-full checks, cap acquisition, setuid/setgid
 * stripping, inline data (append in place or trigger uninlining), and
 * dispatch to the buffered (object cacher) or synchronous path.
 * Called with client_lock held; dropped while waiting for sync or
 * uninline I/O.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  // Writes only go to the head (non-snapshot) inode.
  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    // Remember where the position should land after a successful write.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means the inline state is unknown; fetch it.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed to inspect in->mode for the sgid/suid check.
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses buffering regardless of issued caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Result won't fit inline (or we lack the buffer cap): push the
      // inline data out to RADOS and do a normal object write below.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Splice the new bytes into the inline blob in place.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // fpos != 0 only when we consumed and must advance the fd position.
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait (lock dropped) for the uninline mutation; on success or
    // ECANCELED (another uninline won the race) the inline state is gone.
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9669
9670int Client::_flush(Fh *f)
9671{
9672 Inode *in = f->inode.get();
9673 int err = f->take_async_err();
9674 if (err != 0) {
9675 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9676 << cpp_strerror(err) << dendl;
9677 } else {
9678 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9679 }
9680
9681 return err;
9682}
9683
9684int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9685{
9686 struct ceph_statx stx;
9687 stx.stx_size = length;
9688 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9689}
9690
9691int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9692{
11fdf7f2
TL
9693 std::lock_guard lock(client_lock);
9694 tout(cct) << __func__ << std::endl;
7c673cae
FG
9695 tout(cct) << fd << std::endl;
9696 tout(cct) << length << std::endl;
9697
181888fb
FG
9698 if (unmounting)
9699 return -ENOTCONN;
9700
7c673cae
FG
9701 Fh *f = get_filehandle(fd);
9702 if (!f)
9703 return -EBADF;
9704#if defined(__linux__) && defined(O_PATH)
9705 if (f->flags & O_PATH)
9706 return -EBADF;
9707#endif
9708 struct stat attr;
9709 attr.st_size = length;
9710 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9711}
9712
9713int Client::fsync(int fd, bool syncdataonly)
9714{
11fdf7f2 9715 std::lock_guard lock(client_lock);
7c673cae
FG
9716 tout(cct) << "fsync" << std::endl;
9717 tout(cct) << fd << std::endl;
9718 tout(cct) << syncdataonly << std::endl;
9719
181888fb
FG
9720 if (unmounting)
9721 return -ENOTCONN;
9722
7c673cae
FG
9723 Fh *f = get_filehandle(fd);
9724 if (!f)
9725 return -EBADF;
9726#if defined(__linux__) && defined(O_PATH)
9727 if (f->flags & O_PATH)
9728 return -EBADF;
9729#endif
9730 int r = _fsync(f, syncdataonly);
9731 if (r == 0) {
9732 // The IOs in this fsync were okay, but maybe something happened
9733 // in the background that we shoudl be reporting?
9734 r = f->take_async_err();
1adf2230 9735 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
7c673cae
FG
9736 << ") = 0, async_err = " << r << dendl;
9737 } else {
9738 // Assume that an error we encountered during fsync, even reported
9739 // synchronously, would also have applied the error to the Fh, and we
9740 // should clear it here to avoid returning the same error again on next
9741 // call.
1adf2230 9742 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
7c673cae
FG
9743 << r << dendl;
9744 f->take_async_err();
9745 }
9746 return r;
9747}
9748
/*
 * Flush an inode's dirty data (and, unless syncdataonly, its dirty
 * caps/metadata and unsafe MDS requests) and wait for everything to
 * commit.  Returns 0 on success or a negative errno.  Called with
 * client_lock held; dropped while waiting for the object cacher.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off a cache flush; completion is waited on further below.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty metadata to the MDS and remember the flush tid to
    // wait on once the data is safe.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Ask the MDS to flush its log, then wait for our newest unsafe
    // request (which implies all older ones) to become safe.
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9815
9816int Client::_fsync(Fh *f, bool syncdataonly)
9817{
1adf2230 9818 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9819 return _fsync(f->inode.get(), syncdataonly);
9820}
9821
9822int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9823{
11fdf7f2 9824 std::lock_guard lock(client_lock);
7c673cae
FG
9825 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9826 tout(cct) << fd << std::endl;
9827
181888fb
FG
9828 if (unmounting)
9829 return -ENOTCONN;
9830
7c673cae
FG
9831 Fh *f = get_filehandle(fd);
9832 if (!f)
9833 return -EBADF;
9834 int r = _getattr(f->inode, mask, perms);
9835 if (r < 0)
9836 return r;
9837 fill_stat(f->inode, stbuf, NULL);
1adf2230 9838 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
9839 return r;
9840}
9841
9842int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9843 unsigned int want, unsigned int flags)
9844{
11fdf7f2 9845 std::lock_guard lock(client_lock);
7c673cae
FG
9846 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9847 tout(cct) << fd << std::endl;
9848
181888fb
FG
9849 if (unmounting)
9850 return -ENOTCONN;
9851
7c673cae
FG
9852 Fh *f = get_filehandle(fd);
9853 if (!f)
9854 return -EBADF;
9855
9856 unsigned mask = statx_to_mask(flags, want);
9857
9858 int r = 0;
94b18763 9859 if (mask && !f->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
9860 r = _getattr(f->inode, mask, perms);
9861 if (r < 0) {
9862 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9863 return r;
9864 }
9865 }
9866
9867 fill_statx(f->inode, mask, stx);
9868 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9869 return r;
9870}
9871
9872// not written yet, but i want to link!
9873
9874int Client::chdir(const char *relpath, std::string &new_cwd,
9875 const UserPerm& perms)
9876{
11fdf7f2 9877 std::lock_guard lock(client_lock);
7c673cae
FG
9878 tout(cct) << "chdir" << std::endl;
9879 tout(cct) << relpath << std::endl;
181888fb
FG
9880
9881 if (unmounting)
9882 return -ENOTCONN;
9883
7c673cae
FG
9884 filepath path(relpath);
9885 InodeRef in;
9886 int r = path_walk(path, &in, perms);
9887 if (r < 0)
9888 return r;
92f5a8d4
TL
9889
9890 if (!(in.get()->is_dir()))
9891 return -ENOTDIR;
9892
7c673cae
FG
9893 if (cwd != in)
9894 cwd.swap(in);
9895 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9896
b5b8bbf5 9897 _getcwd(new_cwd, perms);
7c673cae
FG
9898 return 0;
9899}
9900
/*
 * Build the absolute path of the current working directory into 'dir'
 * by walking parent dentries from cwd up to the mount root.  When a
 * parent link is not cached, a LOOKUPNAME request fills it in and the
 * walk restarts.  If the cwd (or an ancestor) has been unlinked, 'dir'
 * is left untouched.  Called with client_lock held.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over with the (now cached) parent links
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9940
b5b8bbf5
FG
9941void Client::getcwd(string& dir, const UserPerm& perms)
9942{
11fdf7f2 9943 std::lock_guard l(client_lock);
181888fb
FG
9944 if (!unmounting)
9945 _getcwd(dir, perms);
b5b8bbf5
FG
9946}
9947
7c673cae
FG
9948int Client::statfs(const char *path, struct statvfs *stbuf,
9949 const UserPerm& perms)
9950{
11fdf7f2
TL
9951 std::lock_guard l(client_lock);
9952 tout(cct) << __func__ << std::endl;
91327a77 9953 unsigned long int total_files_on_fs;
7c673cae 9954
181888fb
FG
9955 if (unmounting)
9956 return -ENOTCONN;
9957
7c673cae
FG
9958 ceph_statfs stats;
9959 C_SaferCond cond;
d2e6a577
FG
9960
9961 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9962 if (data_pools.size() == 1) {
9963 objecter->get_fs_stats(stats, data_pools[0], &cond);
9964 } else {
9965 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9966 }
7c673cae
FG
9967
9968 client_lock.Unlock();
9969 int rval = cond.wait();
91327a77
AA
9970 assert(root);
9971 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9972 client_lock.Lock();
9973
9974 if (rval < 0) {
9975 ldout(cct, 1) << "underlying call to statfs returned error: "
9976 << cpp_strerror(rval)
9977 << dendl;
9978 return rval;
9979 }
9980
9981 memset(stbuf, 0, sizeof(*stbuf));
9982
9983 /*
9984 * we're going to set a block size of 4MB so we can represent larger
9985 * FSes without overflowing. Additionally convert the space
9986 * measurements from KB to bytes while making them in terms of
9987 * blocks. We use 4MB only because it is big enough, and because it
9988 * actually *is* the (ceph) default block size.
9989 */
9990 const int CEPH_BLOCK_SHIFT = 22;
9991 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9992 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9993 stbuf->f_files = total_files_on_fs;
9994 stbuf->f_ffree = 0;
7c673cae
FG
9995 stbuf->f_favail = -1;
9996 stbuf->f_fsid = -1; // ??
9997 stbuf->f_flag = 0; // ??
9998 stbuf->f_namemax = NAME_MAX;
9999
10000 // Usually quota_root will == root_ancestor, but if the mount root has no
10001 // quota but we can see a parent of it that does have a quota, we'll
10002 // respect that one instead.
11fdf7f2 10003 ceph_assert(root != nullptr);
7c673cae
FG
10004 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10005
10006 // get_quota_root should always give us something
10007 // because client quotas are always enabled
11fdf7f2 10008 ceph_assert(quota_root != nullptr);
7c673cae
FG
10009
10010 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10011
10012 // Skip the getattr if any sessions are stale, as we don't want to
10013 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10014 // is unhealthy.
10015 if (!_any_stale_sessions()) {
10016 int r = _getattr(quota_root, 0, perms, true);
10017 if (r != 0) {
10018 // Ignore return value: error getting latest inode metadata is not a good
10019 // reason to break "df".
10020 lderr(cct) << "Error in getattr on quota root 0x"
10021 << std::hex << quota_root->ino << std::dec
10022 << " statfs result may be outdated" << dendl;
10023 }
10024 }
10025
10026 // Special case: if there is a size quota set on the Inode acting
10027 // as the root for this client mount, then report the quota status
10028 // as the filesystem statistics.
10029 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10030 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
10031 // It is possible for a quota to be exceeded: arithmetic here must
10032 // handle case where used > total.
10033 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
10034
10035 stbuf->f_blocks = total;
10036 stbuf->f_bfree = free;
10037 stbuf->f_bavail = free;
10038 } else {
d2e6a577 10039 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10040 // multiple pools may be used without one filesystem namespace via
10041 // layouts, this is the most correct thing we can do.
10042 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10043 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10044 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10045 }
10046
10047 return rval;
10048}
10049
/**
 * Send a file-lock operation to the MDS and mirror the result locally.
 *
 * @param in        inode the lock applies to
 * @param fh        open file handle (supplies actor_perms; records held
 *                  locks unless @removing)
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
 * @param sleep     nonzero to block waiting for a conflicting lock
 * @param fl        lock description; filled in with the conflicting lock
 *                  on GETFILELOCK
 * @param owner     lock owner token (high bit is forced on, see below)
 * @param removing  true when called from _release_filelocks: update the
 *                  inode's lock state but not the (dying) fh's
 * @return 0 on success, negative error code otherwise
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Map POSIX lock types onto ceph lock commands.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a blocking SETFILELOCK that actually acquires may wait.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    // drop the extra ref taken via req->get() above
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting lock returned by the MDS back into *fl.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Update the inode-wide lock state (lazily allocated per family).
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // Mirror the change into the file handle's own lock state too,
      // unless the handle is being torn down (removing == true).
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10160
10161int Client::_interrupt_filelock(MetaRequest *req)
10162{
31f18b77
FG
10163 // Set abort code, but do not kick. The abort code prevents the request
10164 // from being re-sent.
10165 req->abort(-EINTR);
10166 if (req->mds < 0)
10167 return 0; // haven't sent the request
10168
7c673cae
FG
10169 Inode *in = req->inode();
10170
10171 int lock_type;
10172 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10173 lock_type = CEPH_LOCK_FLOCK_INTR;
10174 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10175 lock_type = CEPH_LOCK_FCNTL_INTR;
10176 else {
10177 ceph_abort();
10178 return -EINVAL;
10179 }
10180
10181 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10182 filepath path;
10183 in->make_nosnap_relative_path(path);
10184 intr_req->set_filepath(path);
10185 intr_req->set_inode(in);
10186 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10187 intr_req->head.args.filelock_change.rule = lock_type;
10188 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10189
10190 UserPerm perms(req->get_uid(), req->get_gid());
10191 return make_request(intr_req, perms, NULL, NULL, -1);
10192}
10193
10194void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10195{
10196 if (!in->fcntl_locks && !in->flock_locks)
10197 return;
10198
10199 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 10200 encode(nr_fcntl_locks, bl);
7c673cae 10201 if (nr_fcntl_locks) {
11fdf7f2 10202 auto &lock_state = in->fcntl_locks;
7c673cae
FG
10203 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10204 p != lock_state->held_locks.end();
10205 ++p)
11fdf7f2 10206 encode(p->second, bl);
7c673cae
FG
10207 }
10208
10209 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 10210 encode(nr_flock_locks, bl);
7c673cae 10211 if (nr_flock_locks) {
11fdf7f2 10212 auto &lock_state = in->flock_locks;
7c673cae
FG
10213 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10214 p != lock_state->held_locks.end();
10215 ++p)
11fdf7f2 10216 encode(p->second, bl);
7c673cae
FG
10217 }
10218
11fdf7f2 10219 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
10220 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10221}
10222
10223void Client::_release_filelocks(Fh *fh)
10224{
10225 if (!fh->fcntl_locks && !fh->flock_locks)
10226 return;
10227
10228 Inode *in = fh->inode.get();
11fdf7f2 10229 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10230
10231 list<pair<int, ceph_filelock> > to_release;
10232
10233 if (fh->fcntl_locks) {
11fdf7f2 10234 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10235 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10236 p != lock_state->held_locks.end();
10237 ++p)
10238 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10239 lock_state.reset();
7c673cae
FG
10240 }
10241 if (fh->flock_locks) {
11fdf7f2 10242 auto &lock_state = fh->flock_locks;
7c673cae
FG
10243 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10244 p != lock_state->held_locks.end();
10245 ++p)
10246 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10247 lock_state.reset();
7c673cae
FG
10248 }
10249
10250 if (to_release.empty())
10251 return;
10252
11fdf7f2
TL
10253 // mds has already released filelocks if session was closed.
10254 if (in->caps.empty())
10255 return;
10256
7c673cae
FG
10257 struct flock fl;
10258 memset(&fl, 0, sizeof(fl));
10259 fl.l_whence = SEEK_SET;
10260 fl.l_type = F_UNLCK;
10261
10262 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10263 p != to_release.end();
10264 ++p) {
10265 fl.l_start = p->second.start;
10266 fl.l_len = p->second.length;
10267 fl.l_pid = p->second.pid;
10268 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10269 p->second.owner, true);
10270 }
10271}
10272
10273void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10274 ceph_lock_state_t *lock_state)
10275{
10276 int lock_cmd;
10277 if (F_RDLCK == fl->l_type)
10278 lock_cmd = CEPH_LOCK_SHARED;
10279 else if (F_WRLCK == fl->l_type)
10280 lock_cmd = CEPH_LOCK_EXCL;
10281 else
10282 lock_cmd = CEPH_LOCK_UNLOCK;;
10283
10284 ceph_filelock filelock;
10285 filelock.start = fl->l_start;
10286 filelock.length = fl->l_len;
10287 filelock.client = 0;
10288 // see comment in _do_filelock()
10289 filelock.owner = owner | (1ULL << 63);
10290 filelock.pid = fl->l_pid;
10291 filelock.type = lock_cmd;
10292
10293 if (filelock.type == CEPH_LOCK_UNLOCK) {
10294 list<ceph_filelock> activated_locks;
10295 lock_state->remove_lock(filelock, activated_locks);
10296 } else {
10297 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10298 ceph_assert(r);
7c673cae
FG
10299 }
10300}
10301
10302int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10303{
10304 Inode *in = fh->inode.get();
10305 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10306 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10307 return ret;
10308}
10309
10310int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10311{
10312 Inode *in = fh->inode.get();
10313 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10314 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10315 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10316 return ret;
10317}
10318
10319int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10320{
10321 Inode *in = fh->inode.get();
10322 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10323
10324 int sleep = !(cmd & LOCK_NB);
10325 cmd &= ~LOCK_NB;
10326
10327 int type;
10328 switch (cmd) {
10329 case LOCK_SH:
10330 type = F_RDLCK;
10331 break;
10332 case LOCK_EX:
10333 type = F_WRLCK;
10334 break;
10335 case LOCK_UN:
10336 type = F_UNLCK;
10337 break;
10338 default:
10339 return -EINVAL;
10340 }
10341
10342 struct flock fl;
10343 memset(&fl, 0, sizeof(fl));
10344 fl.l_type = type;
10345 fl.l_whence = SEEK_SET;
10346
10347 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10348 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10349 return ret;
10350}
10351
10352int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10353{
10354 /* Since the only thing this does is wrap a call to statfs, and
10355 statfs takes a lock, it doesn't seem we have a need to split it
10356 out. */
10357 return statfs(0, stbuf, perms);
10358}
10359
/**
 * Register the upcall hooks supplied by the libcephfs/FUSE layer.
 *
 * Stores each callback pointer from @args and, for callbacks that are
 * delivered asynchronously, starts the matching finisher thread.  A
 * finisher is only started when its callback was actually provided.
 * Passing a null @args is a no-op.
 */
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  // Opaque handle passed back on every upcall.
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask_cb is invoked synchronously, so no finisher is needed.
  umask_cb = args->umask_cb;
}
10390
10391int Client::test_dentry_handling(bool can_invalidate)
10392{
10393 int r = 0;
10394
10395 can_invalidate_dentries = can_invalidate;
10396
10397 if (can_invalidate_dentries) {
11fdf7f2 10398 ceph_assert(dentry_invalidate_cb);
7c673cae 10399 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10400 r = 0;
11fdf7f2
TL
10401 } else {
10402 ceph_assert(remount_cb);
7c673cae 10403 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10404 r = _do_remount(false);
b32b8144 10405 }
11fdf7f2 10406
7c673cae
FG
10407 return r;
10408}
10409
10410int Client::_sync_fs()
10411{
11fdf7f2 10412 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10413
10414 // flush file data
11fdf7f2
TL
10415 std::unique_ptr<C_SaferCond> cond = nullptr;
10416 if (cct->_conf->client_oc) {
10417 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10418 objectcacher->flush_all(cond.get());
10419 }
7c673cae
FG
10420
10421 // flush caps
10422 flush_caps_sync();
10423 ceph_tid_t flush_tid = last_flush_tid;
10424
10425 // wait for unsafe mds requests
10426 wait_unsafe_requests();
10427
10428 wait_sync_caps(flush_tid);
10429
11fdf7f2 10430 if (nullptr != cond) {
7c673cae 10431 client_lock.Unlock();
11fdf7f2
TL
10432 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10433 cond->wait();
10434 ldout(cct, 15) << __func__ << " flush finished" << dendl;
7c673cae
FG
10435 client_lock.Lock();
10436 }
10437
10438 return 0;
10439}
10440
10441int Client::sync_fs()
10442{
11fdf7f2 10443 std::lock_guard l(client_lock);
181888fb
FG
10444
10445 if (unmounting)
10446 return -ENOTCONN;
10447
7c673cae
FG
10448 return _sync_fs();
10449}
10450
10451int64_t Client::drop_caches()
10452{
11fdf7f2 10453 std::lock_guard l(client_lock);
7c673cae
FG
10454 return objectcacher->release_all();
10455}
10456
11fdf7f2
TL
10457int Client::_lazyio(Fh *fh, int enable)
10458{
10459 Inode *in = fh->inode.get();
10460 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10461
10462 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10463 return 0;
10464
10465 int orig_mode = fh->mode;
10466 if (enable) {
10467 fh->mode |= CEPH_FILE_MODE_LAZY;
10468 in->get_open_ref(fh->mode);
10469 in->put_open_ref(orig_mode);
10470 check_caps(in, CHECK_CAPS_NODELAY);
10471 } else {
10472 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10473 in->get_open_ref(fh->mode);
10474 in->put_open_ref(orig_mode);
10475 check_caps(in, 0);
10476 }
10477
10478 return 0;
10479}
10480
10481int Client::lazyio(int fd, int enable)
10482{
10483 std::lock_guard l(client_lock);
10484 Fh *f = get_filehandle(fd);
10485 if (!f)
10486 return -EBADF;
10487
10488 return _lazyio(f, enable);
10489}
10490
10491int Client::ll_lazyio(Fh *fh, int enable)
10492{
10493 std::lock_guard lock(client_lock);
10494 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10495 tout(cct) << __func__ << std::endl;
10496
10497 return _lazyio(fh, enable);
10498}
7c673cae 10499
92f5a8d4 10500int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 10501{
11fdf7f2 10502 std::lock_guard l(client_lock);
92f5a8d4 10503 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
10504 << ", " << offset << ", " << count << ")" << dendl;
10505
10506 Fh *f = get_filehandle(fd);
10507 if (!f)
10508 return -EBADF;
10509
10510 // for now
10511 _fsync(f, true);
10512
10513 return 0;
10514}
10515
10516int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10517{
11fdf7f2 10518 std::lock_guard l(client_lock);
7c673cae
FG
10519 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10520 << ", " << offset << ", " << count << ")" << dendl;
10521
10522 Fh *f = get_filehandle(fd);
10523 if (!f)
10524 return -EBADF;
10525 Inode *in = f->inode.get();
10526
10527 _fsync(f, true);
92f5a8d4
TL
10528 if (_release(in)) {
10529 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10530 if (r < 0)
10531 return r;
10532 }
7c673cae
FG
10533 return 0;
10534}
10535
10536
10537// =============================
10538// snaps
10539
10540int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10541{
11fdf7f2 10542 std::lock_guard l(client_lock);
181888fb
FG
10543
10544 if (unmounting)
10545 return -ENOTCONN;
10546
7c673cae
FG
10547 filepath path(relpath);
10548 InodeRef in;
10549 int r = path_walk(path, &in, perm);
10550 if (r < 0)
10551 return r;
10552 if (cct->_conf->client_permissions) {
10553 r = may_create(in.get(), perm);
10554 if (r < 0)
10555 return r;
10556 }
10557 Inode *snapdir = open_snapdir(in.get());
10558 return _mkdir(snapdir, name, 0, perm);
10559}
181888fb 10560
7c673cae
FG
10561int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10562{
11fdf7f2 10563 std::lock_guard l(client_lock);
181888fb
FG
10564
10565 if (unmounting)
10566 return -ENOTCONN;
10567
7c673cae
FG
10568 filepath path(relpath);
10569 InodeRef in;
10570 int r = path_walk(path, &in, perms);
10571 if (r < 0)
10572 return r;
10573 if (cct->_conf->client_permissions) {
10574 r = may_delete(in.get(), NULL, perms);
10575 if (r < 0)
10576 return r;
10577 }
10578 Inode *snapdir = open_snapdir(in.get());
10579 return _rmdir(snapdir, name, perms);
10580}
10581
10582// =============================
10583// expose caps
10584
10585int Client::get_caps_issued(int fd) {
10586
11fdf7f2 10587 std::lock_guard lock(client_lock);
7c673cae 10588
181888fb
FG
10589 if (unmounting)
10590 return -ENOTCONN;
10591
7c673cae
FG
10592 Fh *f = get_filehandle(fd);
10593 if (!f)
10594 return -EBADF;
10595
10596 return f->inode->caps_issued();
10597}
10598
10599int Client::get_caps_issued(const char *path, const UserPerm& perms)
10600{
11fdf7f2 10601 std::lock_guard lock(client_lock);
181888fb
FG
10602
10603 if (unmounting)
10604 return -ENOTCONN;
10605
7c673cae
FG
10606 filepath p(path);
10607 InodeRef in;
10608 int r = path_walk(p, &in, perms, true);
10609 if (r < 0)
10610 return r;
10611 return in->caps_issued();
10612}
10613
10614// =========================================
10615// low level
10616
10617Inode *Client::open_snapdir(Inode *diri)
10618{
10619 Inode *in;
10620 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10621 if (!inode_map.count(vino)) {
10622 in = new Inode(this, vino, &diri->layout);
10623
10624 in->ino = diri->ino;
10625 in->snapid = CEPH_SNAPDIR;
10626 in->mode = diri->mode;
10627 in->uid = diri->uid;
10628 in->gid = diri->gid;
494da23a 10629 in->nlink = 1;
7c673cae
FG
10630 in->mtime = diri->mtime;
10631 in->ctime = diri->ctime;
10632 in->btime = diri->btime;
10633 in->size = diri->size;
10634 in->change_attr = diri->change_attr;
10635
10636 in->dirfragtree.clear();
10637 in->snapdir_parent = diri;
10638 diri->flags |= I_SNAPDIR_OPEN;
10639 inode_map[vino] = in;
10640 if (use_faked_inos())
10641 _assign_faked_ino(in);
10642 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10643 } else {
10644 in = inode_map[vino];
10645 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10646 }
10647 return in;
10648}
10649
/**
 * Low-level lookup of @name under @parent.
 *
 * On success fills *attr, stores the child in *out and takes an ll
 * reference on it (caller must eventually ll_forget/ll_put).  On failure
 * *out is set from the null InodeRef (i.e. NULL) and attr->st_ino is 0.
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are resolvable without search permission on parent.
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // caller now holds an ll reference

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // On error `in` is still null, so *out becomes NULL.
  *out = in.get();
  return r;
}
10691
1adf2230
AA
/**
 * Look an inode up by number and make sure it has a linked dentry.
 *
 * On success *inode holds an ll reference the caller must release.  If
 * establishing the dentry fails, the reference taken in step 1 is
 * dropped again before returning the error.
 */
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  // Already linked into the namespace — nothing more to do.
  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // Drop the reference from step 1 before failing.
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // The parent was only needed transiently; release its reference.
  _ll_forget(parent, 1);
  return 0;
}
10743
7c673cae
FG
/**
 * statx-flavored low-level lookup of @name under @parent.
 *
 * Like ll_lookup(), but fills a ceph_statx limited to the fields implied
 * by @want/@flags.  On success takes an ll reference on the child; on
 * failure *out is set from the null InodeRef (NULL) and stx is zeroed.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  // Only request the attribute caps the caller actually asked for.
  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller now holds an ll reference
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10784
10785int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10786 unsigned int want, unsigned int flags, const UserPerm& perms)
10787{
11fdf7f2 10788 std::lock_guard lock(client_lock);
181888fb
FG
10789
10790 if (unmounting)
10791 return -ENOTCONN;
10792
7c673cae
FG
10793 filepath fp(name, 0);
10794 InodeRef in;
10795 int rc;
10796 unsigned mask = statx_to_mask(flags, want);
10797
11fdf7f2
TL
10798 ldout(cct, 3) << __func__ << " " << name << dendl;
10799 tout(cct) << __func__ << std::endl;
7c673cae
FG
10800 tout(cct) << name << std::endl;
10801
10802 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10803 if (rc < 0) {
10804 /* zero out mask, just in case... */
10805 stx->stx_mask = 0;
10806 stx->stx_ino = 0;
10807 *out = NULL;
10808 return rc;
10809 } else {
11fdf7f2 10810 ceph_assert(in);
7c673cae
FG
10811 fill_statx(in, mask, stx);
10812 _ll_get(in.get());
10813 *out = in.get();
10814 return 0;
10815 }
10816}
10817
/**
 * Take one low-level (FUSE-visible) reference on @in.
 *
 * On the 0 -> 1 transition also takes an internal inode reference, pins
 * the directory's single parent dentry, and bumps the per-snapid
 * reference count for snapshot inodes.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10832
/**
 * Drop @num low-level references from @in.
 *
 * On the transition to 0 this undoes everything _ll_get's first-ref path
 * did: unpins the parent dentry of a directory, decrements (and possibly
 * erases) the per-snapid reference, and releases the internal inode ref.
 *
 * @return the remaining ll_ref count (0 when the last reference went).
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    // may destroy the inode (this must be the last use of `in` here)
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10855
/**
 * Drop every outstanding low-level reference on every cached inode
 * (used during unmount/shutdown).
 *
 * The iterator is advanced before _ll_put() is called because dropping
 * the last reference can erase the inode from inode_map.  The InodeRef
 * set keeps each inode alive until the function returns, so the final
 * destruction happens in one batch at scope exit.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10873
/**
 * Handle a FUSE "forget" for @in: drop @count low-level references.
 *
 * Forgets on the root inode or after unmount started are ignored.  A
 * count larger than the held reference count is clamped (with a warning)
 * rather than underflowing.
 *
 * @return true if the last ll reference was dropped (or the forget was
 *         ignored), false if references remain.
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10902
494da23a 10903bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 10904{
11fdf7f2 10905 std::lock_guard lock(client_lock);
1adf2230
AA
10906 return _ll_forget(in, count);
10907}
10908
7c673cae
FG
10909bool Client::ll_put(Inode *in)
10910{
10911 /* ll_forget already takes the lock */
10912 return ll_forget(in, 1);
10913}
10914
11fdf7f2
TL
10915int Client::ll_get_snap_ref(snapid_t snap)
10916{
10917 std::lock_guard lock(client_lock);
10918 auto p = ll_snap_ref.find(snap);
10919 if (p != ll_snap_ref.end())
10920 return p->second;
10921 return 0;
10922}
10923
7c673cae
FG
10924snapid_t Client::ll_get_snapid(Inode *in)
10925{
11fdf7f2 10926 std::lock_guard lock(client_lock);
7c673cae
FG
10927 return in->snapid;
10928}
10929
10930Inode *Client::ll_get_inode(ino_t ino)
10931{
11fdf7f2 10932 std::lock_guard lock(client_lock);
181888fb
FG
10933
10934 if (unmounting)
10935 return NULL;
10936
7c673cae
FG
10937 vinodeno_t vino = _map_faked_ino(ino);
10938 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10939 if (p == inode_map.end())
10940 return NULL;
10941 Inode *in = p->second;
10942 _ll_get(in);
10943 return in;
10944}
10945
10946Inode *Client::ll_get_inode(vinodeno_t vino)
10947{
11fdf7f2 10948 std::lock_guard lock(client_lock);
181888fb
FG
10949
10950 if (unmounting)
10951 return NULL;
10952
7c673cae
FG
10953 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10954 if (p == inode_map.end())
10955 return NULL;
10956 Inode *in = p->second;
10957 _ll_get(in);
10958 return in;
10959}
10960
10961int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10962{
10963 vinodeno_t vino = _get_vino(in);
10964
11fdf7f2
TL
10965 ldout(cct, 8) << __func__ << " " << vino << dendl;
10966 tout(cct) << __func__ << std::endl;
7c673cae
FG
10967 tout(cct) << vino.ino.val << std::endl;
10968
10969 if (vino.snapid < CEPH_NOSNAP)
10970 return 0;
10971 else
10972 return _getattr(in, caps, perms);
10973}
10974
10975int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10976{
11fdf7f2 10977 std::lock_guard lock(client_lock);
7c673cae 10978
181888fb
FG
10979 if (unmounting)
10980 return -ENOTCONN;
10981
7c673cae
FG
10982 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10983
10984 if (res == 0)
10985 fill_stat(in, attr);
11fdf7f2 10986 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10987 return res;
10988}
10989
10990int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10991 unsigned int flags, const UserPerm& perms)
10992{
11fdf7f2 10993 std::lock_guard lock(client_lock);
7c673cae 10994
181888fb
FG
10995 if (unmounting)
10996 return -ENOTCONN;
10997
7c673cae
FG
10998 int res = 0;
10999 unsigned mask = statx_to_mask(flags, want);
11000
94b18763 11001 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
11002 res = _ll_getattr(in, mask, perms);
11003
11004 if (res == 0)
11005 fill_statx(in, mask, stx);
11fdf7f2 11006 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11007 return res;
11008}
11009
11010int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11011 const UserPerm& perms, InodeRef *inp)
11012{
11013 vinodeno_t vino = _get_vino(in);
11014
11fdf7f2 11015 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 11016 << dendl;
11fdf7f2 11017 tout(cct) << __func__ << std::endl;
7c673cae
FG
11018 tout(cct) << vino.ino.val << std::endl;
11019 tout(cct) << stx->stx_mode << std::endl;
11020 tout(cct) << stx->stx_uid << std::endl;
11021 tout(cct) << stx->stx_gid << std::endl;
11022 tout(cct) << stx->stx_size << std::endl;
11023 tout(cct) << stx->stx_mtime << std::endl;
11024 tout(cct) << stx->stx_atime << std::endl;
11025 tout(cct) << stx->stx_btime << std::endl;
11026 tout(cct) << mask << std::endl;
11027
11fdf7f2 11028 if (!fuse_default_permissions) {
7c673cae
FG
11029 int res = may_setattr(in, stx, mask, perms);
11030 if (res < 0)
11031 return res;
11032 }
11033
11034 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11035
11036 return __setattrx(in, stx, mask, perms, inp);
11037}
11038
11039int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11040 const UserPerm& perms)
11041{
11fdf7f2 11042 std::lock_guard lock(client_lock);
181888fb
FG
11043
11044 if (unmounting)
11045 return -ENOTCONN;
11046
7c673cae
FG
11047 InodeRef target(in);
11048 int res = _ll_setattrx(in, stx, mask, perms, &target);
11049 if (res == 0) {
11fdf7f2 11050 ceph_assert(in == target.get());
7c673cae
FG
11051 fill_statx(in, in->caps_issued(), stx);
11052 }
11053
11fdf7f2 11054 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11055 return res;
11056}
11057
11058int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11059 const UserPerm& perms)
11060{
11061 struct ceph_statx stx;
11062 stat_to_statx(attr, &stx);
11063
11fdf7f2 11064 std::lock_guard lock(client_lock);
181888fb
FG
11065
11066 if (unmounting)
11067 return -ENOTCONN;
11068
7c673cae
FG
11069 InodeRef target(in);
11070 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11071 if (res == 0) {
11fdf7f2 11072 ceph_assert(in == target.get());
7c673cae
FG
11073 fill_stat(in, attr);
11074 }
11075
11fdf7f2 11076 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11077 return res;
11078}
11079
11080
11081// ----------
11082// xattrs
11083
11084int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11085 const UserPerm& perms)
11086{
11fdf7f2 11087 std::lock_guard lock(client_lock);
181888fb
FG
11088
11089 if (unmounting)
11090 return -ENOTCONN;
11091
7c673cae
FG
11092 InodeRef in;
11093 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11094 if (r < 0)
11095 return r;
11096 return _getxattr(in, name, value, size, perms);
11097}
11098
11099int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11100 const UserPerm& perms)
11101{
11fdf7f2 11102 std::lock_guard lock(client_lock);
181888fb
FG
11103
11104 if (unmounting)
11105 return -ENOTCONN;
11106
7c673cae
FG
11107 InodeRef in;
11108 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11109 if (r < 0)
11110 return r;
11111 return _getxattr(in, name, value, size, perms);
11112}
11113
11114int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11115 const UserPerm& perms)
11116{
11fdf7f2 11117 std::lock_guard lock(client_lock);
181888fb
FG
11118
11119 if (unmounting)
11120 return -ENOTCONN;
11121
7c673cae
FG
11122 Fh *f = get_filehandle(fd);
11123 if (!f)
11124 return -EBADF;
11125 return _getxattr(f->inode, name, value, size, perms);
11126}
11127
11128int Client::listxattr(const char *path, char *list, size_t size,
11129 const UserPerm& perms)
11130{
11fdf7f2 11131 std::lock_guard lock(client_lock);
181888fb
FG
11132
11133 if (unmounting)
11134 return -ENOTCONN;
11135
7c673cae
FG
11136 InodeRef in;
11137 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11138 if (r < 0)
11139 return r;
11140 return Client::_listxattr(in.get(), list, size, perms);
11141}
11142
11143int Client::llistxattr(const char *path, char *list, size_t size,
11144 const UserPerm& perms)
11145{
11fdf7f2 11146 std::lock_guard lock(client_lock);
181888fb
FG
11147
11148 if (unmounting)
11149 return -ENOTCONN;
11150
7c673cae
FG
11151 InodeRef in;
11152 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11153 if (r < 0)
11154 return r;
11155 return Client::_listxattr(in.get(), list, size, perms);
11156}
11157
11158int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11159{
11fdf7f2 11160 std::lock_guard lock(client_lock);
181888fb
FG
11161
11162 if (unmounting)
11163 return -ENOTCONN;
11164
7c673cae
FG
11165 Fh *f = get_filehandle(fd);
11166 if (!f)
11167 return -EBADF;
11168 return Client::_listxattr(f->inode.get(), list, size, perms);
11169}
11170
11171int Client::removexattr(const char *path, const char *name,
11172 const UserPerm& perms)
11173{
11fdf7f2 11174 std::lock_guard lock(client_lock);
181888fb
FG
11175
11176 if (unmounting)
11177 return -ENOTCONN;
11178
7c673cae
FG
11179 InodeRef in;
11180 int r = Client::path_walk(path, &in, perms, true);
11181 if (r < 0)
11182 return r;
11183 return _removexattr(in, name, perms);
11184}
11185
11186int Client::lremovexattr(const char *path, const char *name,
11187 const UserPerm& perms)
11188{
11fdf7f2 11189 std::lock_guard lock(client_lock);
181888fb
FG
11190
11191 if (unmounting)
11192 return -ENOTCONN;
11193
7c673cae
FG
11194 InodeRef in;
11195 int r = Client::path_walk(path, &in, perms, false);
11196 if (r < 0)
11197 return r;
11198 return _removexattr(in, name, perms);
11199}
11200
11201int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11202{
11fdf7f2 11203 std::lock_guard lock(client_lock);
181888fb
FG
11204
11205 if (unmounting)
11206 return -ENOTCONN;
11207
7c673cae
FG
11208 Fh *f = get_filehandle(fd);
11209 if (!f)
11210 return -EBADF;
11211 return _removexattr(f->inode, name, perms);
11212}
11213
11214int Client::setxattr(const char *path, const char *name, const void *value,
11215 size_t size, int flags, const UserPerm& perms)
11216{
11217 _setxattr_maybe_wait_for_osdmap(name, value, size);
11218
11fdf7f2 11219 std::lock_guard lock(client_lock);
181888fb
FG
11220
11221 if (unmounting)
11222 return -ENOTCONN;
11223
7c673cae
FG
11224 InodeRef in;
11225 int r = Client::path_walk(path, &in, perms, true);
11226 if (r < 0)
11227 return r;
11228 return _setxattr(in, name, value, size, flags, perms);
11229}
11230
11231int Client::lsetxattr(const char *path, const char *name, const void *value,
11232 size_t size, int flags, const UserPerm& perms)
11233{
11234 _setxattr_maybe_wait_for_osdmap(name, value, size);
11235
11fdf7f2 11236 std::lock_guard lock(client_lock);
181888fb
FG
11237
11238 if (unmounting)
11239 return -ENOTCONN;
11240
7c673cae
FG
11241 InodeRef in;
11242 int r = Client::path_walk(path, &in, perms, false);
11243 if (r < 0)
11244 return r;
11245 return _setxattr(in, name, value, size, flags, perms);
11246}
11247
11248int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11249 int flags, const UserPerm& perms)
11250{
11251 _setxattr_maybe_wait_for_osdmap(name, value, size);
11252
11fdf7f2 11253 std::lock_guard lock(client_lock);
181888fb
FG
11254
11255 if (unmounting)
11256 return -ENOTCONN;
11257
7c673cae
FG
11258 Fh *f = get_filehandle(fd);
11259 if (!f)
11260 return -EBADF;
11261 return _setxattr(f->inode, name, value, size, flags, perms);
11262}
11263
11264int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11265 const UserPerm& perms)
11266{
11267 int r;
11268
11269 const VXattr *vxattr = _match_vxattr(in, name);
11270 if (vxattr) {
11271 r = -ENODATA;
11272
11273 // Do a force getattr to get the latest quota before returning
11274 // a value to userspace.
28e407b8
AA
11275 int flags = 0;
11276 if (vxattr->flags & VXATTR_RSTAT) {
11277 flags |= CEPH_STAT_RSTAT;
11278 }
11279 r = _getattr(in, flags, perms, true);
7c673cae
FG
11280 if (r != 0) {
11281 // Error from getattr!
11282 return r;
11283 }
11284
11285 // call pointer-to-member function
11286 char buf[256];
11287 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11288 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11289 } else {
11290 r = -ENODATA;
11291 }
11292
11293 if (size != 0) {
11294 if (r > (int)size) {
11295 r = -ERANGE;
11296 } else if (r > 0) {
11297 memcpy(value, buf, r);
11298 }
11299 }
11300 goto out;
11301 }
11302
11303 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11304 r = -EOPNOTSUPP;
11305 goto out;
11306 }
11307
11308 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11309 if (r == 0) {
11310 string n(name);
11311 r = -ENODATA;
11312 if (in->xattrs.count(n)) {
11313 r = in->xattrs[n].length();
11314 if (r > 0 && size != 0) {
11315 if (size >= (unsigned)r)
11316 memcpy(value, in->xattrs[n].c_str(), r);
11317 else
11318 r = -ERANGE;
11319 }
11320 }
11321 }
11322 out:
1adf2230 11323 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11324 return r;
11325}
11326
11327int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11328 const UserPerm& perms)
11329{
11330 if (cct->_conf->client_permissions) {
11331 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11332 if (r < 0)
11333 return r;
11334 }
11335 return _getxattr(in.get(), name, value, size, perms);
11336}
11337
11338int Client::ll_getxattr(Inode *in, const char *name, void *value,
11339 size_t size, const UserPerm& perms)
11340{
11fdf7f2 11341 std::lock_guard lock(client_lock);
7c673cae 11342
181888fb
FG
11343 if (unmounting)
11344 return -ENOTCONN;
11345
7c673cae
FG
11346 vinodeno_t vino = _get_vino(in);
11347
11fdf7f2
TL
11348 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11349 tout(cct) << __func__ << std::endl;
7c673cae
FG
11350 tout(cct) << vino.ino.val << std::endl;
11351 tout(cct) << name << std::endl;
11352
11fdf7f2 11353 if (!fuse_default_permissions) {
7c673cae
FG
11354 int r = xattr_permission(in, name, MAY_READ, perms);
11355 if (r < 0)
11356 return r;
11357 }
11358
11359 return _getxattr(in, name, value, size, perms);
11360}
11361
11362int Client::_listxattr(Inode *in, char *name, size_t size,
11363 const UserPerm& perms)
11364{
81eedcae 11365 bool len_only = (size == 0);
7c673cae 11366 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
81eedcae
TL
11367 if (r != 0) {
11368 goto out;
11369 }
7c673cae 11370
81eedcae
TL
11371 r = 0;
11372 for (const auto& p : in->xattrs) {
11373 size_t this_len = p.first.length() + 1;
11374 r += this_len;
11375 if (len_only)
11376 continue;
7c673cae 11377
81eedcae
TL
11378 if (this_len > size) {
11379 r = -ERANGE;
11380 goto out;
11381 }
11382
11383 memcpy(name, p.first.c_str(), this_len);
11384 name += this_len;
11385 size -= this_len;
11386 }
11387
11388 const VXattr *vxattr;
11389 for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
11390 if (vxattr->hidden)
11391 continue;
11392 // call pointer-to-member function
11393 if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
11394 continue;
11395
11396 size_t this_len = vxattr->name.length() + 1;
11397 r += this_len;
11398 if (len_only)
11399 continue;
11400
11401 if (this_len > size) {
11402 r = -ERANGE;
11403 goto out;
7c673cae 11404 }
81eedcae
TL
11405
11406 memcpy(name, vxattr->name.c_str(), this_len);
11407 name += this_len;
11408 size -= this_len;
7c673cae 11409 }
81eedcae 11410out:
11fdf7f2 11411 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
11412 return r;
11413}
11414
11415int Client::ll_listxattr(Inode *in, char *names, size_t size,
11416 const UserPerm& perms)
11417{
11fdf7f2 11418 std::lock_guard lock(client_lock);
7c673cae 11419
181888fb
FG
11420 if (unmounting)
11421 return -ENOTCONN;
11422
7c673cae
FG
11423 vinodeno_t vino = _get_vino(in);
11424
11fdf7f2
TL
11425 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11426 tout(cct) << __func__ << std::endl;
7c673cae
FG
11427 tout(cct) << vino.ino.val << std::endl;
11428 tout(cct) << size << std::endl;
11429
11430 return _listxattr(in, names, size, perms);
11431}
11432
11433int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11434 size_t size, int flags, const UserPerm& perms)
11435{
11436
11437 int xattr_flags = 0;
11438 if (!value)
11439 xattr_flags |= CEPH_XATTR_REMOVE;
11440 if (flags & XATTR_CREATE)
11441 xattr_flags |= CEPH_XATTR_CREATE;
11442 if (flags & XATTR_REPLACE)
11443 xattr_flags |= CEPH_XATTR_REPLACE;
11444
11445 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11446 filepath path;
11447 in->make_nosnap_relative_path(path);
11448 req->set_filepath(path);
11449 req->set_string2(name);
11450 req->set_inode(in);
11451 req->head.args.setxattr.flags = xattr_flags;
11452
11453 bufferlist bl;
11fdf7f2 11454 assert (value || size == 0);
7c673cae
FG
11455 bl.append((const char*)value, size);
11456 req->set_data(bl);
11457
11458 int res = make_request(req, perms);
11459
11460 trim_cache();
11fdf7f2 11461 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11462 res << dendl;
11463 return res;
11464}
11465
11466int Client::_setxattr(Inode *in, const char *name, const void *value,
11467 size_t size, int flags, const UserPerm& perms)
11468{
11469 if (in->snapid != CEPH_NOSNAP) {
11470 return -EROFS;
11471 }
11472
11473 bool posix_acl_xattr = false;
11474 if (acl_type == POSIX_ACL)
11475 posix_acl_xattr = !strncmp(name, "system.", 7);
11476
11477 if (strncmp(name, "user.", 5) &&
11478 strncmp(name, "security.", 9) &&
11479 strncmp(name, "trusted.", 8) &&
11480 strncmp(name, "ceph.", 5) &&
11481 !posix_acl_xattr)
11482 return -EOPNOTSUPP;
11483
11fdf7f2
TL
11484 bool check_realm = false;
11485
7c673cae
FG
11486 if (posix_acl_xattr) {
11487 if (!strcmp(name, ACL_EA_ACCESS)) {
11488 mode_t new_mode = in->mode;
11489 if (value) {
11490 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11491 if (ret < 0)
11492 return ret;
11493 if (ret == 0) {
11494 value = NULL;
11495 size = 0;
11496 }
11497 if (new_mode != in->mode) {
11498 struct ceph_statx stx;
11499 stx.stx_mode = new_mode;
11500 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11501 if (ret < 0)
11502 return ret;
11503 }
11504 }
11505 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11506 if (value) {
11507 if (!S_ISDIR(in->mode))
11508 return -EACCES;
11509 int ret = posix_acl_check(value, size);
11510 if (ret < 0)
11511 return -EINVAL;
11512 if (ret == 0) {
11513 value = NULL;
11514 size = 0;
11515 }
11516 }
11517 } else {
11518 return -EOPNOTSUPP;
11519 }
11520 } else {
11521 const VXattr *vxattr = _match_vxattr(in, name);
11fdf7f2
TL
11522 if (vxattr) {
11523 if (vxattr->readonly)
11524 return -EOPNOTSUPP;
11525 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11526 check_realm = true;
11527 }
7c673cae
FG
11528 }
11529
11fdf7f2
TL
11530 int ret = _do_setxattr(in, name, value, size, flags, perms);
11531 if (ret >= 0 && check_realm) {
11532 // check if snaprealm was created for quota inode
11533 if (in->quota.is_enable() &&
11534 !(in->snaprealm && in->snaprealm->ino == in->ino))
11535 ret = -EOPNOTSUPP;
11536 }
11537
11538 return ret;
7c673cae
FG
11539}
11540
11541int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11542 size_t size, int flags, const UserPerm& perms)
11543{
11544 if (cct->_conf->client_permissions) {
11545 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11546 if (r < 0)
11547 return r;
11548 }
11549 return _setxattr(in.get(), name, value, size, flags, perms);
11550}
11551
11552int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11553{
11554 string tmp;
11555 if (name == "layout") {
11556 string::iterator begin = value.begin();
11557 string::iterator end = value.end();
11558 keys_and_values<string::iterator> p; // create instance of parser
11559 std::map<string, string> m; // map to receive results
11560 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11561 return -EINVAL;
11562 }
11563 if (begin != end)
11564 return -EINVAL;
11565 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11566 if (q->first == "pool") {
11567 tmp = q->second;
11568 break;
11569 }
11570 }
11571 } else if (name == "layout.pool") {
11572 tmp = value;
11573 }
11574
11575 if (tmp.length()) {
11576 int64_t pool;
11577 try {
11578 pool = boost::lexical_cast<unsigned>(tmp);
11579 if (!osdmap->have_pg_pool(pool))
11580 return -ENOENT;
11581 } catch (boost::bad_lexical_cast const&) {
11582 pool = osdmap->lookup_pg_pool_name(tmp);
11583 if (pool < 0) {
11584 return -ENOENT;
11585 }
11586 }
11587 }
11588
11589 return 0;
11590}
11591
11592void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11593{
11594 // For setting pool of layout, MetaRequest need osdmap epoch.
11595 // There is a race which create a new data pool but client and mds both don't have.
11596 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11597 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11598 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11599 string rest(strstr(name, "layout"));
11600 string v((const char*)value, size);
11601 int r = objecter->with_osdmap([&](const OSDMap& o) {
11602 return _setxattr_check_data_pool(rest, v, &o);
11603 });
11604
11605 if (r == -ENOENT) {
11606 C_SaferCond ctx;
11607 objecter->wait_for_latest_osdmap(&ctx);
11608 ctx.wait();
11609 }
11610 }
11611}
11612
11613int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11614 size_t size, int flags, const UserPerm& perms)
11615{
11616 _setxattr_maybe_wait_for_osdmap(name, value, size);
11617
11fdf7f2 11618 std::lock_guard lock(client_lock);
7c673cae 11619
181888fb
FG
11620 if (unmounting)
11621 return -ENOTCONN;
11622
7c673cae
FG
11623 vinodeno_t vino = _get_vino(in);
11624
11fdf7f2
TL
11625 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11626 tout(cct) << __func__ << std::endl;
7c673cae
FG
11627 tout(cct) << vino.ino.val << std::endl;
11628 tout(cct) << name << std::endl;
11629
11fdf7f2 11630 if (!fuse_default_permissions) {
7c673cae
FG
11631 int r = xattr_permission(in, name, MAY_WRITE, perms);
11632 if (r < 0)
11633 return r;
11634 }
11635 return _setxattr(in, name, value, size, flags, perms);
11636}
11637
11638int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11639{
11640 if (in->snapid != CEPH_NOSNAP) {
11641 return -EROFS;
11642 }
11643
11644 // same xattrs supported by kernel client
11645 if (strncmp(name, "user.", 5) &&
11646 strncmp(name, "system.", 7) &&
11647 strncmp(name, "security.", 9) &&
11648 strncmp(name, "trusted.", 8) &&
11649 strncmp(name, "ceph.", 5))
11650 return -EOPNOTSUPP;
11651
11652 const VXattr *vxattr = _match_vxattr(in, name);
11653 if (vxattr && vxattr->readonly)
11654 return -EOPNOTSUPP;
11655
11656 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11657 filepath path;
11658 in->make_nosnap_relative_path(path);
11659 req->set_filepath(path);
11660 req->set_filepath2(name);
11661 req->set_inode(in);
11662
11663 int res = make_request(req, perms);
11664
11665 trim_cache();
1adf2230 11666 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11667 return res;
11668}
11669
11670int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11671{
11672 if (cct->_conf->client_permissions) {
11673 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11674 if (r < 0)
11675 return r;
11676 }
11677 return _removexattr(in.get(), name, perms);
11678}
11679
11680int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11681{
11fdf7f2 11682 std::lock_guard lock(client_lock);
7c673cae 11683
181888fb
FG
11684 if (unmounting)
11685 return -ENOTCONN;
11686
7c673cae
FG
11687 vinodeno_t vino = _get_vino(in);
11688
11689 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11690 tout(cct) << "ll_removexattr" << std::endl;
11691 tout(cct) << vino.ino.val << std::endl;
11692 tout(cct) << name << std::endl;
11693
11fdf7f2 11694 if (!fuse_default_permissions) {
7c673cae
FG
11695 int r = xattr_permission(in, name, MAY_WRITE, perms);
11696 if (r < 0)
11697 return r;
11698 }
11699
11700 return _removexattr(in, name, perms);
11701}
11702
11703bool Client::_vxattrcb_quota_exists(Inode *in)
11704{
11fdf7f2
TL
11705 return in->quota.is_enable() &&
11706 in->snaprealm && in->snaprealm->ino == in->ino;
7c673cae
FG
11707}
11708size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11709{
11710 return snprintf(val, size,
11711 "max_bytes=%lld max_files=%lld",
11712 (long long int)in->quota.max_bytes,
11713 (long long int)in->quota.max_files);
11714}
11715size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11716{
11717 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11718}
11719size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11720{
11721 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11722}
11723
11724bool Client::_vxattrcb_layout_exists(Inode *in)
11725{
11726 return in->layout != file_layout_t();
11727}
11728size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11729{
11730 int r = snprintf(val, size,
11fdf7f2 11731 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
11732 (unsigned long long)in->layout.stripe_unit,
11733 (unsigned long long)in->layout.stripe_count,
11734 (unsigned long long)in->layout.object_size);
11735 objecter->with_osdmap([&](const OSDMap& o) {
11736 if (o.have_pg_pool(in->layout.pool_id))
11737 r += snprintf(val + r, size - r, "%s",
11738 o.get_pool_name(in->layout.pool_id).c_str());
11739 else
11740 r += snprintf(val + r, size - r, "%" PRIu64,
11741 (uint64_t)in->layout.pool_id);
11742 });
11743 if (in->layout.pool_ns.length())
11744 r += snprintf(val + r, size - r, " pool_namespace=%s",
11745 in->layout.pool_ns.c_str());
11746 return r;
11747}
11748size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11749{
11fdf7f2 11750 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
7c673cae
FG
11751}
11752size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11753{
11fdf7f2 11754 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
7c673cae
FG
11755}
11756size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11757{
11fdf7f2 11758 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
7c673cae
FG
11759}
11760size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11761{
11762 size_t r;
11763 objecter->with_osdmap([&](const OSDMap& o) {
11764 if (o.have_pg_pool(in->layout.pool_id))
11765 r = snprintf(val, size, "%s", o.get_pool_name(
11766 in->layout.pool_id).c_str());
11767 else
11768 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11769 });
11770 return r;
11771}
11772size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11773{
11774 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11775}
11776size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11777{
11fdf7f2 11778 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
7c673cae
FG
11779}
11780size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11781{
11fdf7f2 11782 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
7c673cae
FG
11783}
11784size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11785{
11fdf7f2 11786 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
7c673cae
FG
11787}
11788size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11789{
11fdf7f2 11790 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
7c673cae
FG
11791}
11792size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11793{
11fdf7f2 11794 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
7c673cae
FG
11795}
11796size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11797{
11fdf7f2 11798 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
7c673cae
FG
11799}
11800size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11801{
11fdf7f2 11802 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
7c673cae
FG
11803}
11804size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11805{
81eedcae 11806 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
7c673cae
FG
11807 (long)in->rstat.rctime.nsec());
11808}
11fdf7f2
TL
11809bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11810{
11811 return in->dir_pin != -ENODATA;
11812}
11813size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11814{
11815 return snprintf(val, size, "%ld", (long)in->dir_pin);
11816}
7c673cae 11817
81eedcae
TL
11818bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11819{
11820 return !in->snap_btime.is_zero();
11821}
11822
11823size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11824{
11825 return snprintf(val, size, "%llu.%09lu",
11826 (long long unsigned)in->snap_btime.sec(),
11827 (long unsigned)in->snap_btime.nsec());
11828}
11829
7c673cae
FG
11830#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11831#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11832
11833#define XATTR_NAME_CEPH(_type, _name) \
11834{ \
11835 name: CEPH_XATTR_NAME(_type, _name), \
11836 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11837 readonly: true, \
11838 hidden: false, \
11839 exists_cb: NULL, \
28e407b8
AA
11840 flags: 0, \
11841}
11842#define XATTR_NAME_CEPH2(_type, _name, _flags) \
11843{ \
11844 name: CEPH_XATTR_NAME(_type, _name), \
11845 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11846 readonly: true, \
11847 hidden: false, \
11848 exists_cb: NULL, \
11849 flags: _flags, \
7c673cae
FG
11850}
11851#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11852{ \
11853 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11854 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11855 readonly: false, \
11856 hidden: true, \
11857 exists_cb: &Client::_vxattrcb_layout_exists, \
28e407b8 11858 flags: 0, \
7c673cae
FG
11859}
11860#define XATTR_QUOTA_FIELD(_type, _name) \
11861{ \
11862 name: CEPH_XATTR_NAME(_type, _name), \
11863 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11864 readonly: false, \
11865 hidden: true, \
11866 exists_cb: &Client::_vxattrcb_quota_exists, \
28e407b8 11867 flags: 0, \
7c673cae
FG
11868}
11869
11870const Client::VXattr Client::_dir_vxattrs[] = {
11871 {
11872 name: "ceph.dir.layout",
11873 getxattr_cb: &Client::_vxattrcb_layout,
11874 readonly: false,
11875 hidden: true,
11876 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11877 flags: 0,
7c673cae
FG
11878 },
11879 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11880 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11881 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11882 XATTR_LAYOUT_FIELD(dir, layout, pool),
11883 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11884 XATTR_NAME_CEPH(dir, entries),
11885 XATTR_NAME_CEPH(dir, files),
11886 XATTR_NAME_CEPH(dir, subdirs),
28e407b8
AA
11887 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11888 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11889 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11890 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11891 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
11892 {
11893 name: "ceph.quota",
11894 getxattr_cb: &Client::_vxattrcb_quota,
11895 readonly: false,
11896 hidden: true,
11897 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 11898 flags: 0,
7c673cae
FG
11899 },
11900 XATTR_QUOTA_FIELD(quota, max_bytes),
11901 XATTR_QUOTA_FIELD(quota, max_files),
11fdf7f2
TL
11902 {
11903 name: "ceph.dir.pin",
11904 getxattr_cb: &Client::_vxattrcb_dir_pin,
11905 readonly: false,
11906 hidden: true,
11907 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11908 flags: 0,
11909 },
81eedcae
TL
11910 {
11911 name: "ceph.snap.btime",
11912 getxattr_cb: &Client::_vxattrcb_snap_btime,
11913 readonly: true,
11914 hidden: false,
11915 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11916 flags: 0,
11917 },
7c673cae
FG
11918 { name: "" } /* Required table terminator */
11919};
11920
11921const Client::VXattr Client::_file_vxattrs[] = {
11922 {
11923 name: "ceph.file.layout",
11924 getxattr_cb: &Client::_vxattrcb_layout,
11925 readonly: false,
11926 hidden: true,
11927 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11928 flags: 0,
7c673cae
FG
11929 },
11930 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11931 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11932 XATTR_LAYOUT_FIELD(file, layout, object_size),
11933 XATTR_LAYOUT_FIELD(file, layout, pool),
11934 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
81eedcae
TL
11935 {
11936 name: "ceph.snap.btime",
11937 getxattr_cb: &Client::_vxattrcb_snap_btime,
11938 readonly: true,
11939 hidden: false,
11940 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11941 flags: 0,
11942 },
7c673cae
FG
11943 { name: "" } /* Required table terminator */
11944};
11945
11946const Client::VXattr *Client::_get_vxattrs(Inode *in)
11947{
11948 if (in->is_dir())
11949 return _dir_vxattrs;
11950 else if (in->is_file())
11951 return _file_vxattrs;
11952 return NULL;
11953}
11954
11955const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11956{
11957 if (strncmp(name, "ceph.", 5) == 0) {
11958 const VXattr *vxattr = _get_vxattrs(in);
11959 if (vxattr) {
11960 while (!vxattr->name.empty()) {
11961 if (vxattr->name == name)
11962 return vxattr;
11963 vxattr++;
11964 }
11965 }
11966 }
11967 return NULL;
11968}
11969
7c673cae
FG
11970int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11971{
11fdf7f2 11972 std::lock_guard lock(client_lock);
7c673cae 11973
181888fb
FG
11974 if (unmounting)
11975 return -ENOTCONN;
11976
7c673cae
FG
11977 vinodeno_t vino = _get_vino(in);
11978
11979 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11980 tout(cct) << "ll_readlink" << std::endl;
11981 tout(cct) << vino.ino.val << std::endl;
11982
11fdf7f2
TL
11983 for (auto dn : in->dentries) {
11984 touch_dn(dn);
7c673cae
FG
11985 }
11986
11987 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11988 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11989 return r;
11990}
11991
11992int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11993 const UserPerm& perms, InodeRef *inp)
11994{
1adf2230 11995 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11996 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11997 << ", gid " << perms.gid() << ")" << dendl;
11998
11999 if (strlen(name) > NAME_MAX)
12000 return -ENAMETOOLONG;
12001
12002 if (dir->snapid != CEPH_NOSNAP) {
12003 return -EROFS;
12004 }
12005 if (is_quota_files_exceeded(dir, perms)) {
12006 return -EDQUOT;
12007 }
12008
12009 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12010
12011 filepath path;
12012 dir->make_nosnap_relative_path(path);
12013 path.push_dentry(name);
12014 req->set_filepath(path);
12015 req->set_inode(dir);
12016 req->head.args.mknod.rdev = rdev;
12017 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12018 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12019
12020 bufferlist xattrs_bl;
12021 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12022 if (res < 0)
12023 goto fail;
12024 req->head.args.mknod.mode = mode;
12025 if (xattrs_bl.length() > 0)
12026 req->set_data(xattrs_bl);
12027
12028 Dentry *de;
12029 res = get_or_create(dir, name, &de);
12030 if (res < 0)
12031 goto fail;
12032 req->set_dentry(de);
12033
12034 res = make_request(req, perms, inp);
12035
12036 trim_cache();
12037
1adf2230 12038 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
12039 return res;
12040
12041 fail:
12042 put_request(req);
12043 return res;
12044}
12045
12046int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12047 dev_t rdev, struct stat *attr, Inode **out,
12048 const UserPerm& perms)
12049{
11fdf7f2 12050 std::lock_guard lock(client_lock);
7c673cae 12051
181888fb
FG
12052 if (unmounting)
12053 return -ENOTCONN;
12054
7c673cae
FG
12055 vinodeno_t vparent = _get_vino(parent);
12056
12057 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12058 tout(cct) << "ll_mknod" << std::endl;
12059 tout(cct) << vparent.ino.val << std::endl;
12060 tout(cct) << name << std::endl;
12061 tout(cct) << mode << std::endl;
12062 tout(cct) << rdev << std::endl;
12063
11fdf7f2 12064 if (!fuse_default_permissions) {
7c673cae
FG
12065 int r = may_create(parent, perms);
12066 if (r < 0)
12067 return r;
12068 }
12069
12070 InodeRef in;
12071 int r = _mknod(parent, name, mode, rdev, perms, &in);
12072 if (r == 0) {
12073 fill_stat(in, attr);
12074 _ll_get(in.get());
12075 }
12076 tout(cct) << attr->st_ino << std::endl;
12077 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12078 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12079 *out = in.get();
12080 return r;
12081}
12082
12083int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12084 dev_t rdev, Inode **out,
12085 struct ceph_statx *stx, unsigned want, unsigned flags,
12086 const UserPerm& perms)
12087{
12088 unsigned caps = statx_to_mask(flags, want);
11fdf7f2 12089 std::lock_guard lock(client_lock);
7c673cae 12090
181888fb
FG
12091 if (unmounting)
12092 return -ENOTCONN;
12093
7c673cae
FG
12094 vinodeno_t vparent = _get_vino(parent);
12095
12096 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12097 tout(cct) << "ll_mknodx" << std::endl;
12098 tout(cct) << vparent.ino.val << std::endl;
12099 tout(cct) << name << std::endl;
12100 tout(cct) << mode << std::endl;
12101 tout(cct) << rdev << std::endl;
12102
11fdf7f2 12103 if (!fuse_default_permissions) {
7c673cae
FG
12104 int r = may_create(parent, perms);
12105 if (r < 0)
12106 return r;
12107 }
12108
12109 InodeRef in;
12110 int r = _mknod(parent, name, mode, rdev, perms, &in);
12111 if (r == 0) {
12112 fill_statx(in, caps, stx);
12113 _ll_get(in.get());
12114 }
12115 tout(cct) << stx->stx_ino << std::endl;
12116 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12117 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12118 *out = in.get();
12119 return r;
12120}
12121
12122int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12123 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12124 int object_size, const char *data_pool, bool *created,
12125 const UserPerm& perms)
12126{
1adf2230 12127 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
12128 mode << dec << ")" << dendl;
12129
12130 if (strlen(name) > NAME_MAX)
12131 return -ENAMETOOLONG;
12132 if (dir->snapid != CEPH_NOSNAP) {
12133 return -EROFS;
12134 }
12135 if (is_quota_files_exceeded(dir, perms)) {
12136 return -EDQUOT;
12137 }
12138
12139 // use normalized flags to generate cmode
11fdf7f2
TL
12140 int cflags = ceph_flags_sys2wire(flags);
12141 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12142 cflags |= CEPH_O_LAZY;
12143
12144 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
12145
12146 int64_t pool_id = -1;
12147 if (data_pool && *data_pool) {
12148 pool_id = objecter->with_osdmap(
12149 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12150 if (pool_id < 0)
12151 return -EINVAL;
12152 if (pool_id > 0xffffffffll)
12153 return -ERANGE; // bummer!
12154 }
12155
12156 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12157
12158 filepath path;
12159 dir->make_nosnap_relative_path(path);
12160 path.push_dentry(name);
12161 req->set_filepath(path);
12162 req->set_inode(dir);
11fdf7f2 12163 req->head.args.open.flags = cflags | CEPH_O_CREAT;
7c673cae
FG
12164
12165 req->head.args.open.stripe_unit = stripe_unit;
12166 req->head.args.open.stripe_count = stripe_count;
12167 req->head.args.open.object_size = object_size;
12168 if (cct->_conf->client_debug_getattr_caps)
12169 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12170 else
12171 req->head.args.open.mask = 0;
12172 req->head.args.open.pool = pool_id;
12173 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12174 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12175
12176 mode |= S_IFREG;
12177 bufferlist xattrs_bl;
12178 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12179 if (res < 0)
12180 goto fail;
12181 req->head.args.open.mode = mode;
12182 if (xattrs_bl.length() > 0)
12183 req->set_data(xattrs_bl);
12184
12185 Dentry *de;
12186 res = get_or_create(dir, name, &de);
12187 if (res < 0)
12188 goto fail;
12189 req->set_dentry(de);
12190
12191 res = make_request(req, perms, inp, created);
12192 if (res < 0) {
12193 goto reply_error;
12194 }
12195
12196 /* If the caller passed a value in fhp, do the open */
12197 if(fhp) {
12198 (*inp)->get_open_ref(cmode);
12199 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12200 }
12201
12202 reply_error:
12203 trim_cache();
12204
1adf2230 12205 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
12206 << " layout " << stripe_unit
12207 << ' ' << stripe_count
12208 << ' ' << object_size
12209 <<") = " << res << dendl;
12210 return res;
12211
12212 fail:
12213 put_request(req);
12214 return res;
12215}
12216
12217
12218int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12219 InodeRef *inp)
12220{
1adf2230 12221 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
12222 << mode << dec << ", uid " << perm.uid()
12223 << ", gid " << perm.gid() << ")" << dendl;
12224
12225 if (strlen(name) > NAME_MAX)
12226 return -ENAMETOOLONG;
12227
12228 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12229 return -EROFS;
12230 }
12231 if (is_quota_files_exceeded(dir, perm)) {
12232 return -EDQUOT;
12233 }
12234 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12235 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12236
12237 filepath path;
12238 dir->make_nosnap_relative_path(path);
12239 path.push_dentry(name);
12240 req->set_filepath(path);
12241 req->set_inode(dir);
12242 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12243 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12244
12245 mode |= S_IFDIR;
12246 bufferlist xattrs_bl;
12247 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12248 if (res < 0)
12249 goto fail;
12250 req->head.args.mkdir.mode = mode;
12251 if (xattrs_bl.length() > 0)
12252 req->set_data(xattrs_bl);
12253
12254 Dentry *de;
12255 res = get_or_create(dir, name, &de);
12256 if (res < 0)
12257 goto fail;
12258 req->set_dentry(de);
12259
12260 ldout(cct, 10) << "_mkdir: making request" << dendl;
12261 res = make_request(req, perm, inp);
12262 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12263
12264 trim_cache();
12265
1adf2230 12266 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
12267 return res;
12268
12269 fail:
12270 put_request(req);
12271 return res;
12272}
12273
12274int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12275 struct stat *attr, Inode **out, const UserPerm& perm)
12276{
11fdf7f2 12277 std::lock_guard lock(client_lock);
7c673cae 12278
181888fb
FG
12279 if (unmounting)
12280 return -ENOTCONN;
12281
7c673cae
FG
12282 vinodeno_t vparent = _get_vino(parent);
12283
12284 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12285 tout(cct) << "ll_mkdir" << std::endl;
12286 tout(cct) << vparent.ino.val << std::endl;
12287 tout(cct) << name << std::endl;
12288 tout(cct) << mode << std::endl;
12289
11fdf7f2 12290 if (!fuse_default_permissions) {
7c673cae
FG
12291 int r = may_create(parent, perm);
12292 if (r < 0)
12293 return r;
12294 }
12295
12296 InodeRef in;
12297 int r = _mkdir(parent, name, mode, perm, &in);
12298 if (r == 0) {
12299 fill_stat(in, attr);
12300 _ll_get(in.get());
12301 }
12302 tout(cct) << attr->st_ino << std::endl;
12303 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12304 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12305 *out = in.get();
12306 return r;
12307}
12308
12309int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12310 struct ceph_statx *stx, unsigned want, unsigned flags,
12311 const UserPerm& perms)
12312{
11fdf7f2 12313 std::lock_guard lock(client_lock);
7c673cae 12314
181888fb
FG
12315 if (unmounting)
12316 return -ENOTCONN;
12317
7c673cae
FG
12318 vinodeno_t vparent = _get_vino(parent);
12319
12320 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12321 tout(cct) << "ll_mkdirx" << std::endl;
12322 tout(cct) << vparent.ino.val << std::endl;
12323 tout(cct) << name << std::endl;
12324 tout(cct) << mode << std::endl;
12325
11fdf7f2 12326 if (!fuse_default_permissions) {
7c673cae
FG
12327 int r = may_create(parent, perms);
12328 if (r < 0)
12329 return r;
12330 }
12331
12332 InodeRef in;
12333 int r = _mkdir(parent, name, mode, perms, &in);
12334 if (r == 0) {
12335 fill_statx(in, statx_to_mask(flags, want), stx);
12336 _ll_get(in.get());
12337 } else {
12338 stx->stx_ino = 0;
12339 stx->stx_mask = 0;
12340 }
12341 tout(cct) << stx->stx_ino << std::endl;
12342 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12343 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12344 *out = in.get();
12345 return r;
12346}
12347
12348int Client::_symlink(Inode *dir, const char *name, const char *target,
12349 const UserPerm& perms, InodeRef *inp)
12350{
1adf2230 12351 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
12352 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12353 << dendl;
12354
12355 if (strlen(name) > NAME_MAX)
12356 return -ENAMETOOLONG;
12357
12358 if (dir->snapid != CEPH_NOSNAP) {
12359 return -EROFS;
12360 }
12361 if (is_quota_files_exceeded(dir, perms)) {
12362 return -EDQUOT;
12363 }
12364
12365 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12366
12367 filepath path;
12368 dir->make_nosnap_relative_path(path);
12369 path.push_dentry(name);
12370 req->set_filepath(path);
12371 req->set_inode(dir);
12372 req->set_string2(target);
12373 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12374 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12375
12376 Dentry *de;
12377 int res = get_or_create(dir, name, &de);
12378 if (res < 0)
12379 goto fail;
12380 req->set_dentry(de);
12381
12382 res = make_request(req, perms, inp);
12383
12384 trim_cache();
1adf2230 12385 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
12386 res << dendl;
12387 return res;
12388
12389 fail:
12390 put_request(req);
12391 return res;
12392}
12393
12394int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12395 struct stat *attr, Inode **out, const UserPerm& perms)
12396{
11fdf7f2 12397 std::lock_guard lock(client_lock);
7c673cae 12398
181888fb
FG
12399 if (unmounting)
12400 return -ENOTCONN;
12401
7c673cae
FG
12402 vinodeno_t vparent = _get_vino(parent);
12403
12404 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12405 << dendl;
12406 tout(cct) << "ll_symlink" << std::endl;
12407 tout(cct) << vparent.ino.val << std::endl;
12408 tout(cct) << name << std::endl;
12409 tout(cct) << value << std::endl;
12410
11fdf7f2 12411 if (!fuse_default_permissions) {
7c673cae
FG
12412 int r = may_create(parent, perms);
12413 if (r < 0)
12414 return r;
12415 }
12416
12417 InodeRef in;
12418 int r = _symlink(parent, name, value, perms, &in);
12419 if (r == 0) {
12420 fill_stat(in, attr);
12421 _ll_get(in.get());
12422 }
12423 tout(cct) << attr->st_ino << std::endl;
12424 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12425 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12426 *out = in.get();
12427 return r;
12428}
12429
12430int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12431 Inode **out, struct ceph_statx *stx, unsigned want,
12432 unsigned flags, const UserPerm& perms)
12433{
11fdf7f2 12434 std::lock_guard lock(client_lock);
7c673cae 12435
181888fb
FG
12436 if (unmounting)
12437 return -ENOTCONN;
12438
7c673cae
FG
12439 vinodeno_t vparent = _get_vino(parent);
12440
12441 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12442 << dendl;
12443 tout(cct) << "ll_symlinkx" << std::endl;
12444 tout(cct) << vparent.ino.val << std::endl;
12445 tout(cct) << name << std::endl;
12446 tout(cct) << value << std::endl;
12447
11fdf7f2 12448 if (!fuse_default_permissions) {
7c673cae
FG
12449 int r = may_create(parent, perms);
12450 if (r < 0)
12451 return r;
12452 }
12453
12454 InodeRef in;
12455 int r = _symlink(parent, name, value, perms, &in);
12456 if (r == 0) {
12457 fill_statx(in, statx_to_mask(flags, want), stx);
12458 _ll_get(in.get());
12459 }
12460 tout(cct) << stx->stx_ino << std::endl;
12461 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12462 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12463 *out = in.get();
12464 return r;
12465}
12466
12467int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12468{
1adf2230 12469 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12470 << " uid " << perm.uid() << " gid " << perm.gid()
12471 << ")" << dendl;
12472
12473 if (dir->snapid != CEPH_NOSNAP) {
12474 return -EROFS;
12475 }
12476
12477 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12478
12479 filepath path;
12480 dir->make_nosnap_relative_path(path);
12481 path.push_dentry(name);
12482 req->set_filepath(path);
12483
12484 InodeRef otherin;
b32b8144 12485 Inode *in;
7c673cae 12486 Dentry *de;
b32b8144 12487
7c673cae
FG
12488 int res = get_or_create(dir, name, &de);
12489 if (res < 0)
12490 goto fail;
12491 req->set_dentry(de);
12492 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12493 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12494
12495 res = _lookup(dir, name, 0, &otherin, perm);
12496 if (res < 0)
12497 goto fail;
b32b8144
FG
12498
12499 in = otherin.get();
12500 req->set_other_inode(in);
12501 in->break_all_delegs();
7c673cae
FG
12502 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12503
12504 req->set_inode(dir);
12505
12506 res = make_request(req, perm);
12507
12508 trim_cache();
1adf2230 12509 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12510 return res;
12511
12512 fail:
12513 put_request(req);
12514 return res;
12515}
12516
12517int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12518{
11fdf7f2 12519 std::lock_guard lock(client_lock);
7c673cae 12520
181888fb
FG
12521 if (unmounting)
12522 return -ENOTCONN;
12523
7c673cae
FG
12524 vinodeno_t vino = _get_vino(in);
12525
12526 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12527 tout(cct) << "ll_unlink" << std::endl;
12528 tout(cct) << vino.ino.val << std::endl;
12529 tout(cct) << name << std::endl;
12530
11fdf7f2 12531 if (!fuse_default_permissions) {
7c673cae
FG
12532 int r = may_delete(in, name, perm);
12533 if (r < 0)
12534 return r;
12535 }
12536 return _unlink(in, name, perm);
12537}
12538
12539int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12540{
1adf2230 12541 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12542 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12543
12544 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12545 return -EROFS;
12546 }
b32b8144
FG
12547
12548 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12549 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12550 filepath path;
12551 dir->make_nosnap_relative_path(path);
12552 path.push_dentry(name);
12553 req->set_filepath(path);
11fdf7f2 12554 req->set_inode(dir);
7c673cae
FG
12555
12556 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12557 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12558 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12559
12560 InodeRef in;
12561
12562 Dentry *de;
12563 int res = get_or_create(dir, name, &de);
12564 if (res < 0)
12565 goto fail;
b32b8144
FG
12566 if (op == CEPH_MDS_OP_RMDIR)
12567 req->set_dentry(de);
12568 else
12569 de->get();
12570
7c673cae
FG
12571 res = _lookup(dir, name, 0, &in, perms);
12572 if (res < 0)
12573 goto fail;
11fdf7f2
TL
12574
12575 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 12576 unlink(de, true, true);
b32b8144 12577 de->put();
7c673cae 12578 }
11fdf7f2 12579 req->set_other_inode(in.get());
7c673cae
FG
12580
12581 res = make_request(req, perms);
12582
12583 trim_cache();
1adf2230 12584 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12585 return res;
12586
12587 fail:
12588 put_request(req);
12589 return res;
12590}
12591
12592int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12593{
11fdf7f2 12594 std::lock_guard lock(client_lock);
7c673cae 12595
181888fb
FG
12596 if (unmounting)
12597 return -ENOTCONN;
12598
7c673cae
FG
12599 vinodeno_t vino = _get_vino(in);
12600
12601 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12602 tout(cct) << "ll_rmdir" << std::endl;
12603 tout(cct) << vino.ino.val << std::endl;
12604 tout(cct) << name << std::endl;
12605
11fdf7f2 12606 if (!fuse_default_permissions) {
7c673cae
FG
12607 int r = may_delete(in, name, perms);
12608 if (r < 0)
12609 return r;
12610 }
12611
12612 return _rmdir(in, name, perms);
12613}
12614
12615int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12616{
1adf2230 12617 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12618 << todir->ino << " " << toname
12619 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12620 << dendl;
12621
12622 if (fromdir->snapid != todir->snapid)
12623 return -EXDEV;
12624
12625 int op = CEPH_MDS_OP_RENAME;
12626 if (fromdir->snapid != CEPH_NOSNAP) {
12627 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12628 op = CEPH_MDS_OP_RENAMESNAP;
12629 else
12630 return -EROFS;
12631 }
12632 if (fromdir != todir) {
12633 Inode *fromdir_root =
12634 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12635 Inode *todir_root =
12636 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12637 if (fromdir_root != todir_root) {
12638 return -EXDEV;
12639 }
12640 }
12641
12642 InodeRef target;
12643 MetaRequest *req = new MetaRequest(op);
12644
12645 filepath from;
12646 fromdir->make_nosnap_relative_path(from);
12647 from.push_dentry(fromname);
12648 filepath to;
12649 todir->make_nosnap_relative_path(to);
12650 to.push_dentry(toname);
12651 req->set_filepath(to);
12652 req->set_filepath2(from);
12653
12654 Dentry *oldde;
12655 int res = get_or_create(fromdir, fromname, &oldde);
12656 if (res < 0)
12657 goto fail;
12658 Dentry *de;
12659 res = get_or_create(todir, toname, &de);
12660 if (res < 0)
12661 goto fail;
12662
12663 if (op == CEPH_MDS_OP_RENAME) {
12664 req->set_old_dentry(oldde);
12665 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12666 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12667
12668 req->set_dentry(de);
12669 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12670 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12671
12672 InodeRef oldin, otherin;
12673 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12674 if (res < 0)
12675 goto fail;
b32b8144
FG
12676
12677 Inode *oldinode = oldin.get();
12678 oldinode->break_all_delegs();
12679 req->set_old_inode(oldinode);
7c673cae
FG
12680 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12681
12682 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12683 switch (res) {
12684 case 0:
12685 {
12686 Inode *in = otherin.get();
12687 req->set_other_inode(in);
12688 in->break_all_delegs();
12689 }
7c673cae 12690 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12691 break;
12692 case -ENOENT:
12693 break;
12694 default:
12695 goto fail;
7c673cae
FG
12696 }
12697
12698 req->set_inode(todir);
12699 } else {
12700 // renamesnap reply contains no tracedn, so we need to invalidate
12701 // dentry manually
12702 unlink(oldde, true, true);
12703 unlink(de, true, true);
11fdf7f2
TL
12704
12705 req->set_inode(todir);
7c673cae
FG
12706 }
12707
12708 res = make_request(req, perm, &target);
12709 ldout(cct, 10) << "rename result is " << res << dendl;
12710
12711 // renamed item from our cache
12712
12713 trim_cache();
1adf2230 12714 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12715 return res;
12716
12717 fail:
12718 put_request(req);
12719 return res;
12720}
12721
12722int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12723 const char *newname, const UserPerm& perm)
12724{
11fdf7f2 12725 std::lock_guard lock(client_lock);
7c673cae 12726
181888fb
FG
12727 if (unmounting)
12728 return -ENOTCONN;
12729
7c673cae
FG
12730 vinodeno_t vparent = _get_vino(parent);
12731 vinodeno_t vnewparent = _get_vino(newparent);
12732
12733 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12734 << vnewparent << " " << newname << dendl;
12735 tout(cct) << "ll_rename" << std::endl;
12736 tout(cct) << vparent.ino.val << std::endl;
12737 tout(cct) << name << std::endl;
12738 tout(cct) << vnewparent.ino.val << std::endl;
12739 tout(cct) << newname << std::endl;
12740
11fdf7f2 12741 if (!fuse_default_permissions) {
7c673cae
FG
12742 int r = may_delete(parent, name, perm);
12743 if (r < 0)
12744 return r;
12745 r = may_delete(newparent, newname, perm);
12746 if (r < 0 && r != -ENOENT)
12747 return r;
12748 }
12749
12750 return _rename(parent, name, newparent, newname, perm);
12751}
12752
12753int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12754{
1adf2230 12755 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12756 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12757
12758 if (strlen(newname) > NAME_MAX)
12759 return -ENAMETOOLONG;
12760
12761 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12762 return -EROFS;
12763 }
12764 if (is_quota_files_exceeded(dir, perm)) {
12765 return -EDQUOT;
12766 }
12767
b32b8144 12768 in->break_all_delegs();
7c673cae
FG
12769 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12770
12771 filepath path(newname, dir->ino);
12772 req->set_filepath(path);
12773 filepath existing(in->ino);
12774 req->set_filepath2(existing);
12775
12776 req->set_inode(dir);
12777 req->inode_drop = CEPH_CAP_FILE_SHARED;
12778 req->inode_unless = CEPH_CAP_FILE_EXCL;
12779
12780 Dentry *de;
12781 int res = get_or_create(dir, newname, &de);
12782 if (res < 0)
12783 goto fail;
12784 req->set_dentry(de);
12785
12786 res = make_request(req, perm, inp);
12787 ldout(cct, 10) << "link result is " << res << dendl;
12788
12789 trim_cache();
1adf2230 12790 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12791 return res;
12792
12793 fail:
12794 put_request(req);
12795 return res;
12796}
12797
12798int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12799 const UserPerm& perm)
12800{
11fdf7f2 12801 std::lock_guard lock(client_lock);
7c673cae 12802
181888fb
FG
12803 if (unmounting)
12804 return -ENOTCONN;
12805
7c673cae
FG
12806 vinodeno_t vino = _get_vino(in);
12807 vinodeno_t vnewparent = _get_vino(newparent);
12808
31f18b77 12809 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12810 newname << dendl;
12811 tout(cct) << "ll_link" << std::endl;
12812 tout(cct) << vino.ino.val << std::endl;
12813 tout(cct) << vnewparent << std::endl;
12814 tout(cct) << newname << std::endl;
12815
7c673cae
FG
12816 InodeRef target;
12817
11fdf7f2 12818 if (!fuse_default_permissions) {
7c673cae
FG
12819 if (S_ISDIR(in->mode))
12820 return -EPERM;
12821
11fdf7f2 12822 int r = may_hardlink(in, perm);
7c673cae
FG
12823 if (r < 0)
12824 return r;
12825
12826 r = may_create(newparent, perm);
12827 if (r < 0)
12828 return r;
12829 }
12830
12831 return _link(in, newparent, newname, perm, &target);
12832}
12833
12834int Client::ll_num_osds(void)
12835{
11fdf7f2 12836 std::lock_guard lock(client_lock);
7c673cae
FG
12837 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12838}
12839
12840int Client::ll_osdaddr(int osd, uint32_t *addr)
12841{
11fdf7f2 12842 std::lock_guard lock(client_lock);
181888fb 12843
7c673cae
FG
12844 entity_addr_t g;
12845 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12846 if (!o.exists(osd))
12847 return false;
11fdf7f2 12848 g = o.get_addrs(osd).front();
7c673cae
FG
12849 return true;
12850 });
12851 if (!exists)
12852 return -1;
12853 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12854 *addr = ntohl(nb_addr);
12855 return 0;
12856}
181888fb 12857
7c673cae
FG
12858uint32_t Client::ll_stripe_unit(Inode *in)
12859{
11fdf7f2 12860 std::lock_guard lock(client_lock);
7c673cae
FG
12861 return in->layout.stripe_unit;
12862}
12863
12864uint64_t Client::ll_snap_seq(Inode *in)
12865{
11fdf7f2 12866 std::lock_guard lock(client_lock);
7c673cae
FG
12867 return in->snaprealm->seq;
12868}
12869
12870int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12871{
11fdf7f2 12872 std::lock_guard lock(client_lock);
7c673cae
FG
12873 *layout = in->layout;
12874 return 0;
12875}
12876
12877int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12878{
12879 return ll_file_layout(fh->inode.get(), layout);
12880}
12881
12882/* Currently we cannot take advantage of redundancy in reads, since we
12883 would have to go through all possible placement groups (a
12884 potentially quite large number determined by a hash), and use CRUSH
12885 to calculate the appropriate set of OSDs for each placement group,
12886 then index into that. An array with one entry per OSD is much more
12887 tractable and works for demonstration purposes. */
12888
12889int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12890 file_layout_t* layout)
12891{
11fdf7f2 12892 std::lock_guard lock(client_lock);
181888fb 12893
28e407b8 12894 inodeno_t ino = in->ino;
7c673cae
FG
12895 uint32_t object_size = layout->object_size;
12896 uint32_t su = layout->stripe_unit;
12897 uint32_t stripe_count = layout->stripe_count;
12898 uint64_t stripes_per_object = object_size / su;
11fdf7f2 12899 uint64_t stripeno = 0, stripepos = 0;
7c673cae 12900
11fdf7f2
TL
12901 if(stripe_count) {
12902 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12903 stripepos = blockno % stripe_count; // which object in the object set (X)
12904 }
7c673cae
FG
12905 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12906 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12907
12908 object_t oid = file_object_t(ino, objectno);
12909 return objecter->with_osdmap([&](const OSDMap& o) {
12910 ceph_object_layout olayout =
12911 o.file_to_object_layout(oid, *layout);
12912 pg_t pg = (pg_t)olayout.ol_pgid;
12913 vector<int> osds;
12914 int primary;
12915 o.pg_to_acting_osds(pg, &osds, &primary);
12916 return primary;
12917 });
12918}
12919
12920/* Return the offset of the block, internal to the object */
12921
12922uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12923{
11fdf7f2 12924 std::lock_guard lock(client_lock);
7c673cae
FG
12925 file_layout_t *layout=&(in->layout);
12926 uint32_t object_size = layout->object_size;
12927 uint32_t su = layout->stripe_unit;
12928 uint64_t stripes_per_object = object_size / su;
12929
12930 return (blockno % stripes_per_object) * su;
12931}
12932
12933int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12934 const UserPerm& perms)
12935{
11fdf7f2 12936 std::lock_guard lock(client_lock);
7c673cae 12937
181888fb
FG
12938 if (unmounting)
12939 return -ENOTCONN;
12940
7c673cae
FG
12941 vinodeno_t vino = _get_vino(in);
12942
12943 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12944 tout(cct) << "ll_opendir" << std::endl;
12945 tout(cct) << vino.ino.val << std::endl;
12946
11fdf7f2 12947 if (!fuse_default_permissions) {
7c673cae
FG
12948 int r = may_open(in, flags, perms);
12949 if (r < 0)
12950 return r;
12951 }
12952
12953 int r = _opendir(in, dirpp, perms);
12954 tout(cct) << (unsigned long)*dirpp << std::endl;
12955
12956 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12957 << dendl;
12958 return r;
12959}
12960
12961int Client::ll_releasedir(dir_result_t *dirp)
12962{
11fdf7f2 12963 std::lock_guard lock(client_lock);
7c673cae
FG
12964 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12965 tout(cct) << "ll_releasedir" << std::endl;
12966 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12967
12968 if (unmounting)
12969 return -ENOTCONN;
12970
7c673cae
FG
12971 _closedir(dirp);
12972 return 0;
12973}
12974
12975int Client::ll_fsyncdir(dir_result_t *dirp)
12976{
11fdf7f2 12977 std::lock_guard lock(client_lock);
7c673cae
FG
12978 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12979 tout(cct) << "ll_fsyncdir" << std::endl;
12980 tout(cct) << (unsigned long)dirp << std::endl;
12981
181888fb
FG
12982 if (unmounting)
12983 return -ENOTCONN;
12984
7c673cae
FG
12985 return _fsync(dirp->inode.get(), false);
12986}
12987
12988int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12989{
11fdf7f2 12990 ceph_assert(!(flags & O_CREAT));
7c673cae 12991
11fdf7f2 12992 std::lock_guard lock(client_lock);
7c673cae 12993
181888fb
FG
12994 if (unmounting)
12995 return -ENOTCONN;
12996
7c673cae
FG
12997 vinodeno_t vino = _get_vino(in);
12998
12999 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13000 tout(cct) << "ll_open" << std::endl;
13001 tout(cct) << vino.ino.val << std::endl;
13002 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13003
13004 int r;
11fdf7f2 13005 if (!fuse_default_permissions) {
7c673cae
FG
13006 r = may_open(in, flags, perms);
13007 if (r < 0)
13008 goto out;
13009 }
13010
13011 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13012
13013 out:
13014 Fh *fhptr = fhp ? *fhp : NULL;
13015 if (fhptr) {
13016 ll_unclosed_fh_set.insert(fhptr);
13017 }
13018 tout(cct) << (unsigned long)fhptr << std::endl;
13019 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13020 " = " << r << " (" << fhptr << ")" << dendl;
13021 return r;
13022}
13023
// Shared implementation behind ll_create/ll_createx: look the name up,
// create it if absent (and O_CREAT given), then open it.  On success *in
// holds the resulting inode and *fhp the open handle (may remain NULL if
// _create did not produce one and the open path was skipped).
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name is an error per open(2) semantics.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: check open permission and open it ourselves
    // (a freshly-created file was already opened by _create above).
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any produced handle so unmount can reap unclosed handles.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13105
// Low-level create with struct stat output.  Wraps _ll_create with the
// full inode stat mask; on success optionally returns a referenced Inode*
// in *outp and fills *attr.  On failure attr->st_ino is zeroed.
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, struct stat *attr, Inode **outp, Fh **fhp,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
		     fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;
  }

  return r;
}
13133
// statx flavor of ll_create: caller specifies which fields it wants via
// `want`/`lflags`, which are translated to a cap mask for the lookup.
// On failure stx ino and mask are zeroed so callers see an empty result.
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
		       int oflags, Inode **outp, Fh **fhp,
		       struct ceph_statx *stx, unsigned want, unsigned lflags,
		       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  std::lock_guard lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
13163
// Low-level lseek on an open handle; semantics follow lseek(2)'s whence.
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _lseek(fh, offset, whence);
}
13176
// Low-level read of `len` bytes at `off` into bl.  Returns bytes read or
// a negative errno.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  return _read(fh, off, len, bl);
}
13193
// Read a byte range of a single RADOS object (file "block") directly via
// the objecter, bypassing the page cache and capability machinery.
// Returns the number of bytes read into buf, or a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop client_lock while blocking on the OSD round trip so other
  // client operations can make progress.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13230
13231/* It appears that the OSD doesn't return success unless the entire
13232 buffer was written, return the write length on success. */
13233
13234int Client::ll_write_block(Inode *in, uint64_t blockid,
13235 char* buf, uint64_t offset,
13236 uint64_t length, file_layout_t* layout,
13237 uint64_t snapseq, uint32_t sync)
13238{
7c673cae 13239 vinodeno_t vino = ll_get_vino(in);
7c673cae 13240 int r = 0;
11fdf7f2
TL
13241 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13242
7c673cae
FG
13243 if (length == 0) {
13244 return -EINVAL;
13245 }
13246 if (true || sync) {
13247 /* if write is stable, the epilogue is waiting on
13248 * flock */
11fdf7f2 13249 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
13250 }
13251 object_t oid = file_object_t(vino.ino, blockid);
13252 SnapContext fakesnap;
11fdf7f2
TL
13253 ceph::bufferlist bl;
13254 if (length > 0) {
13255 bl.push_back(buffer::copy(buf, length));
13256 }
7c673cae
FG
13257
13258 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13259 << dendl;
13260
13261 fakesnap.seq = snapseq;
13262
13263 /* lock just in time */
13264 client_lock.Lock();
181888fb
FG
13265 if (unmounting) {
13266 client_lock.Unlock();
181888fb
FG
13267 return -ENOTCONN;
13268 }
7c673cae
FG
13269
13270 objecter->write(oid,
13271 object_locator_t(layout->pool_id),
13272 offset,
13273 length,
13274 fakesnap,
13275 bl,
13276 ceph::real_clock::now(),
13277 0,
11fdf7f2 13278 onsafe.get());
7c673cae
FG
13279
13280 client_lock.Unlock();
11fdf7f2
TL
13281 if (nullptr != onsafe) {
13282 r = onsafe->wait();
7c673cae
FG
13283 }
13284
13285 if (r < 0) {
13286 return r;
13287 } else {
13288 return length;
13289 }
13290}
13291
// Commit previously written blocks in [offset, offset+length).  The
// barrier-based implementation is currently disabled (commented out), so
// this is a no-op that always returns 0.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13317
// Low-level write of `len` bytes from `data` at offset `off`.  Returns
// bytes written or a negative errno.
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
13338
11fdf7f2
TL
// Vectored write at offset `off` (pwritev analogue); `true` selects the
// write path in _preadv_pwritev_locked.
int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}
13346
// Vectored read at offset `off` (preadv analogue); `false` selects the
// read path in _preadv_pwritev_locked.
int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}
13354
7c673cae
FG
// Low-level flush (close(2)-time flush semantics) on an open handle.
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flush(fh);
}
13367
// Low-level fsync; when syncdataonly is true only file data is flushed
// (fdatasync semantics).  A returned error is also cleared from the
// handle's pending async-error state so it is reported exactly once.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
13385
28e407b8
AA
// fsync by inode rather than by open handle (used e.g. for syncing an
// inode without a file descriptor).
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(in, syncdataonly);
}
13398
7c673cae
FG
#ifdef FALLOC_FL_PUNCH_HOLE

// fallocate(2) core.  Supports only FALLOC_FL_KEEP_SIZE and
// FALLOC_FL_PUNCH_HOLE (and PUNCH_HOLE requires KEEP_SIZE, matching the
// Linux contract).  Plain allocation just extends i_size; no space is
// actually reserved on RADOS.  Caller must hold client_lock.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // A full pool may still accept hole-punching (it frees space).
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Growing the file must respect the quota on the inode's ancestors.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by
      // splicing zeroes into the inline buffer.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first (if needed), then zero the range on the
      // OSDs via filer->zero.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop client_lock while the zero op is in flight.
      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation beyond EOF: just extend i_size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  // If we kicked off an uninline above, wait for it and finalize state.
  if (nullptr != onuninline) {
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13526
13527
// Low-level fallocate entry point; validates nothing itself and defers
// to _fallocate for mode/range checks.
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}
13540
13541int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13542{
11fdf7f2
TL
13543 std::lock_guard lock(client_lock);
13544 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13545
181888fb
FG
13546 if (unmounting)
13547 return -ENOTCONN;
13548
7c673cae
FG
13549 Fh *fh = get_filehandle(fd);
13550 if (!fh)
13551 return -EBADF;
13552#if defined(__linux__) && defined(O_PATH)
13553 if (fh->flags & O_PATH)
13554 return -EBADF;
13555#endif
13556 return _fallocate(fh, mode, offset, length);
13557}
13558
// Release an open low-level handle: drop it from the unclosed-handle set
// and close it via _release_fh.
int Client::ll_release(Fh *fh)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
    dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
13575
13576int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13577{
11fdf7f2 13578 std::lock_guard lock(client_lock);
7c673cae
FG
13579
13580 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13581 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13582
181888fb
FG
13583 if (unmounting)
13584 return -ENOTCONN;
13585
7c673cae
FG
13586 return _getlk(fh, fl, owner);
13587}
13588
// POSIX advisory lock set (F_SETLK/F_SETLKW analogue); `sleep` selects
// the blocking variant.
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _setlk(fh, fl, owner, sleep);
}
13601
// BSD flock(2) analogue on a low-level handle.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}
13614
b32b8144
FG
// Configure the delegation return timeout (seconds).  Must be shorter
// than the MDS session autoclose interval, otherwise a stuck delegation
// holder could get the whole client blacklisted.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13629
// Acquire or drop a delegation on the inode behind fh.
// cmd == CEPH_DELEGATION_NONE drops; other commands are forwarded to
// Inode::set_deleg with the callback/private data pair.
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg allocates; report allocation failure as ENOMEM rather
      // than letting the exception escape the C-style API.
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13656
7c673cae
FG
// Finisher context that interrupts an in-flight SETFILELOCK MDS request
// (used when a blocked lock waiter is cancelled, e.g. by a signal).
// Holds a reference on the request until finish() runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request across the finisher hand-off
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
13672
// Called (e.g. from the FUSE interrupt path) with an opaque MetaRequest
// pointer; queues the actual interruption on the finisher thread so it
// runs under client_lock.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13680
13681// =========================================
13682// layout
13683
13684// expose file layouts
13685
// Resolve relpath and copy the file layout of the resulting inode into
// *lp.
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
13705
// Same as describe_layout but by open file descriptor.
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}
13723
d2e6a577
FG
// Return the pool id of the filesystem's first data pool (the default
// for new files), or -ENOTCONN while unmounting.
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
13734
13735// expose osdmap
13736
// Look up a pool id by name in the current OSDMap.
int64_t Client::get_pool_id(const char *pool_name)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}
13747
// Look up a pool name by id; returns the empty string if the pool does
// not exist or the client is unmounting.
string Client::get_pool_name(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return string();

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}
13759
// Return the replication size of a pool, or -ENOENT if it doesn't exist.
int Client::get_pool_replication(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
13771
// For the file byte at `off`, return the acting OSD set in `osds` and,
// if `len` is non-NULL, the number of bytes remaining in that stripe
// unit starting at `off`.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range: always yields exactly one extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13817
// Return the ordered CRUSH location (type, name pairs) of the given OSD.
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  if (id < 0)
    return -EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
13831
// Return the network addresses of the acting OSDs serving the object
// that contains file byte `offset`.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
13865
// Return the first network address of the given OSD, or -ENOENT if the
// OSD does not exist in the current map.
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return -ENOENT;

      addr = o.get_addrs(osd).front();
      return 0;
    });
}
13881
// Map the byte range [offset, offset+length) of the file behind fd to
// its constituent object extents.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
13901
13902
/* find an osd with the same ip. -ENXIO if none. */
// Result is cached per OSDMap epoch in local_osd/local_osd_epoch and only
// recomputed when the map changes.
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13919
13920
13921
13922
13923
13924
13925// ===============================
13926
// Messenger callback: a connection was established.  Log-only.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
13931
// Messenger callback: local connection reset.  Returns false so the
// messenger applies its default handling.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
13937
// Messenger callback: the peer reset the connection.  Only MDS peers are
// handled; the matching MetaSession is advanced according to its state
// (closing -> closed, opening -> retry, open -> stale or reconnect).
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // Carry the open-waiters over to the replacement session.
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13997
// Messenger callback: connection refused by peer.  Log-only; returns
// false for default messenger handling.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14003
// Messenger callback: supply an authorizer for an outgoing connection.
// Monitors authenticate differently, so no authorizer is built for them.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
14011
// Walk up the snap-realm chain from `in` and return the nearest ancestor
// inode with quota enabled, falling back to root_ancestor.  The walk
// stops early if a realm's inode is not in inode_map.
// NOTE(review): `perms` is currently unused here.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14033
/**
 * Traverse quota ancestors of the Inode, return true
 * if any of them passes the passed function
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
				   std::function<bool (const Inode &in)> test)
{
  while (true) {
    ceph_assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }

  // Unreachable: the loop only exits via the returns above.
  return false;
}
14058
// True if any quota ancestor's file-count quota is already at or above
// its limit (rsize counts files + subdirs recursively).
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}
14066
// True if adding `new_bytes` would push any quota ancestor past its
// byte quota.
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [&new_bytes](const Inode &in) {
        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
               > in.quota.max_bytes;
      });
}
14076
// True if any quota ancestor is already over its byte quota, or if the
// locally-buffered growth (size - reported_size) exceeds 1/16 of the
// remaining quota space — a cue to report size to the MDS early.
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        if (in.quota.max_bytes) {
          if (in.rstat.rbytes >= in.quota.max_bytes) {
            return true;
          }

          ceph_assert(in.size >= in.reported_size);
          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
          const uint64_t size = in.size - in.reported_size;
          return (space >> 4) < size;
        } else {
          return false;
        }
      });
}
14095
// Pool-permission states cached in Client::pool_perms (keyed by
// (pool id, namespace)) and managed by check_pool_perm() below:
//   POOL_CHECKED  - probe finished, cached bits are valid
//   POOL_CHECKING - probe in flight; other callers wait on
//                   waiting_for_pool_perm
//   POOL_READ / POOL_WRITE - access that the probe granted
14096enum {
14097  POOL_CHECKED = 1,
14098  POOL_CHECKING = 2,
14099  POOL_READ = 4,
14100  POOL_WRITE = 8,
14101};
14102
// Verify that this client may read and/or write data objects in the
// pool+namespace named by the inode's layout.  'need' is a
// CEPH_CAP_FILE_* mask.  Verdicts are cached per (pool id, namespace)
// in pool_perms; a probe in flight is marked POOL_CHECKING and
// concurrent callers wait for it.  Returns 0 on success, -EPERM when
// the needed access is missing, -EIO when the probe failed for an
// unexpected reason.  Assumes client_lock is held; it is dropped while
// waiting for the probe replies.
14103int Client::check_pool_perm(Inode *in, int need)
14104{
14105 if (!cct->_conf->client_check_pool_perm)
14106 return 0;
14107
14108 int64_t pool_id = in->layout.pool_id;
14109 std::string pool_ns = in->layout.pool_ns;
14110 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14111 int have = 0;
 // Consult the cache; if another thread is probing this pool, block
 // until it finishes and then re-check.
14112 while (true) {
14113 auto it = pool_perms.find(perm_key);
14114 if (it == pool_perms.end())
14115 break;
14116 if (it->second == POOL_CHECKING) {
14117 // avoid concurrent checkings
14118 wait_on_list(waiting_for_pool_perm);
14119 } else {
14120 have = it->second;
11fdf7f2 14121 ceph_assert(have & POOL_CHECKED);
7c673cae
FG
14122 break;
14123 }
14124 }
14125
14126 if (!have) {
14127 if (in->snapid != CEPH_NOSNAP) {
14128 // pool permission check needs to write to the first object. But for snapshot,
14129 // head of the first object may have already been deleted. To avoid creating
14130 // orphan object, skip the check for now.
14131 return 0;
14132 }
14133
14134 pool_perms[perm_key] = POOL_CHECKING;
14135
 // Probe using the inode's first object: a stat op tests read access,
 // a create op tests write access.
14136 char oid_buf[32];
14137 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14138 object_t oid = oid_buf;
14139
14140 SnapContext nullsnapc;
14141
14142 C_SaferCond rd_cond;
14143 ObjectOperation rd_op;
14144 rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
14145
14146 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14147 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14148
14149 C_SaferCond wr_cond;
14150 ObjectOperation wr_op;
14151 wr_op.create(true);
14152
14153 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14154 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14155
 // Drop client_lock while blocking on the OSD replies.
14156 client_lock.Unlock();
14157 int rd_ret = rd_cond.wait();
14158 int wr_ret = wr_cond.wait();
14159 client_lock.Lock();
14160
14161 bool errored = false;
14162
 // -ENOENT is fine for the stat (object may not exist yet) and
 // -EEXIST is fine for the create; only -EPERM means access denied.
 // Anything else leaves the result indeterminate.
14163 if (rd_ret == 0 || rd_ret == -ENOENT)
14164 have |= POOL_READ;
14165 else if (rd_ret != -EPERM) {
11fdf7f2 14166 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
14167 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14168 errored = true;
14169 }
14170
14171 if (wr_ret == 0 || wr_ret == -EEXIST)
14172 have |= POOL_WRITE;
14173 else if (wr_ret != -EPERM) {
11fdf7f2 14174 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
14175 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14176 errored = true;
14177 }
14178
14179 if (errored) {
14180 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14181 // Raise EIO because actual error code might be misleading for
14182 // userspace filesystem user.
14183 pool_perms.erase(perm_key);
14184 signal_cond_list(waiting_for_pool_perm);
14185 return -EIO;
14186 }
14187
 // Cache the verdict and wake any waiters blocked on POOL_CHECKING.
14188 pool_perms[perm_key] = have | POOL_CHECKED;
14189 signal_cond_list(waiting_for_pool_perm);
14190 }
14191
14192 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
11fdf7f2 14193 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
14194 << " need " << ccap_string(need) << ", but no read perm" << dendl;
14195 return -EPERM;
14196 }
14197 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
11fdf7f2 14198 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
14199 << " need " << ccap_string(need) << ", but no write perm" << dendl;
14200 return -EPERM;
14201 }
14202
14203 return 0;
14204}
14205
14206int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14207{
14208 if (acl_type == POSIX_ACL) {
14209 if (in->xattrs.count(ACL_EA_ACCESS)) {
14210 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14211
14212 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14213 }
14214 }
14215 return -EAGAIN;
14216}
14217
14218int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14219{
14220 if (acl_type == NO_ACL)
14221 return 0;
14222
14223 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14224 if (r < 0)
14225 goto out;
14226
14227 if (acl_type == POSIX_ACL) {
14228 if (in->xattrs.count(ACL_EA_ACCESS)) {
14229 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14230 bufferptr acl(access_acl.c_str(), access_acl.length());
14231 r = posix_acl_access_chmod(acl, mode);
14232 if (r < 0)
14233 goto out;
14234 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14235 } else {
14236 r = 0;
14237 }
14238 }
14239out:
14240 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14241 return r;
14242}
14243
// Compute the ACL xattrs for a new inode being created under 'dir'.
// If the directory has a default POSIX ACL it is inherited into *mode
// and, when applicable, emitted as the child's ACL_EA_ACCESS (plus
// ACL_EA_DEFAULT for directories); the xattr map is encoded into
// xattrs_bl and the number of xattrs is returned.  With no default
// ACL the umask callback is applied to *mode and 0 is returned.
// Symlinks and NO_ACL mode are a no-op (returns 0); negative errno on
// failure.
14244int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
14245 const UserPerm& perms)
14246{
14247 if (acl_type == NO_ACL)
14248 return 0;
14249
14250 if (S_ISLNK(*mode))
14251 return 0;
14252
 // Make sure the directory's xattrs (and thus its default ACL) are
 // cached before consulting them.
14253 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
14254 if (r < 0)
14255 goto out;
14256
14257 if (acl_type == POSIX_ACL) {
14258 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
14259 map<string, bufferptr> xattrs;
14260
14261 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
14262 bufferptr acl(default_acl.c_str(), default_acl.length());
14263 r = posix_acl_inherit_mode(acl, mode);
14264 if (r < 0)
14265 goto out;
14266
 // r > 0 appears to mean the inherited ACL carries entries beyond
 // what the mode bits express — TODO confirm against
 // posix_acl_inherit_mode()/posix_acl_equiv_mode().
14267 if (r > 0) {
14268 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
14269 if (r < 0)
14270 goto out;
14271 if (r > 0)
14272 xattrs[ACL_EA_ACCESS] = acl;
14273 }
14274
 // directories also propagate the default ACL itself
14275 if (S_ISDIR(*mode))
14276 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
14277
 // success result is the number of xattrs handed back to the caller
14278 r = xattrs.size();
14279 if (r > 0)
11fdf7f2 14280 encode(xattrs, xattrs_bl);
7c673cae
FG
14281 } else {
 // no default ACL: fall back to the process umask callback
14282 if (umask_cb)
14283 *mode &= ~umask_cb(callback_handle);
14284 r = 0;
14285 }
14286 }
14287out:
14288 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
14289 return r;
14290}
14291
14292void Client::set_filer_flags(int flags)
14293{
11fdf7f2
TL
14294 std::lock_guard l(client_lock);
14295 ceph_assert(flags == 0 ||
7c673cae
FG
14296 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14297 objecter->add_global_op_flags(flags);
14298}
14299
14300void Client::clear_filer_flags(int flags)
14301{
11fdf7f2
TL
14302 std::lock_guard l(client_lock);
14303 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
14304 objecter->clear_global_op_flag(flags);
14305}
14306
11fdf7f2
TL
14307// called before mount
14308void Client::set_uuid(const std::string& uuid)
14309{
14310 std::lock_guard l(client_lock);
14311 assert(initialized);
14312 assert(!uuid.empty());
14313
14314 metadata["uuid"] = uuid;
14315 _close_sessions();
14316}
14317
14318// called before mount. 0 means infinite
14319void Client::set_session_timeout(unsigned timeout)
14320{
14321 std::lock_guard l(client_lock);
14322 assert(initialized);
14323
14324 metadata["timeout"] = stringify(timeout);
14325}
14326
// Reclaim the MDS sessions of a previous client instance identified by
// 'uuid' (must be called before mount).  Iterates over all in-map MDS
// ranks, opening a session where necessary, and sends MClientReclaim
// to each; replies are handled by handle_client_reclaim_reply(), which
// fills reclaim_target_addrs / reclaim_osd_epoch / reclaim_errno.  On
// success the uuid is remembered in metadata["reclaiming_uuid"] so
// finish_reclaim() can complete the takeover.  Returns 0 or a
// negative errno (-ENOTCONN, -EINVAL, -EPERM, -EOPNOTSUPP, -ENOENT,
// -ENOTRECOVERABLE, or the MDS-reported error).
14327// called before mount
14328int Client::start_reclaim(const std::string& uuid, unsigned flags,
14329 const std::string& fs_name)
14330{
14331 std::lock_guard l(client_lock);
14332 if (!initialized)
14333 return -ENOTCONN;
14334
14335 if (uuid.empty())
14336 return -EINVAL;
14337
 // refuse to reclaim our own uuid
14338 {
14339 auto it = metadata.find("uuid");
14340 if (it != metadata.end() && it->second == uuid)
14341 return -EINVAL;
14342 }
14343
14344 int r = subscribe_mdsmap(fs_name);
14345 if (r < 0) {
14346 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
14347 return r;
14348 }
14349
14350 if (metadata.empty())
14351 populate_metadata("");
14352
14353 while (mdsmap->get_epoch() == 0)
14354 wait_on_list(waiting_for_mdsmap);
14355
14356 reclaim_errno = 0;
 // 'mds' only advances once that rank reaches a terminal reclaim
 // state; the waits inside the loop drop client_lock and the same rank
 // is re-examined on wakeup.
14357 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
14358 if (!mdsmap->is_up(mds)) {
14359 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
14360 wait_on_list(waiting_for_mdsmap);
14361 continue;
14362 }
14363
14364 MetaSession *session;
14365 if (!have_open_session(mds)) {
14366 session = _get_or_open_mds_session(mds);
14367 if (session->state != MetaSession::STATE_OPENING) {
14368 // umounting?
14369 return -EINVAL;
14370 }
14371 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
14372 wait_on_context_list(session->waiting_for_open);
14373 if (rejected_by_mds.count(mds))
14374 return -EPERM;
14375 continue;
14376 }
14377
14378 session = &mds_sessions.at(mds);
 // the MDS must advertise reclaim support
14379 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
14380 return -EOPNOTSUPP;
14381
14382 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
14383 session->reclaim_state == MetaSession::RECLAIMING) {
14384 session->reclaim_state = MetaSession::RECLAIMING;
14385 auto m = MClientReclaim::create(uuid, flags);
14386 session->con->send_message2(std::move(m));
14387 wait_on_list(waiting_for_reclaim);
14388 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
 // GNU "a ?: b" — prefer the MDS-reported errno when we have one
14389 return reclaim_errno ? : -ENOTRECOVERABLE;
14390 } else {
14391 mds++;
14392 }
14393 }
14394
14395 // didn't find target session in any mds
14396 if (reclaim_target_addrs.empty()) {
14397 if (flags & CEPH_RECLAIM_RESET)
14398 return -ENOENT;
14399 return -ENOTRECOVERABLE;
14400 }
14401
14402 if (flags & CEPH_RECLAIM_RESET)
14403 return 0;
14404
14405 // use blacklist to check if target session was killed
14406 // (config option mds_session_blacklist_on_evict needs to be true)
 // wait (with client_lock dropped) until our OSD map is at least the
 // epoch reported in the reclaim reply before trusting the check below
14407 C_SaferCond cond;
14408 if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
14409 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
14410 client_lock.Unlock();
14411 cond.wait();
14412 client_lock.Lock();
14413 }
14414
14415 bool blacklisted = objecter->with_osdmap(
14416 [this](const OSDMap &osd_map) -> bool {
14417 return osd_map.is_blacklisted(reclaim_target_addrs);
14418 });
14419 if (blacklisted)
14420 return -ENOTRECOVERABLE;
14421
14422 metadata["reclaiming_uuid"] = uuid;
14423 return 0;
14424}
14425
14426void Client::finish_reclaim()
14427{
14428 auto it = metadata.find("reclaiming_uuid");
14429 if (it == metadata.end()) {
14430 for (auto &p : mds_sessions)
14431 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14432 return;
14433 }
14434
14435 for (auto &p : mds_sessions) {
14436 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14437 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14438 p.second.con->send_message2(std::move(m));
14439 }
14440
14441 metadata["uuid"] = it->second;
14442 metadata.erase(it);
14443}
14444
14445void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14446{
14447 mds_rank_t from = mds_rank_t(reply->get_source().num());
14448 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14449
14450 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14451 if (!session) {
14452 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14453 return;
14454 }
14455
14456 if (reply->get_result() >= 0) {
14457 session->reclaim_state = MetaSession::RECLAIM_OK;
14458 if (reply->get_epoch() > reclaim_osd_epoch)
14459 reclaim_osd_epoch = reply->get_epoch();
14460 if (!reply->get_addrs().empty())
14461 reclaim_target_addrs = reply->get_addrs();
14462 } else {
14463 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14464 reclaim_errno = reply->get_result();
14465 }
14466
14467 signal_cond_list(waiting_for_reclaim);
14468}
14469
7c673cae
FG
14470/**
14471 * This is included in cap release messages, to cause
14472 * the MDS to wait until this OSD map epoch. It is necessary
14473 * in corner cases where we cancel RADOS ops, so that
14474 * nobody else tries to do IO to the same objects in
14475 * the same epoch as the cancelled ops.
14476 */
14477void Client::set_cap_epoch_barrier(epoch_t e)
14478{
14479 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
 // remember the barrier epoch; per the comment above it is carried in
 // subsequent cap release messages to the MDS
14480 cap_epoch_barrier = e;
14481}
14482
// Config-observer interface: the option names we want change
// notifications for (delivered to handle_conf_change()).  The returned
// array has static storage and is NULL-terminated.
14483const char** Client::get_tracked_conf_keys() const
14484{
14485  static const char* keys[] = {
14486    "client_cache_size",
14487    "client_cache_mid",
14488    "client_acl_type",
b32b8144
FG
14489    "client_deleg_timeout",
14490    "client_deleg_break_on_open",
7c673cae
FG
14491    NULL
14492  };
14493  return keys;
14494}
14495
11fdf7f2 14496void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
14497 const std::set <std::string> &changed)
14498{
11fdf7f2 14499 std::lock_guard lock(client_lock);
7c673cae 14500
181888fb 14501 if (changed.count("client_cache_mid")) {
7c673cae
FG
14502 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14503 }
14504 if (changed.count("client_acl_type")) {
14505 acl_type = NO_ACL;
14506 if (cct->_conf->client_acl_type == "posix_acl")
14507 acl_type = POSIX_ACL;
14508 }
14509}
14510
7c673cae
FG
// boost::intrusive_ptr hook: take one reference on the inode.
14511void intrusive_ptr_add_ref(Inode *in)
14512{
14513  in->get();
14514}
14515
// boost::intrusive_ptr hook: hand the reference back to the owning
// Client, which decides when the inode can be freed.
14516void intrusive_ptr_release(Inode *in)
14517{
14518  in->client->put_inode(in);
14519}
14520
14521mds_rank_t Client::_get_random_up_mds() const
14522{
11fdf7f2 14523 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
14524
14525 std::set<mds_rank_t> up;
14526 mdsmap->get_up_mds_set(up);
14527
14528 if (up.empty())
14529 return MDS_RANK_NONE;
14530 std::set<mds_rank_t>::const_iterator p = up.begin();
14531 for (int n = rand() % up.size(); n; n--)
14532 ++p;
14533 return *p;
14534}
14535
14536
14537StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  // construct the base Client with an Objecter we allocate ourselves;
  // ~StandaloneClient() deletes it
14538  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
14539{
14540  monclient->set_messenger(m);
  // a standalone client always uses incarnation 0
14541  objecter->set_client_incarnation(0);
14542}
14543
14544StandaloneClient::~StandaloneClient()
14545{
  // release the Objecter allocated in our constructor and null the
  // base-class pointer so it cannot be dereferenced afterwards
14546  delete objecter;
14547  objecter = nullptr;
14548}
14549
// Bring up a standalone (non-embedded) client: start the timer, object
// cacher and objecter, register dispatchers, and initialize the
// monitor client.  Returns 0 on success, or the monclient error after
// unwinding the partially-initialized state.
14550int StandaloneClient::init()
14551{
14552  timer.init();
14553  objectcacher->start();
14554  objecter->init();
14555
14556  client_lock.Lock();
11fdf7f2 14557  ceph_assert(!is_initialized());
7c673cae
FG
14558
14559  messenger->add_dispatcher_tail(objecter);
14560  messenger->add_dispatcher_tail(this);
14561
14562  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
14563  int r = monclient->init();
14564  if (r < 0) {
14565    // need to do cleanup because we're in an intermediate init state
    // NOTE(review): client_lock is released before objecter->shutdown()
    // etc. — presumably those calls must not run under the lock; confirm
    // before reordering.
14566    timer.shutdown();
14567    client_lock.Unlock();
14568    objecter->shutdown();
14569    objectcacher->stop();
14570    monclient->shutdown();
14571    return r;
14572  }
14573  objecter->start();
14574
14575  client_lock.Unlock();
14576  _finish_init();
14577
14578  return 0;
14579}
14580
// Tear down the standalone client: generic Client state first, then
// the objecter, then the monitor client (order matters; it mirrors the
// reverse of init()).
14581void StandaloneClient::shutdown()
14582{
14583  Client::shutdown();
14584  objecter->shutdown();
14585  monclient->shutdown();
14586}