// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use these flags in
// ceph-dokan: the current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

#ifndef S_IXUGO
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
#endif

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}

bool Client::is_reserved_vino(vinodeno_t &vino) {
  if (MDS_IS_PRIVATE_INO(vino.ino)) {
    ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
    return true;
  }
  return false;
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}


// -------------

int Client::get_fd_inode(int fd, InodeRef *in) {
  int r = 0;
  if (fd == CEPHFS_AT_FDCWD) {
    *in = cwd;
  } else {
    Fh *f = get_filehandle(fd);
    if (!f) {
      r = -CEPHFS_EBADF;
    } else {
      *in = f->inode;
    }
  }
  return r;
}
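
// Illustrative use (hypothetical caller, not upstream code): resolving the
// inode behind an fd before an *at()-style operation:
//   InodeRef in;
//   int r = get_fd_inode(dirfd, &in);
//   if (r < 0)
//     return r;  // -CEPHFS_EBADF: dirfd is neither CEPHFS_AT_FDCWD nor open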

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}
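
// Note (explanatory, not upstream): after this reset, free_faked_inos holds
// the single interval [1024, 2^32 - 1]; faked ino 1 is special-cased in
// _map_faked_ino to mean the root, and the 1024~2048 range is carved out
// below for faked root inos.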

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

/*
 * In faked mode, if you export multiple subdirectories, the exported
 * subdirectories would otherwise all present the same inode number. So we
 * distinguish the mount points by reserving the "fake ids" between
 * 1024~2048 and combining the last 10 bits (0x3ff) of the "root inodes".
 */
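/*
 * Worked example (illustrative only): with the reserved interval starting
 * at 1024 and a real root ino of 0x10000000001,
 *   last_used_faked_root = 1024 + (0x10000000001 & 0x3ff) = 1024 + 1 = 1025,
 * so roots whose inos differ in their low 10 bits get distinct faked inos.
 */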
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // get_start() plus the 0x3ff-masked offset will not exceed 2048
  assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or aborted and never got a chance to run
  // umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root.get();
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_nref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root.get(), did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blocklist_events();

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}
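
// The commands registered above become reachable through the client's admin
// socket, e.g. (illustrative; the socket path depends on local configuration):
//   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_requests
//   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok status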

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before destructing the
    // Client, just in case _mount() failed and never got a chance to
    // stop the tick thread.
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * arriving "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
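
// Example of the sequencing above (illustrative): if the MDS reports
// truncate_seq 5 while we hold seq 4, the new size wins and the cached data
// beyond the truncation point is invalidated; a stale report carrying seq 3
// is ignored because our local state is newer.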

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
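
// time_warp_seq plays the same role for times that truncate_seq plays for
// sizes (explanatory note): the MDS bumps it when it legitimately moves
// times backwards, so a matching seq means "take the max", a newer seq
// means "take the MDS values", and an older one is ignored or warned about.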

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}
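
// Worked example (illustrative): with from = 100.0s and
// dlease->duration_ms = 30000, dttl computes to 130.0s; the lease fields are
// recorded only when CEPH_LEASE_VALID is set and 130.0s extends the current
// lease_ttl.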


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();
  if (!dst->dist.empty())
    in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
  else
    in->frag_repmap.erase(dst->frag);
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
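
// Selection order, summarizing the logic above (explanatory note): an
// explicit resend_mds always wins; otherwise a dirfrag hash lookup picks a
// replica (for non-auth requests) or the auth mds from the fragmap; failing
// that, the inode's auth cap or any cap supplies a session; and as a last
// resort a random up MDS is chosen.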

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen?  i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
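// Call-site sketch (hypothetical caller; the helpers shown are the ones used
// elsewhere in this file):
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
//   req->set_filepath(path);
//   req->head.args.getattr.mask = mask;
//   InodeRef target;
//   int res = make_request(req, perms, &target);  // blocks until reply/abort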
1767 int Client::make_request(MetaRequest *request,
1768 const UserPerm& perms,
1769 InodeRef *ptarget, bool *pcreated,
1770 mds_rank_t use_mds,
1771 bufferlist *pdirbl)
1772 {
1773 int r = 0;
1774
1775 // assign a unique tid
1776 ceph_tid_t tid = ++last_tid;
1777 request->set_tid(tid);
1778
1779 // and timestamp
1780 request->op_stamp = ceph_clock_now();
1781
1782 // make note
1783 mds_requests[tid] = request->get();
1784 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1785 oldest_tid = tid;
1786
1787 request->set_caller_perms(perms);
1788
1789 if (cct->_conf->client_inject_fixed_oldest_tid) {
1790 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1791 request->set_oldest_client_tid(1);
1792 } else {
1793 request->set_oldest_client_tid(oldest_tid);
1794 }
1795
1796 // hack target mds?
1797 if (use_mds >= 0)
1798 request->resend_mds = use_mds;
1799
1800 MetaSession *session = NULL;
1801 while (1) {
1802 if (request->aborted())
1803 break;
1804
1805 if (blocklisted) {
1806 request->abort(-CEPHFS_EBLOCKLISTED);
1807 break;
1808 }
1809
1810 // set up wait cond
1811 ceph::condition_variable caller_cond;
1812 request->caller_cond = &caller_cond;
1813
1814 // choose mds
1815 Inode *hash_diri = NULL;
1816 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1817 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1818 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1819 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1820 if (hash_diri) {
1821 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1822 _fragmap_remove_stopped_mds(hash_diri, mds);
1823 } else {
1824 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1825 request->resend_mds = _get_random_up_mds();
1826 }
1827 } else {
1828 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1829 wait_on_list(waiting_for_mdsmap);
1830 }
1831 continue;
1832 }
1833
1834 // open a session?
1835 if (!have_open_session(mds)) {
1836 session = _get_or_open_mds_session(mds);
1837 if (session->state == MetaSession::STATE_REJECTED) {
1838 request->abort(-CEPHFS_EPERM);
1839 break;
1840 }
1841 // wait
1842 if (session->state == MetaSession::STATE_OPENING) {
1843 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1844 wait_on_context_list(session->waiting_for_open);
1845 continue;
1846 }
1847
1848 if (!have_open_session(mds))
1849 continue;
1850 } else {
1851 session = &mds_sessions.at(mds);
1852 }
1853
1854 // send request.
1855 send_request(request, session);
1856
1857 // wait for signal
1858 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1859 request->kick = false;
1860 std::unique_lock l{client_lock, std::adopt_lock};
1861 caller_cond.wait(l, [request] {
1862 return (request->reply || // reply
1863 request->resend_mds >= 0 || // forward
1864 request->kick);
1865 });
1866 l.release();
1867 request->caller_cond = nullptr;
1868
1869 // did we get a reply?
1870 if (request->reply)
1871 break;
1872 }
1873
1874 if (!request->reply) {
1875 ceph_assert(request->aborted());
1876 ceph_assert(!request->got_unsafe);
1877 r = request->get_abort_code();
1878 request->item.remove_myself();
1879 unregister_request(request);
1880 put_request(request);
1881 return r;
1882 }
1883
1884 // got it!
1885 auto reply = std::move(request->reply);
1886 r = reply->get_result();
1887 if (r >= 0)
1888 request->success = true;
1889
1890 // kick dispatcher (we've got it!)
1891 ceph_assert(request->dispatch_cond);
1892 request->dispatch_cond->notify_all();
1893 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1894 request->dispatch_cond = 0;
1895
1896 if (r >= 0 && ptarget)
1897 r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);
1898
1899 if (pdirbl)
1900 *pdirbl = reply->get_extra_bl();
1901
1902 // -- log times --
1903 utime_t lat = ceph_clock_now();
1904 lat -= request->sent_stamp;
1905 ldout(cct, 20) << "lat " << lat << dendl;
1906 logger->tinc(l_c_lat, lat);
1907 logger->tinc(l_c_reply, lat);
1908
1909 put_request(request);
1910 return r;
1911 }
1912
1913 void Client::unregister_request(MetaRequest *req)
1914 {
1915 mds_requests.erase(req->tid);
1916 if (req->tid == oldest_tid) {
1917 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1918 while (true) {
1919 if (p == mds_requests.end()) {
1920 oldest_tid = 0;
1921 break;
1922 }
1923 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1924 oldest_tid = p->first;
1925 break;
1926 }
1927 ++p;
1928 }
1929 }
1930 put_request(req);
1931 }
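// Worked example (illustrative): with pending tids {3, 5, 7} where tid 5
// is a SETFILELOCK op and oldest_tid == 3, unregistering tid 3 scans
// upward from 3, skips 5, and settles on oldest_tid == 7; if no
// non-SETFILELOCK request remains, oldest_tid resets to 0.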
1932
1933 void Client::put_request(MetaRequest *request)
1934 {
1935 if (request->_put()) {
1936 int op = -1;
1937 if (request->success)
1938 op = request->get_op();
1939 InodeRef other_in;
1940 request->take_other_inode(&other_in);
1941 delete request;
1942
1943 if (other_in &&
1944 (op == CEPH_MDS_OP_RMDIR ||
1945 op == CEPH_MDS_OP_RENAME ||
1946 op == CEPH_MDS_OP_RMSNAP)) {
1947 _try_to_trim_inode(other_in.get(), false);
1948 }
1949 }
1950 }
1951
1952 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1953 mds_rank_t mds, int drop,
1954 int unless, int force)
1955 {
1956 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
1957 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1958 << ", force:" << force << ")" << dendl;
1959 int released = 0;
1960 auto it = in->caps.find(mds);
1961 if (it != in->caps.end()) {
1962 Cap &cap = it->second;
1963 drop &= ~(in->dirty_caps | get_caps_used(in));
1964 if ((drop & cap.issued) &&
1965 !(unless & cap.issued)) {
1966 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
1967 cap.issued &= ~drop;
1968 cap.implemented &= ~drop;
1969 released = 1;
1970 } else {
1971 released = force;
1972 }
1973 if (released) {
1974 cap.wanted = in->caps_wanted();
1975 if (&cap == in->auth_cap &&
1976 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1977 in->requested_max_size = 0;
1978 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1979 }
1980 ceph_mds_request_release rel;
1981 rel.ino = in->ino;
1982 rel.cap_id = cap.cap_id;
1983 rel.seq = cap.seq;
1984 rel.issue_seq = cap.issue_seq;
1985 rel.mseq = cap.mseq;
1986 rel.caps = cap.implemented;
1987 rel.wanted = cap.wanted;
1988 rel.dname_len = 0;
1989 rel.dname_seq = 0;
1990 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1991 }
1992 }
1993 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
1994 << released << dendl;
1995 return released;
1996 }
1997
1998 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1999 mds_rank_t mds, int drop, int unless)
2000 {
2001 ldout(cct, 20) << __func__ << " enter(dn:"
2002 << dn << ")" << dendl;
2003 int released = 0;
2004 if (dn->dir)
2005 released = encode_inode_release(dn->dir->parent_inode, req,
2006 mds, drop, unless, 1);
2007 if (released && dn->lease_mds == mds) {
2008 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2009 auto& rel = req->cap_releases.back();
2010 rel.item.dname_len = dn->name.length();
2011 rel.item.dname_seq = dn->lease_seq;
2012 rel.dname = dn->name;
2013 dn->lease_mds = -1;
2014 }
2015 ldout(cct, 25) << __func__ << " exit(dn:"
2016 << dn << ")" << dendl;
2017 }
2018
2019
2020 /*
2021 * This requires the MClientRequest *request member to be set.
2022 * It will error out horribly without one.
2023 * Additionally, if you set any *drop member, you'd better have
2024 * set the corresponding dentry!
2025 */
2026 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2027 {
2028 ldout(cct, 20) << __func__ << " enter (req: "
2029 << req << ", mds: " << mds << ")" << dendl;
2030 if (req->inode_drop && req->inode())
2031 encode_inode_release(req->inode(), req,
2032 mds, req->inode_drop,
2033 req->inode_unless);
2034
2035 if (req->old_inode_drop && req->old_inode())
2036 encode_inode_release(req->old_inode(), req,
2037 mds, req->old_inode_drop,
2038 req->old_inode_unless);
2039 if (req->other_inode_drop && req->other_inode())
2040 encode_inode_release(req->other_inode(), req,
2041 mds, req->other_inode_drop,
2042 req->other_inode_unless);
2043
2044 if (req->dentry_drop && req->dentry())
2045 encode_dentry_release(req->dentry(), req,
2046 mds, req->dentry_drop,
2047 req->dentry_unless);
2048
2049 if (req->old_dentry_drop && req->old_dentry())
2050 encode_dentry_release(req->old_dentry(), req,
2051 mds, req->old_dentry_drop,
2052 req->old_dentry_unless);
2053 ldout(cct, 25) << __func__ << " exit (req: "
2054 << req << ", mds: " << mds << ")" << dendl;
2055 }
2056
2057 bool Client::have_open_session(mds_rank_t mds)
2058 {
2059 const auto &it = mds_sessions.find(mds);
2060 return it != mds_sessions.end() &&
2061 (it->second.state == MetaSession::STATE_OPEN ||
2062 it->second.state == MetaSession::STATE_STALE);
2063 }
2064
2065 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
2066 {
2067 const auto &it = mds_sessions.find(mds);
2068 if (it == mds_sessions.end() || it->second.con != con) {
2069 return NULL;
2070 } else {
2071 return &it->second;
2072 }
2073 }
2074
2075 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
2076 {
2077 auto it = mds_sessions.find(mds);
2078 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
2079 }
2080
2081 /**
2082 * Populate a map of strings with client-identifying metadata,
2083 * such as the hostname. Call this once at initialization.
2084 */
2085 void Client::populate_metadata(const std::string &mount_root)
2086 {
2087 // Hostname
2088 #ifdef _WIN32
2089 // TODO: move this to compat.h
2090 char hostname[64];
2091 DWORD hostname_sz = 64;
2092 GetComputerNameA(hostname, &hostname_sz);
2093 metadata["hostname"] = hostname;
2094 #else
2095 struct utsname u;
2096 int r = uname(&u);
2097 if (r >= 0) {
2098 metadata["hostname"] = u.nodename;
2099 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2100 } else {
2101 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2102 }
2103 #endif
2104
2105 metadata["pid"] = stringify(getpid());
2106
2107 // Ceph entity id (the '0' in "client.0")
2108 metadata["entity_id"] = cct->_conf->name.get_id();
2109
2110 // Our mount position
2111 if (!mount_root.empty()) {
2112 metadata["root"] = mount_root;
2113 }
2114
2115 // Ceph version
2116 metadata["ceph_version"] = pretty_version_to_str();
2117 metadata["ceph_sha1"] = git_version_to_str();
2118
2119 // Apply any metadata from the user's configured overrides
2120 std::vector<std::string> tokens;
2121 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2122 for (const auto &i : tokens) {
2123 auto eqpos = i.find("=");
2124 // Throw out anything that isn't of the form "<str>=<str>"
2125 if (eqpos == 0 || eqpos == std::string::npos || eqpos + 1 == i.size()) {
2126 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2127 continue;
2128 }
2129 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2130 }
2131 }
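// Example (illustrative): with the config option
//   client_metadata = "rack=r1,datacenter=dc1"
// the loop above yields metadata["rack"] = "r1" and
// metadata["datacenter"] = "dc1"; malformed tokens such as "=oops" are
// logged and skipped.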
2132
2133 /**
2134 * Optionally add or override client metadata fields.
2135 */
2136 void Client::update_metadata(std::string const &k, std::string const &v)
2137 {
2138 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2139 ceph_assert(iref_reader.is_state_satisfied());
2140
2141 std::scoped_lock l(client_lock);
2142
2143 auto it = metadata.find(k);
2144 if (it != metadata.end()) {
2145 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2146 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2147 }
2148
2149 metadata[k] = v;
2150 }
2151
2152 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2153 {
2154 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2155 auto addrs = mdsmap->get_addrs(mds);
2156 auto em = mds_sessions.emplace(std::piecewise_construct,
2157 std::forward_as_tuple(mds),
2158 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2159 ceph_assert(em.second); /* not already present */
2160 MetaSession *session = &em.first->second;
2161
2162 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2163 m->metadata = metadata;
2164 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2165 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2166 session->con->send_message2(std::move(m));
2167 return session;
2168 }
2169
2170 void Client::_close_mds_session(MetaSession *s)
2171 {
2172 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2173 s->state = MetaSession::STATE_CLOSING;
2174 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2175 }
2176
2177 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2178 {
2179 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2180 if (rejected && s->state != MetaSession::STATE_CLOSING)
2181 s->state = MetaSession::STATE_REJECTED;
2182 else
2183 s->state = MetaSession::STATE_CLOSED;
2184 s->con->mark_down();
2185 signal_context_list(s->waiting_for_open);
2186 mount_cond.notify_all();
2187 remove_session_caps(s, err);
2188 kick_requests_closed(s);
2189 mds_ranks_closing.erase(s->mds_num);
2190 if (s->state == MetaSession::STATE_CLOSED)
2191 mds_sessions.erase(s->mds_num);
2192 }
2193
2194 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2195 {
2196 mds_rank_t from = mds_rank_t(m->get_source().num());
2197 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2198
2199 std::scoped_lock cl(client_lock);
2200 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2201 if (!session) {
2202 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2203 return;
2204 }
2205
2206 switch (m->get_op()) {
2207 case CEPH_SESSION_OPEN:
2208 {
2209 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2210 missing_features -= m->supported_features;
2211 if (!missing_features.empty()) {
2212 lderr(cct) << "mds." << from << " lacks required features '"
2213 << missing_features << "', closing session " << dendl;
2214 _close_mds_session(session);
2215 _closed_mds_session(session, -CEPHFS_EPERM, true);
2216 break;
2217 }
2218 session->mds_features = std::move(m->supported_features);
2219
2220 renew_caps(session);
2221 session->state = MetaSession::STATE_OPEN;
2222 if (is_unmounting())
2223 mount_cond.notify_all();
2224 else
2225 connect_mds_targets(from);
2226 signal_context_list(session->waiting_for_open);
2227 break;
2228 }
2229
2230 case CEPH_SESSION_CLOSE:
2231 _closed_mds_session(session);
2232 break;
2233
2234 case CEPH_SESSION_RENEWCAPS:
2235 if (session->cap_renew_seq == m->get_seq()) {
2236 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2237 session->cap_ttl =
2238 session->last_cap_renew_request + mdsmap->get_session_timeout();
2239 if (was_stale)
2240 wake_up_session_caps(session, false);
2241 }
2242 break;
2243
2244 case CEPH_SESSION_STALE:
2245 // invalidate session caps/leases
2246 session->cap_gen++;
2247 session->cap_ttl = ceph_clock_now();
2248 session->cap_ttl -= 1;
2249 renew_caps(session);
2250 break;
2251
2252 case CEPH_SESSION_RECALL_STATE:
2253 /*
2254 * Renew caps and flush cap releases just before
2255 * trimming the caps, in case tick() won't get a chance
2256 * to run them; otherwise the client could be blocklisted
2257 * and the MDS daemons would keep trying to recall the
2258 * caps again and again.
2259 *
2260 * In most cases this does nothing; the new cap releases
2261 * added by the trim_caps() below will have their flushing
2262 * deferred to tick().
2263 */
2264 renew_and_flush_cap_releases();
2265 trim_caps(session, m->get_max_caps());
2266 break;
2267
2268 case CEPH_SESSION_FLUSHMSG:
2269 /* flush cap release */
2270 if (auto& m = session->release; m) {
2271 session->con->send_message2(std::move(m));
2272 }
2273 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2274 break;
2275
2276 case CEPH_SESSION_FORCE_RO:
2277 force_session_readonly(session);
2278 break;
2279
2280 case CEPH_SESSION_REJECT:
2281 {
2282 std::string_view error_str;
2283 auto it = m->metadata.find("error_string");
2284 if (it != m->metadata.end())
2285 error_str = it->second;
2286 else
2287 error_str = "unknown error";
2288 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2289
2290 _closed_mds_session(session, -CEPHFS_EPERM, true);
2291 }
2292 break;
2293
2294 default:
2295 ceph_abort();
2296 }
2297 }
2298
2299 bool Client::_any_stale_sessions() const
2300 {
2301 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2302
2303 for (const auto &p : mds_sessions) {
2304 if (p.second.state == MetaSession::STATE_STALE) {
2305 return true;
2306 }
2307 }
2308
2309 return false;
2310 }
2311
2312 void Client::_kick_stale_sessions()
2313 {
2314 ldout(cct, 1) << __func__ << dendl;
2315
2316 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2317 MetaSession &s = it->second;
2318 if (s.state == MetaSession::STATE_REJECTED) {
2319 mds_sessions.erase(it++);
2320 continue;
2321 }
2322 ++it;
2323 if (s.state == MetaSession::STATE_STALE)
2324 _closed_mds_session(&s);
2325 }
2326 }
2327
2328 void Client::send_request(MetaRequest *request, MetaSession *session,
2329 bool drop_cap_releases)
2330 {
2331 // make the request
2332 mds_rank_t mds = session->mds_num;
2333 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2334 << " for mds." << mds << dendl;
2335 auto r = build_client_request(request);
2336 if (request->dentry()) {
2337 r->set_dentry_wanted();
2338 }
2339 if (request->got_unsafe) {
2340 r->set_replayed_op();
2341 if (request->target)
2342 r->head.ino = request->target->ino;
2343 } else {
2344 encode_cap_releases(request, mds);
2345 if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
2346 request->cap_releases.clear();
2347 else
2348 r->releases.swap(request->cap_releases);
2349 }
2350 r->set_mdsmap_epoch(mdsmap->get_epoch());
2351 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2352 objecter->with_osdmap([r](const OSDMap& o) {
2353 r->set_osdmap_epoch(o.get_epoch());
2354 });
2355 }
2356
2357 if (request->mds == -1) {
2358 request->sent_stamp = ceph_clock_now();
2359 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2360 }
2361 request->mds = mds;
2362
2363 Inode *in = request->inode();
2364 if (in) {
2365 auto it = in->caps.find(mds);
2366 if (it != in->caps.end()) {
2367 request->sent_on_mseq = it->second.mseq;
2368 }
2369 }
2370
2371 session->requests.push_back(&request->item);
2372
2373 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2374 session->con->send_message2(std::move(r));
2375 }
2376
2377 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2378 {
2379 auto req = make_message<MClientRequest>(request->get_op());
2380 req->set_tid(request->tid);
2381 req->set_stamp(request->op_stamp);
2382 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2383
2384 // if the filepaths haven't been set, set them!
2385 if (request->path.empty()) {
2386 Inode *in = request->inode();
2387 Dentry *de = request->dentry();
2388 if (in)
2389 in->make_nosnap_relative_path(request->path);
2390 else if (de) {
2391 if (de->inode)
2392 de->inode->make_nosnap_relative_path(request->path);
2393 else if (de->dir) {
2394 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2395 request->path.push_dentry(de->name);
2396 }
2397 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2398 << " No path, inode, or appropriately-endowed dentry given!"
2399 << dendl;
2400 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2401 << " No path, inode, or dentry given!"
2402 << dendl;
2403 }
2404 req->set_filepath(request->get_filepath());
2405 req->set_filepath2(request->get_filepath2());
2406 req->set_alternate_name(request->alternate_name);
2407 req->set_data(request->data);
2408 req->set_retry_attempt(request->retry_attempt++);
2409 req->head.num_fwd = request->num_fwd;
2410 const gid_t *_gids;
2411 int gid_count = request->perms.get_gids(&_gids);
2412 req->set_gid_list(gid_count, _gids);
2413 return req;
2414 }
2415
2416
2417
2418 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2419 {
2420 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2421
2422 std::scoped_lock cl(client_lock);
2423 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2424 if (!session) {
2425 return;
2426 }
2427 ceph_tid_t tid = fwd->get_tid();
2428
2429 if (mds_requests.count(tid) == 0) {
2430 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2431 return;
2432 }
2433
2434 MetaRequest *request = mds_requests[tid];
2435 ceph_assert(request);
2436
2437 // reset retry counter
2438 request->retry_attempt = 0;
2439
2440 // the request was forwarded; reset it and resend to the
2441 // destination mds.
2442 ldout(cct, 10) << __func__ << " tid " << tid
2443 << " fwd " << fwd->get_num_fwd()
2444 << " times, resending to mds."
2445 << fwd->get_dest_mds()
2446 << dendl;
2447
2448 request->mds = -1;
2449 request->item.remove_myself();
2450 request->num_fwd = fwd->get_num_fwd();
2451 request->resend_mds = fwd->get_dest_mds();
2452 request->caller_cond->notify_all();
2453 }
2454
2455 bool Client::is_dir_operation(MetaRequest *req)
2456 {
2457 int op = req->get_op();
2458 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2459 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2460 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2461 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2462 return true;
2463 return false;
2464 }
2465
2466 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2467 {
2468 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2469
2470 std::scoped_lock cl(client_lock);
2471 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2472 if (!session) {
2473 return;
2474 }
2475
2476 ceph_tid_t tid = reply->get_tid();
2477 bool is_safe = reply->is_safe();
2478
2479 if (mds_requests.count(tid) == 0) {
2480 lderr(cct) << __func__ << " no pending request on tid " << tid
2481 << " safe is:" << is_safe << dendl;
2482 return;
2483 }
2484 MetaRequest *request = mds_requests.at(tid);
2485
2486 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2487 << " tid " << tid << dendl;
2488
2489 if (request->got_unsafe && !is_safe) {
2490 //duplicate response
2491 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2492 << mds_num << " safe:" << is_safe << dendl;
2493 return;
2494 }
2495
2496 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
2497 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2498 << " from mds." << request->mds << dendl;
2499 request->send_to_auth = true;
2500 request->resend_mds = choose_target_mds(request);
2501 Inode *in = request->inode();
2502 std::map<mds_rank_t, Cap>::const_iterator it;
2503 if (request->resend_mds >= 0 &&
2504 request->resend_mds == request->mds &&
2505 (in == NULL ||
2506 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2507 request->sent_on_mseq == it->second.mseq)) {
2508 ldout(cct, 20) << "have to return ESTALE" << dendl;
2509 } else {
2510 request->caller_cond->notify_all();
2511 return;
2512 }
2513 }
2514
2515 ceph_assert(!request->reply);
2516 request->reply = reply;
2517 insert_trace(request, session);
2518
2519 // Handle unsafe reply
2520 if (!is_safe) {
2521 request->got_unsafe = true;
2522 session->unsafe_requests.push_back(&request->unsafe_item);
2523 if (is_dir_operation(request)) {
2524 Inode *dir = request->inode();
2525 ceph_assert(dir);
2526 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2527 }
2528 if (request->target) {
2529 InodeRef &in = request->target;
2530 in->unsafe_ops.push_back(&request->unsafe_target_item);
2531 }
2532 }
2533
2534 // Only signal the caller once (on the first reply):
2535 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2536 if (!is_safe || !request->got_unsafe) {
2537 ceph::condition_variable cond;
2538 request->dispatch_cond = &cond;
2539
2540 // wake up waiter
2541 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2542 request->caller_cond->notify_all();
2543
2544 // wake for kick back
2545 std::unique_lock l{client_lock, std::adopt_lock};
2546 cond.wait(l, [tid, request, &cond, this] {
2547 if (request->dispatch_cond) {
2548 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2549 << tid << " " << &cond << dendl;
2550 }
2551 return !request->dispatch_cond;
2552 });
2553 l.release();
2554 }
2555
2556 if (is_safe) {
2557 // the filesystem change is committed to disk
2558 // we're done, clean up
2559 if (request->got_unsafe) {
2560 request->unsafe_item.remove_myself();
2561 request->unsafe_dir_item.remove_myself();
2562 request->unsafe_target_item.remove_myself();
2563 signal_cond_list(request->waitfor_safe);
2564 }
2565 request->item.remove_myself();
2566 unregister_request(request);
2567 }
2568 if (is_unmounting())
2569 mount_cond.notify_all();
2570 }
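// Reply sequencing (illustrative): for a mutating op the MDS may send an
// unsafe reply (journaled, not yet durable) followed later by a safe one.
// The caller is woken exactly once -- on the unsafe reply if one arrives,
// otherwise on the safe reply -- while only the safe reply unregisters
// the request and releases waiters on waitfor_safe.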
2571
2572 void Client::_handle_full_flag(int64_t pool)
2573 {
2574 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2575 << "on " << pool << dendl;
2576 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2577 // to do this rather than blocking, because otherwise when we fill up we
2578 // potentially lock caps forever on files with dirty pages, and we need
2579 // to be able to release those caps to the MDS so that it can delete files
2580 // and free up space.
2581 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2582
2583 // For all inodes with layouts in this pool and a pending flush write op
2584 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2585 // from ObjectCacher so that it doesn't re-issue the write in response to
2586 // the ENOSPC error.
2587 // Fortunately since we're cancelling everything in a given pool, we don't
2588 // need to know which ops belong to which ObjectSet, we can just blow all
2589 // the un-flushed cached data away and mark any dirty inodes' async_err
2590 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2591 // affecting this pool, and all the objectsets we're purging were also
2592 // in this pool.
2593 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2594 i != inode_map.end(); ++i)
2595 {
2596 Inode *inode = i->second;
2597 if (inode->oset.dirty_or_tx
2598 && (pool == -1 || inode->layout.pool_id == pool)) {
2599 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2600 << " has dirty objects, purging and setting ENOSPC" << dendl;
2601 objectcacher->purge_set(&inode->oset);
2602 inode->set_async_err(-CEPHFS_ENOSPC);
2603 }
2604 }
2605
2606 if (cancelled_epoch != (epoch_t)-1) {
2607 set_cap_epoch_barrier(cancelled_epoch);
2608 }
2609 }
2610
2611 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2612 {
2613 std::set<entity_addr_t> new_blocklists;
2614
2615 std::scoped_lock cl(client_lock);
2616 objecter->consume_blocklist_events(&new_blocklists);
2617
2618 const auto myaddrs = messenger->get_myaddrs();
2619 bool new_blocklist = false;
2620 bool prenautilus = objecter->with_osdmap(
2621 [&](const OSDMap& o) {
2622 return o.require_osd_release < ceph_release_t::nautilus;
2623 });
2624 if (!blocklisted) {
2625 for (auto a : myaddrs.v) {
2626 // blocklist entries are always TYPE_ANY for nautilus+
2627 a.set_type(entity_addr_t::TYPE_ANY);
2628 if (new_blocklists.count(a)) {
2629 new_blocklist = true;
2630 break;
2631 }
2632 if (prenautilus) {
2633 // ...except pre-nautilus, they were TYPE_LEGACY
2634 a.set_type(entity_addr_t::TYPE_LEGACY);
2635 if (new_blocklists.count(a)) {
2636 new_blocklist = true;
2637 break;
2638 }
2639 }
2640 }
2641 }
2642 if (new_blocklist) {
2643 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2644 return o.get_epoch();
2645 });
2646 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2647 blocklisted = true;
2648
2649 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2650
2651 // Since we know all our OSD ops will fail, cancel them all preemptively,
2652 // so that on an unhealthy cluster we can umount promptly even if e.g.
2653 // some PGs were inaccessible.
2654 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2655
2656 }
2657
2658 if (blocklisted) {
2659 // Handle case where we were blocklisted but no longer are
2660 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2661 return o.is_blocklisted(myaddrs);});
2662 }
2663
2664 // While blocklisted, keep subscribing to the next osdmap
2665 // until this client is no longer blocklisted.
2666 if (blocklisted) {
2667 objecter->maybe_request_map();
2668 }
2669
2670 if (objecter->osdmap_full_flag()) {
2671 _handle_full_flag(-1);
2672 } else {
2673 // Accumulate local list of full pools so that I can drop
2674 // the objecter lock before re-entering objecter in
2675 // cancel_writes
2676 std::vector<int64_t> full_pools;
2677
2678 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2679 for (const auto& kv : o.get_pools()) {
2680 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2681 full_pools.push_back(kv.first);
2682 }
2683 }
2684 });
2685
2686 for (auto p : full_pools)
2687 _handle_full_flag(p);
2688
2689 // Subscribe to subsequent maps to watch for the full flag going
2690 // away. For the global full flag objecter does this for us, but
2691 // it pays no attention to the per-pool full flag so in this branch
2692 // we do it ourselves.
2693 if (!full_pools.empty()) {
2694 objecter->maybe_request_map();
2695 }
2696 }
2697 }
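// Blocklist matching note (illustrative): each of our addresses is checked
// with its type forced to TYPE_ANY, matching how nautilus+ osdmaps record
// blocklist entries; on pre-nautilus maps the address is re-checked as
// TYPE_LEGACY, since entries were recorded that way before nautilus.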
2698
2699
2700 // ------------------------
2701 // incoming messages
2702
2703
2704 bool Client::ms_dispatch2(const MessageRef &m)
2705 {
2706 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2707 if (!iref_reader.is_state_satisfied()) {
2708 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2709 return true;
2710 }
2711
2712 switch (m->get_type()) {
2713 // mounting and mds sessions
2714 case CEPH_MSG_MDS_MAP:
2715 handle_mds_map(ref_cast<MMDSMap>(m));
2716 break;
2717 case CEPH_MSG_FS_MAP:
2718 handle_fs_map(ref_cast<MFSMap>(m));
2719 break;
2720 case CEPH_MSG_FS_MAP_USER:
2721 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2722 break;
2723 case CEPH_MSG_CLIENT_SESSION:
2724 handle_client_session(ref_cast<MClientSession>(m));
2725 break;
2726
2727 case CEPH_MSG_OSD_MAP:
2728 handle_osd_map(ref_cast<MOSDMap>(m));
2729 break;
2730
2731 // requests
2732 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2733 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2734 break;
2735 case CEPH_MSG_CLIENT_REPLY:
2736 handle_client_reply(ref_cast<MClientReply>(m));
2737 break;
2738
2739 // reclaim reply
2740 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2741 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2742 break;
2743
2744 case CEPH_MSG_CLIENT_SNAP:
2745 handle_snap(ref_cast<MClientSnap>(m));
2746 break;
2747 case CEPH_MSG_CLIENT_CAPS:
2748 handle_caps(ref_cast<MClientCaps>(m));
2749 break;
2750 case CEPH_MSG_CLIENT_LEASE:
2751 handle_lease(ref_cast<MClientLease>(m));
2752 break;
2753 case MSG_COMMAND_REPLY:
2754 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2755 handle_command_reply(ref_cast<MCommandReply>(m));
2756 } else {
2757 return false;
2758 }
2759 break;
2760 case CEPH_MSG_CLIENT_QUOTA:
2761 handle_quota(ref_cast<MClientQuota>(m));
2762 break;
2763
2764 default:
2765 return false;
2766 }
2767
2768 // unmounting?
2769 std::scoped_lock cl(client_lock);
2770 if (is_unmounting()) {
2771 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2772 << "+" << inode_map.size() << dendl;
2773 uint64_t size = lru.lru_get_size() + inode_map.size();
2774 trim_cache();
2775 if (size > lru.lru_get_size() + inode_map.size()) {
2776 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2777 mount_cond.notify_all();
2778 } else {
2779 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2780 << "+" << inode_map.size() << dendl;
2781 }
2782 }
2783
2784 return true;
2785 }
2786
2787 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2788 {
2789 std::scoped_lock cl(client_lock);
2790 fsmap.reset(new FSMap(m->get_fsmap()));
2791
2792 signal_cond_list(waiting_for_fsmap);
2793
2794 monclient->sub_got("fsmap", fsmap->get_epoch());
2795 }
2796
2797 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2798 {
2799 std::scoped_lock cl(client_lock);
2800 fsmap_user.reset(new FSMapUser);
2801 *fsmap_user = m->get_fsmap();
2802
2803 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2804 signal_cond_list(waiting_for_fsmap);
2805 }
2806
2807 // Cancel all the commands for missing or laggy GIDs
2808 void Client::cancel_commands(const MDSMap& newmap)
2809 {
2810 std::vector<ceph_tid_t> cancel_ops;
2811
2812 std::scoped_lock cmd_lock(command_lock);
2813 auto &commands = command_table.get_commands();
2814 for (const auto &[tid, op] : commands) {
2815 const mds_gid_t op_mds_gid = op.mds_gid;
2816 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2817 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2818 cancel_ops.push_back(tid);
2819 if (op.outs) {
2820 std::ostringstream ss;
2821 ss << "MDS " << op_mds_gid << " went away";
2822 *(op.outs) = ss.str();
2823 }
2824 /*
2825 * No need to call con->mark_down() under
2826 * client_lock here, because the con has
2827 * its own lock.
2828 */
2829 op.con->mark_down();
2830 if (op.on_finish)
2831 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2832 }
2833 }
2834
2835 for (const auto &tid : cancel_ops)
2836 command_table.erase(tid);
2837 }
2838
2839 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2840 {
2841 std::unique_lock cl(client_lock);
2842 if (m->get_epoch() <= mdsmap->get_epoch()) {
2843 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2844 << " is identical to or older than our "
2845 << mdsmap->get_epoch() << dendl;
2846 return;
2847 }
2848
2849 cl.unlock();
2850 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2851 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2852 _mdsmap->decode(m->get_encoded());
2853 cancel_commands(*_mdsmap.get());
2854 cl.lock();
2855
2856 _mdsmap.swap(mdsmap);
2857
2858 // reset session
2859 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2860 mds_rank_t mds = p->first;
2861 MetaSession *session = &p->second;
2862 ++p;
2863
2864 int oldstate = _mdsmap->get_state(mds);
2865 int newstate = mdsmap->get_state(mds);
2866 if (!mdsmap->is_up(mds)) {
2867 session->con->mark_down();
2868 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2869 auto old_inc = _mdsmap->get_incarnation(mds);
2870 auto new_inc = mdsmap->get_incarnation(mds);
2871 if (old_inc != new_inc) {
2872 ldout(cct, 1) << "mds incarnation changed from "
2873 << old_inc << " to " << new_inc << dendl;
2874 oldstate = MDSMap::STATE_NULL;
2875 }
2876 session->con->mark_down();
2877 session->addrs = mdsmap->get_addrs(mds);
2878 // When a new MDS starts to take over, notify the kernel to trim unused
2879 // entries in its dcache/icache. Hopefully the kernel will release some
2880 // unused inodes before the new MDS enters the reconnect state.
2881 trim_cache_for_reconnect(session);
2882 } else if (oldstate == newstate)
2883 continue; // no change
2884
2885 session->mds_state = newstate;
2886 if (newstate == MDSMap::STATE_RECONNECT) {
2887 session->con = messenger->connect_to_mds(session->addrs);
2888 send_reconnect(session);
2889 } else if (newstate > MDSMap::STATE_RECONNECT) {
2890 if (oldstate < MDSMap::STATE_RECONNECT) {
2891 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2892 _closed_mds_session(session);
2893 continue;
2894 }
2895 if (newstate >= MDSMap::STATE_ACTIVE) {
2896 if (oldstate < MDSMap::STATE_ACTIVE) {
2897 // kick new requests
2898 kick_requests(session);
2899 kick_flushing_caps(session);
2900 signal_context_list(session->waiting_for_open);
2901 wake_up_session_caps(session, true);
2902 }
2903 connect_mds_targets(mds);
2904 }
2905 } else if (newstate == MDSMap::STATE_NULL &&
2906 mds >= mdsmap->get_max_mds()) {
2907 _closed_mds_session(session);
2908 }
2909 }
2910
2911 // kick any waiting threads
2912 signal_cond_list(waiting_for_mdsmap);
2913
2914 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2915 }
2916
2917 void Client::send_reconnect(MetaSession *session)
2918 {
2919 mds_rank_t mds = session->mds_num;
2920 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2921
2922 // trim unused caps to reduce MDS's cache rejoin time
2923 trim_cache_for_reconnect(session);
2924
2925 session->readonly = false;
2926
2927 session->release.reset();
2928
2929 // reset my cap seq number
2930 session->seq = 0;
2931 // connect to the mds' offload targets
2932 connect_mds_targets(mds);
2933 // make sure unsafe requests get saved
2934 resend_unsafe_requests(session);
2935
2936 early_kick_flushing_caps(session);
2937
2938 auto m = make_message<MClientReconnect>();
2939 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2940
2941 // i have an open session.
2942 ceph::unordered_set<inodeno_t> did_snaprealm;
2943 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2944 p != inode_map.end();
2945 ++p) {
2946 Inode *in = p->second;
2947 auto it = in->caps.find(mds);
2948 if (it != in->caps.end()) {
2949 if (allow_multi &&
2950 m->get_approx_size() >=
2951 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2952 m->mark_more();
2953 session->con->send_message2(std::move(m));
2954
2955 m = make_message<MClientReconnect>();
2956 }
2957
2958 Cap &cap = it->second;
2959 ldout(cct, 10) << " caps on " << p->first
2960 << " " << ccap_string(cap.issued)
2961 << " wants " << ccap_string(in->caps_wanted())
2962 << dendl;
2963 filepath path;
2964 in->make_short_path(path);
2965 ldout(cct, 10) << " path " << path << dendl;
2966
2967 bufferlist flockbl;
2968 _encode_filelocks(in, flockbl);
2969
2970 cap.seq = 0; // reset seq.
2971 cap.issue_seq = 0; // reset seq.
2972 cap.mseq = 0; // reset seq.
2973 // cap gen should catch up with session cap_gen
2974 if (cap.gen < session->cap_gen) {
2975 cap.gen = session->cap_gen;
2976 cap.issued = cap.implemented = CEPH_CAP_PIN;
2977 } else {
2978 cap.issued = cap.implemented;
2979 }
2980 snapid_t snap_follows = 0;
2981 if (!in->cap_snaps.empty())
2982 snap_follows = in->cap_snaps.begin()->first;
2983
2984 m->add_cap(p->first.ino,
2985 cap.cap_id,
2986 path.get_ino(), path.get_path(), // ino
2987 in->caps_wanted(), // wanted
2988 cap.issued, // issued
2989 in->snaprealm->ino,
2990 snap_follows,
2991 flockbl);
2992
2993 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2994 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2995 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2996 did_snaprealm.insert(in->snaprealm->ino);
2997 }
2998 }
2999 }
3000
3001 if (!allow_multi)
3002 m->set_encoding_version(0); // use connection features to choose encoding
3003 session->con->send_message2(std::move(m));
3004
3005 mount_cond.notify_all();
3006
3007 if (session->reclaim_state == MetaSession::RECLAIMING)
3008 signal_cond_list(waiting_for_reclaim);
3009 }
3010
3011
3012 void Client::kick_requests(MetaSession *session)
3013 {
3014 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3015 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3016 p != mds_requests.end();
3017 ++p) {
3018 MetaRequest *req = p->second;
3019 if (req->got_unsafe)
3020 continue;
3021 if (req->aborted()) {
3022 if (req->caller_cond) {
3023 req->kick = true;
3024 req->caller_cond->notify_all();
3025 }
3026 continue;
3027 }
3028 if (req->retry_attempt > 0)
3029 continue; // new requests only
3030 if (req->mds == session->mds_num) {
3031 send_request(p->second, session);
3032 }
3033 }
3034 }
3035
3036 void Client::resend_unsafe_requests(MetaSession *session)
3037 {
3038 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3039 !iter.end();
3040 ++iter)
3041 send_request(*iter, session);
3042
3043 // also re-send old requests when the MDS enters the reconnect stage, so
3044 // that the MDS can process completed requests in the clientreplay stage.
3045 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3046 p != mds_requests.end();
3047 ++p) {
3048 MetaRequest *req = p->second;
3049 if (req->got_unsafe)
3050 continue;
3051 if (req->aborted())
3052 continue;
3053 if (req->retry_attempt == 0)
3054 continue; // old requests only
3055 if (req->mds == session->mds_num)
3056 send_request(req, session, true);
3057 }
3058 }
3059
3060 void Client::wait_unsafe_requests()
3061 {
3062 list<MetaRequest*> last_unsafe_reqs;
3063 for (const auto &p : mds_sessions) {
3064 const MetaSession &s = p.second;
3065 if (!s.unsafe_requests.empty()) {
3066 MetaRequest *req = s.unsafe_requests.back();
3067 req->get();
3068 last_unsafe_reqs.push_back(req);
3069 }
3070 }
3071
3072 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3073 p != last_unsafe_reqs.end();
3074 ++p) {
3075 MetaRequest *req = *p;
3076 if (req->unsafe_item.is_on_list())
3077 wait_on_list(req->waitfor_safe);
3078 put_request(req);
3079 }
3080 }
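// Note: waiting on only the newest unsafe request per session suffices
// because the MDS commits requests in order -- once the last unsafe
// request on a session is safe, all earlier ones on that session are too.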
3081
3082 void Client::kick_requests_closed(MetaSession *session)
3083 {
3084 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3085 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3086 p != mds_requests.end(); ) {
3087 MetaRequest *req = p->second;
3088 ++p;
3089 if (req->mds == session->mds_num) {
3090 if (req->caller_cond) {
3091 req->kick = true;
3092 req->caller_cond->notify_all();
3093 }
3094 req->item.remove_myself();
3095 if (req->got_unsafe) {
3096 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3097 req->unsafe_item.remove_myself();
3098 if (is_dir_operation(req)) {
3099 Inode *dir = req->inode();
3100 ceph_assert(dir);
3101 dir->set_async_err(-CEPHFS_EIO);
3102 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3103 << dir->ino << " " << req->get_tid() << dendl;
3104 req->unsafe_dir_item.remove_myself();
3105 }
3106 if (req->target) {
3107 InodeRef &in = req->target;
3108 in->set_async_err(-CEPHFS_EIO);
3109 lderr(cct) << "kick_requests_closed drop req of inode : "
3110 << in->ino << " " << req->get_tid() << dendl;
3111 req->unsafe_target_item.remove_myself();
3112 }
3113 signal_cond_list(req->waitfor_safe);
3114 unregister_request(req);
3115 }
3116 }
3117 }
3118 ceph_assert(session->requests.empty());
3119 ceph_assert(session->unsafe_requests.empty());
3120 }
3121
3122
3123
3124
3125 /************
3126 * leases
3127 */
3128
3129 void Client::got_mds_push(MetaSession *s)
3130 {
3131 s->seq++;
3132 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3133 if (s->state == MetaSession::STATE_CLOSING) {
3134 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3135 }
3136 }
3137
3138 void Client::handle_lease(const MConstRef<MClientLease>& m)
3139 {
3140 ldout(cct, 10) << __func__ << " " << *m << dendl;
3141
3142 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3143 mds_rank_t mds = mds_rank_t(m->get_source().num());
3144
3145 std::scoped_lock cl(client_lock);
3146 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3147 if (!session) {
3148 return;
3149 }
3150
3151 got_mds_push(session);
3152
3153 ceph_seq_t seq = m->get_seq();
3154
3155 Inode *in;
3156 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3157 if (inode_map.count(vino) == 0) {
3158 ldout(cct, 10) << " don't have vino " << vino << dendl;
3159 goto revoke;
3160 }
3161 in = inode_map[vino];
3162
3163 if (m->get_mask() & CEPH_LEASE_VALID) {
3164 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3165 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3166 goto revoke;
3167 }
3168 Dentry *dn = in->dir->dentries[m->dname];
3169 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3170 dn->lease_mds = -1;
3171 }
3172
3173 revoke:
3174 {
3175 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3176 m->get_mask(), m->get_ino(),
3177 m->get_first(), m->get_last(), m->dname);
3178 m->get_connection()->send_message2(std::move(reply));
3179 }
3180 }
3181
3182 void Client::_put_inode(Inode *in, int n)
3183 {
3184 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3185
3186 int left = in->get_nref();
3187 ceph_assert(left >= n + 1);
3188 in->iput(n);
3189 left -= n;
3190 if (left == 1) { // the last one will be held by the inode_map
3191 // release any caps
3192 remove_all_caps(in);
3193
3194 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3195 bool unclean = objectcacher->release_set(&in->oset);
3196 ceph_assert(!unclean);
3197 inode_map.erase(in->vino());
3198 if (use_faked_inos())
3199 _release_faked_ino(in);
3200
3201 if (root == nullptr) {
3202 root_ancestor = 0;
3203 while (!root_parents.empty())
3204 root_parents.erase(root_parents.begin());
3205 }
3206
3207 in->iput();
3208 }
3209 }
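// Worked example (illustrative): with get_nref() == 3 and n == 2, 'left'
// drops to 1, i.e. only the inode_map reference remains; caps are then
// released, the inode is erased from inode_map, and the final iput()
// frees it.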
3210
3211 void Client::delay_put_inodes(bool wakeup)
3212 {
3213 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3214
3215 std::map<Inode*,int> release;
3216 {
3217 std::scoped_lock dl(delay_i_lock);
3218 release.swap(delay_i_release);
3219 }
3220
3221 if (release.empty())
3222 return;
3223
3224 for (auto &[in, cnt] : release)
3225 _put_inode(in, cnt);
3226
3227 if (wakeup)
3228 mount_cond.notify_all();
3229 }
3230
3231 void Client::put_inode(Inode *in, int n)
3232 {
3233 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3234
3235 std::scoped_lock dl(delay_i_lock);
3236 delay_i_release[in] += n;
3237 }
3238
3239 void Client::close_dir(Dir *dir)
3240 {
3241 Inode *in = dir->parent_inode;
3242 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3243 ceph_assert(dir->is_empty());
3244 ceph_assert(in->dir == dir);
3245 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3246 if (!in->dentries.empty())
3247 in->get_first_parent()->put(); // unpin dentry
3248
3249 delete in->dir;
3250 in->dir = 0;
3251 put_inode(in); // unpin inode
3252 }
3253
3254 /**
3255 * Don't call this with in==NULL; use get_or_create for that.
3256 * Leave dn set to its default NULL unless you're trying to add
3257 * a new inode to a pre-created Dentry.
3258 */
3259 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3260 {
3261 if (!dn) {
3262 // create a new Dentry
3263 dn = new Dentry(dir, name);
3264
3265 lru.lru_insert_mid(dn); // mid or top?
3266
3267 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3268 << " dn " << dn << " (new dn)" << dendl;
3269 } else {
3270 ceph_assert(!dn->inode);
3271 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3272 << " dn " << dn << " (old dn)" << dendl;
3273 }
3274
3275 if (in) { // link to inode
3276 InodeRef tmp_ref;
3277 // only one parent for directories!
3278 if (in->is_dir() && !in->dentries.empty()) {
3279 tmp_ref = in; // prevent unlink below from freeing the inode.
3280 Dentry *olddn = in->get_first_parent();
3281 ceph_assert(olddn->dir != dir || olddn->name != name);
3282 Inode *old_diri = olddn->dir->parent_inode;
3283 clear_dir_complete_and_ordered(old_diri, true);
3284 unlink(olddn, true, true); // keep dir, dentry
3285 }
3286
3287 dn->link(in);
3288 inc_dentry_nr();
3289 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3290 }
3291
3292 return dn;
3293 }
3294
3295 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3296 {
3297 InodeRef in(dn->inode);
3298 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3299 << " inode " << dn->inode << dendl;
3300
3301 // unlink from inode
3302 if (dn->inode) {
3303 dn->unlink();
3304 dec_dentry_nr();
3305 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3306 }
3307
3308 if (keepdentry) {
3309 dn->lease_mds = -1;
3310 } else {
3311 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3312
3313 // unlink from dir
3314 Dir *dir = dn->dir;
3315 dn->detach();
3316
3317 // delete dentry
3318 lru.lru_remove(dn);
3319 dn->put();
3320
3321 if (dir->is_empty() && !keepdir)
3322 close_dir(dir);
3323 }
3324 }
3325
3326 /**
3327 * For asynchronous flushes, check for errors from the IO and
3328 * update the inode if necessary
3329 */
3330 class C_Client_FlushComplete : public Context {
3331 private:
3332 Client *client;
3333 InodeRef inode;
3334 public:
3335 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3336 void finish(int r) override {
3337 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3338 if (r != 0) {
3339 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3340 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3341 << " 0x" << std::hex << inode->ino << std::dec
3342 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3343 inode->set_async_err(r);
3344 }
3345 }
3346 };
3347
3348
3349 /****
3350 * caps
3351 */
3352
3353 void Client::get_cap_ref(Inode *in, int cap)
3354 {
3355 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3356 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3357 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3358 in->iget();
3359 }
3360 if ((cap & CEPH_CAP_FILE_CACHE) &&
3361 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3362 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3363 in->iget();
3364 }
3365 in->get_cap_ref(cap);
3366 }
3367
3368 void Client::put_cap_ref(Inode *in, int cap)
3369 {
3370 int last = in->put_cap_ref(cap);
3371 if (last) {
3372 int put_nref = 0;
3373 int drop = last & ~in->caps_issued();
3374 if (in->snapid == CEPH_NOSNAP) {
3375 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3376 !in->cap_snaps.empty() &&
3377 in->cap_snaps.rbegin()->second.writing) {
3378 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3379 in->cap_snaps.rbegin()->second.writing = 0;
3380 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3381 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3382 }
3383 if (last & CEPH_CAP_FILE_BUFFER) {
3384 for (auto &p : in->cap_snaps)
3385 p.second.dirty_data = 0;
3386 signal_cond_list(in->waitfor_commit);
3387 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3388 ++put_nref;
3389 }
3390 }
3391 if (last & CEPH_CAP_FILE_CACHE) {
3392 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3393 ++put_nref;
3394 }
3395 if (drop)
3396 check_caps(in, 0);
3397 if (put_nref)
3398 put_inode(in, put_nref);
3399 }
3400 }
3401
3402 // get caps for a given file handle -- the inode should have @need caps
3403 // issued by the mds and @want caps not revoked (or not under revocation).
3404 // this routine blocks till the cap requirement is satisfied. it also
3405 // records a capability hit when the cap requirement succeeds.
3406 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3407 {
3408 Inode *in = fh->inode.get();
3409
3410 int r = check_pool_perm(in, need);
3411 if (r < 0)
3412 return r;
3413
3414 while (1) {
3415 int file_wanted = in->caps_file_wanted();
3416 if ((file_wanted & need) != need) {
3417 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3418 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3419 << dendl;
3420 return -CEPHFS_EBADF;
3421 }
3422
3423 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3424 return -CEPHFS_EBADF;
3425
3426 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3427 return -CEPHFS_EIO;
3428
3429 int implemented;
3430 int have = in->caps_issued(&implemented);
3431
3432 bool waitfor_caps = false;
3433 bool waitfor_commit = false;
3434
3435 if (have & need & CEPH_CAP_FILE_WR) {
3436 if (endoff > 0) {
3437 if ((endoff >= (loff_t)in->max_size ||
3438 endoff > (loff_t)(in->size << 1)) &&
3439 endoff > (loff_t)in->wanted_max_size) {
3440 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3441 in->wanted_max_size = endoff;
3442 }
3443 if (in->wanted_max_size > in->max_size &&
3444 in->wanted_max_size > in->requested_max_size)
3445 check_caps(in, 0);
3446 }
3447
3448 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3449 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3450 waitfor_caps = true;
3451 }
3452 if (!in->cap_snaps.empty()) {
3453 if (in->cap_snaps.rbegin()->second.writing) {
3454 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3455 waitfor_caps = true;
3456 }
3457 for (auto &p : in->cap_snaps) {
3458 if (p.second.dirty_data) {
3459 waitfor_commit = true;
3460 break;
3461 }
3462 }
3463 if (waitfor_commit) {
3464 _flush(in, new C_Client_FlushComplete(this, in));
3465 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3466 }
3467 }
3468 }
3469
3470 if (!waitfor_caps && !waitfor_commit) {
3471 if ((have & need) == need) {
3472 int revoking = implemented & ~have;
3473 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3474 << " need " << ccap_string(need) << " want " << ccap_string(want)
3475 << " revoking " << ccap_string(revoking)
3476 << dendl;
3477 if ((revoking & want) == 0) {
3478 *phave = need | (have & want);
3479 in->get_cap_ref(need);
3480 cap_hit();
3481 return 0;
3482 }
3483 }
3484 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3485 waitfor_caps = true;
3486 }
3487
3488 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3489 in->auth_cap->session->readonly)
3490 return -CEPHFS_EROFS;
3491
3492 if (in->flags & I_CAP_DROPPED) {
3493 int mds_wanted = in->caps_mds_wanted();
3494 if ((mds_wanted & need) != need) {
3495 int ret = _renew_caps(in);
3496 if (ret < 0)
3497 return ret;
3498 continue;
3499 }
3500 if (!(file_wanted & ~mds_wanted))
3501 in->flags &= ~I_CAP_DROPPED;
3502 }
3503
3504 if (waitfor_caps)
3505 wait_on_list(in->waitfor_caps);
3506 else if (waitfor_commit)
3507 wait_on_list(in->waitfor_commit);
3508 }
3509 }
3510
3511 int Client::get_caps_used(Inode *in)
3512 {
3513 unsigned used = in->caps_used();
3514 if (!(used & CEPH_CAP_FILE_CACHE) &&
3515 !objectcacher->set_is_empty(&in->oset))
3516 used |= CEPH_CAP_FILE_CACHE;
3517 return used;
3518 }
3519
3520 void Client::cap_delay_requeue(Inode *in)
3521 {
3522 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3523 in->hold_caps_until = ceph_clock_now();
3524 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3525 delayed_list.push_back(&in->delay_cap_item);
3526 }
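// The inode then sits on delayed_list until hold_caps_until passes
// (now + client_caps_release_delay), letting the periodic tick batch
// cap releases instead of sending one message per dropped reference.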
3527
3528 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3529 int flags, int used, int want, int retain,
3530 int flush, ceph_tid_t flush_tid)
3531 {
3532 int held = cap->issued | cap->implemented;
3533 int revoking = cap->implemented & ~cap->issued;
3534 retain &= ~revoking;
3535 int dropping = cap->issued & ~retain;
3536 int op = CEPH_CAP_OP_UPDATE;
3537
3538 ldout(cct, 10) << __func__ << " " << *in
3539 << " mds." << session->mds_num << " seq " << cap->seq
3540 << " used " << ccap_string(used)
3541 << " want " << ccap_string(want)
3542 << " flush " << ccap_string(flush)
3543 << " retain " << ccap_string(retain)
3544 << " held "<< ccap_string(held)
3545 << " revoking " << ccap_string(revoking)
3546 << " dropping " << ccap_string(dropping)
3547 << dendl;
3548
3549 if (cct->_conf->client_inject_release_failure && revoking) {
3550 const int would_have_issued = cap->issued & retain;
3551 const int would_have_implemented = cap->implemented & (cap->issued | used);
3552 // Simulated bug:
3553 // - tell the server we think issued is whatever they issued plus whatever we implemented
3554 // - leave what we have implemented in place
3555 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3556 cap->issued = cap->issued | cap->implemented;
3557
3558 // Make an exception for revoking xattr caps: we are injecting
3559 // failure to release other caps, but allow xattr because client
3560 // will block on xattr ops if it can't release these to MDS (#9800)
3561 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3562 cap->issued ^= xattr_mask & revoking;
3563 cap->implemented ^= xattr_mask & revoking;
3564
3565 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3566 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3567 } else {
3568 // Normal behaviour
3569 cap->issued &= retain;
3570 cap->implemented &= cap->issued | used;
3571 }
3572
3573 snapid_t follows = 0;
3574
3575 if (flush)
3576 follows = in->snaprealm->get_snap_context().seq;
3577
3578 auto m = make_message<MClientCaps>(op,
3579 in->ino,
3580 0,
3581 cap->cap_id, cap->seq,
3582 cap->implemented,
3583 want,
3584 flush,
3585 cap->mseq,
3586 cap_epoch_barrier);
3587 m->caller_uid = in->cap_dirtier_uid;
3588 m->caller_gid = in->cap_dirtier_gid;
3589
3590 m->head.issue_seq = cap->issue_seq;
3591 m->set_tid(flush_tid);
3592
3593 m->head.uid = in->uid;
3594 m->head.gid = in->gid;
3595 m->head.mode = in->mode;
3596
3597 m->head.nlink = in->nlink;
3598
3599 if (flush & CEPH_CAP_XATTR_EXCL) {
3600 encode(in->xattrs, m->xattrbl);
3601 m->head.xattr_version = in->xattr_version;
3602 }
3603
3604 m->size = in->size;
3605 m->max_size = in->max_size;
3606 m->truncate_seq = in->truncate_seq;
3607 m->truncate_size = in->truncate_size;
3608 m->mtime = in->mtime;
3609 m->atime = in->atime;
3610 m->ctime = in->ctime;
3611 m->btime = in->btime;
3612 m->time_warp_seq = in->time_warp_seq;
3613 m->change_attr = in->change_attr;
3614
3615 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3616 !in->cap_snaps.empty() &&
3617 in->cap_snaps.rbegin()->second.flush_tid == 0)
3618 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3619 m->flags = flags;
3620
3621 if (flush & CEPH_CAP_FILE_WR) {
3622 m->inline_version = in->inline_version;
3623 m->inline_data = in->inline_data;
3624 }
3625
3626 in->reported_size = in->size;
3627 m->set_snap_follows(follows);
3628 cap->wanted = want;
3629 if (cap == in->auth_cap) {
3630 if (want & CEPH_CAP_ANY_FILE_WR) {
3631 m->set_max_size(in->wanted_max_size);
3632 in->requested_max_size = in->wanted_max_size;
3633 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3634 } else {
3635 in->requested_max_size = 0;
3636 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3637 }
3638 }
3639
3640 if (!session->flushing_caps_tids.empty())
3641 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3642
3643 session->con->send_message2(std::move(m));
3644 }
3645
3646 static bool is_max_size_approaching(Inode *in)
3647 {
3648 /* mds will adjust max size according to the reported size */
3649 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3650 return false;
3651 if (in->size >= in->max_size)
3652 return true;
3653 /* half of previous max_size increment has been used */
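/* e.g. reported_size=4MB, max_size=8MB: fires once size >= 6MB,
 * since (6MB << 1) = 12MB >= 8MB + 4MB */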
3654 if (in->max_size > in->reported_size &&
3655 (in->size << 1) >= in->max_size + in->reported_size)
3656 return true;
3657 return false;
3658 }
3659
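/* With lazy I/O in play, account Fc/Fb usage as Fl when the plain
 * cache/buffer caps aren't backed by what's issued (or implemented, if
 * Fl itself was revoked), so check_caps() doesn't treat lazy readers
 * and writers as using caps the MDS no longer grants. */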
3660 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3661 {
3662 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3663 return used;
3664 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3665 return used;
3666
3667 if (issued & CEPH_CAP_FILE_LAZYIO) {
3668 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3669 used &= ~CEPH_CAP_FILE_CACHE;
3670 used |= CEPH_CAP_FILE_LAZYIO;
3671 }
3672 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3673 used &= ~CEPH_CAP_FILE_BUFFER;
3674 used |= CEPH_CAP_FILE_LAZYIO;
3675 }
3676 } else {
3677 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3678 used &= ~CEPH_CAP_FILE_CACHE;
3679 used |= CEPH_CAP_FILE_LAZYIO;
3680 }
3681 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3682 used &= ~CEPH_CAP_FILE_BUFFER;
3683 used |= CEPH_CAP_FILE_LAZYIO;
3684 }
3685 }
3686 return used;
3687 }
3688
3689 /**
3690 * check_caps
3691 *
3692 * Examine currently used and wanted versus held caps. Release, flush or ack
3693 * revoked caps to the MDS as appropriate.
3694 *
3695 * @param in the inode to check
3696 * @param flags flags to apply to cap check
3697 */
3698 void Client::check_caps(Inode *in, unsigned flags)
3699 {
3700 unsigned wanted = in->caps_wanted();
3701 unsigned used = get_caps_used(in);
3702 unsigned cap_used;
3703
3704 int implemented;
3705 int issued = in->caps_issued(&implemented);
3706 int revoking = implemented & ~issued;
3707
3708 int orig_used = used;
3709 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3710
3711 int retain = wanted | used | CEPH_CAP_PIN;
3712 if (!is_unmounting() && in->nlink > 0) {
3713 if (wanted) {
3714 retain |= CEPH_CAP_ANY;
3715 } else if (in->is_dir() &&
3716 (issued & CEPH_CAP_FILE_SHARED) &&
3717 (in->flags & I_COMPLETE)) {
3718 // we do this here because we don't want to drop to Fs (and then
3719 // drop the Fs if we do a create!) if that alone makes us send lookups
3720 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3721 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3722 retain |= wanted;
3723 } else {
3724 retain |= CEPH_CAP_ANY_SHARED;
3725 // keep RD only if we didn't have the file open RW,
3726 // because then the mds would revoke it anyway to
3727 // journal max_size=0.
3728 if (in->max_size == 0)
3729 retain |= CEPH_CAP_ANY_RD;
3730 }
3731 }
3732
3733 ldout(cct, 10) << __func__ << " on " << *in
3734 << " wanted " << ccap_string(wanted)
3735 << " used " << ccap_string(used)
3736 << " issued " << ccap_string(issued)
3737 << " revoking " << ccap_string(revoking)
3738 << " flags=" << flags
3739 << dendl;
3740
3741 if (in->snapid != CEPH_NOSNAP)
3742 return; // snap caps last forever, can't write
3743
3744 if (in->caps.empty())
3745 return; // no caps held; nothing to release or flush
3746
3747 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3748 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3749 if (_release(in))
3750 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3751 }
3752
3753
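// Per-cap pass: jump to ack when we need to talk to the MDS (max_size
// bump wanted, revocation finished, wanting more caps, or unmounting);
// otherwise keep the cap quietly or requeue it for a delayed check.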
3754 for (auto &p : in->caps) {
3755 mds_rank_t mds = p.first;
3756 Cap &cap = p.second;
3757
3758 MetaSession *session = &mds_sessions.at(mds);
3759
3760 cap_used = used;
3761 if (in->auth_cap && &cap != in->auth_cap)
3762 cap_used &= ~in->auth_cap->issued;
3763
3764 revoking = cap.implemented & ~cap.issued;
3765
3766 ldout(cct, 10) << " cap mds." << mds
3767 << " issued " << ccap_string(cap.issued)
3768 << " implemented " << ccap_string(cap.implemented)
3769 << " revoking " << ccap_string(revoking) << dendl;
3770
3771 if (in->wanted_max_size > in->max_size &&
3772 in->wanted_max_size > in->requested_max_size &&
3773 &cap == in->auth_cap)
3774 goto ack;
3775
3776 /* approaching file_max? */
3777 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3778 &cap == in->auth_cap &&
3779 is_max_size_approaching(in)) {
3780 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3781 << ", reported " << in->reported_size << dendl;
3782 goto ack;
3783 }
3784
3785 /* completed revocation? */
3786 if (revoking && (revoking & cap_used) == 0) {
3787 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3788 goto ack;
3789 }
3790
3791 /* want more caps from mds? */
3792 if (wanted & ~(cap.wanted | cap.issued))
3793 goto ack;
3794
3795 if (!revoking && is_unmounting() && (cap_used == 0))
3796 goto ack;
3797
3798 if ((cap.issued & ~retain) == 0 && // and we hold nothing we'd rather drop
3799 !in->dirty_caps) // and we have no dirty caps
3800 continue;
3801
3802 if (!(flags & CHECK_CAPS_NODELAY)) {
3803 ldout(cct, 10) << "delaying cap release" << dendl;
3804 cap_delay_requeue(in);
3805 continue;
3806 }
3807
3808 ack:
3809 if (&cap == in->auth_cap) {
3810 if (in->flags & I_KICK_FLUSH) {
3811 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3812 << " to mds." << mds << dendl;
3813 kick_flushing_caps(in, session);
3814 }
3815 if (!in->cap_snaps.empty() &&
3816 in->cap_snaps.rbegin()->second.flush_tid == 0)
3817 flush_snaps(in);
3818 }
3819
3820 int flushing;
3821 int msg_flags = 0;
3822 ceph_tid_t flush_tid;
3823 if (in->auth_cap == &cap && in->dirty_caps) {
3824 flushing = mark_caps_flushing(in, &flush_tid);
3825 if (flags & CHECK_CAPS_SYNCHRONOUS)
3826 msg_flags |= MClientCaps::FLAG_SYNC;
3827 } else {
3828 flushing = 0;
3829 flush_tid = 0;
3830 }
3831
3832 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3833 flushing, flush_tid);
3834 }
3835 }
3836
3837
3838 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3839 {
3840 int used = get_caps_used(in);
3841 int dirty = in->caps_dirty();
3842 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3843
3844 if (in->cap_snaps.size() &&
3845 in->cap_snaps.rbegin()->second.writing) {
3846 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3847 return;
3848 } else if (in->caps_dirty() ||
3849 (used & CEPH_CAP_FILE_WR) ||
3850 (dirty & CEPH_CAP_ANY_WR)) {
3851 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3852 ceph_assert(capsnapem.second); /* element inserted */
3853 CapSnap &capsnap = capsnapem.first->second;
3854 capsnap.context = old_snapc;
3855 capsnap.issued = in->caps_issued();
3856 capsnap.dirty = in->caps_dirty();
3857
3858 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3859
3860 capsnap.uid = in->uid;
3861 capsnap.gid = in->gid;
3862 capsnap.mode = in->mode;
3863 capsnap.btime = in->btime;
3864 capsnap.xattrs = in->xattrs;
3865 capsnap.xattr_version = in->xattr_version;
3866 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3867 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3868
3869 if (used & CEPH_CAP_FILE_WR) {
3870 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3871 capsnap.writing = 1;
3872 } else {
3873 finish_cap_snap(in, capsnap, used);
3874 }
3875 } else {
3876 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3877 }
3878 }
3879
3880 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3881 {
3882 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3883 capsnap.size = in->size;
3884 capsnap.mtime = in->mtime;
3885 capsnap.atime = in->atime;
3886 capsnap.ctime = in->ctime;
3887 capsnap.time_warp_seq = in->time_warp_seq;
3888 capsnap.change_attr = in->change_attr;
3889 capsnap.dirty |= in->caps_dirty();
3890
3891 /* Only set it if it wasn't already set */
3892 if (capsnap.cap_dirtier_uid == -1) {
3893 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3894 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3895 }
3896
3897 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3898 capsnap.inline_data = in->inline_data;
3899 capsnap.inline_version = in->inline_version;
3900 }
3901
3902 if (used & CEPH_CAP_FILE_BUFFER) {
3903 capsnap.writing = 1;
3904 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3905 << " WRBUFFER, delaying" << dendl;
3906 } else {
3907 capsnap.dirty_data = 0;
3908 flush_snaps(in);
3909 }
3910 }
3911
3912 void Client::send_flush_snap(Inode *in, MetaSession *session,
3913 snapid_t follows, CapSnap& capsnap)
3914 {
3915 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3916 in->ino, in->snaprealm->ino, 0,
3917 in->auth_cap->mseq, cap_epoch_barrier);
3918 m->caller_uid = capsnap.cap_dirtier_uid;
3919 m->caller_gid = capsnap.cap_dirtier_gid;
3920
3921 m->set_client_tid(capsnap.flush_tid);
3922 m->head.snap_follows = follows;
3923
3924 m->head.caps = capsnap.issued;
3925 m->head.dirty = capsnap.dirty;
3926
3927 m->head.uid = capsnap.uid;
3928 m->head.gid = capsnap.gid;
3929 m->head.mode = capsnap.mode;
3930 m->btime = capsnap.btime;
3931
3932 m->size = capsnap.size;
3933
3934 m->head.xattr_version = capsnap.xattr_version;
3935 encode(capsnap.xattrs, m->xattrbl);
3936
3937 m->ctime = capsnap.ctime;
3938 m->btime = capsnap.btime;
3939 m->mtime = capsnap.mtime;
3940 m->atime = capsnap.atime;
3941 m->time_warp_seq = capsnap.time_warp_seq;
3942 m->change_attr = capsnap.change_attr;
3943
3944 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3945 m->inline_version = in->inline_version;
3946 m->inline_data = in->inline_data;
3947 }
3948
3949 ceph_assert(!session->flushing_caps_tids.empty());
3950 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3951
3952 session->con->send_message2(std::move(m));
3953 }
3954
3955 void Client::flush_snaps(Inode *in)
3956 {
3957 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3958 ceph_assert(in->cap_snaps.size());
3959
3960 // pick auth mds
3961 ceph_assert(in->auth_cap);
3962 MetaSession *session = in->auth_cap->session;
3963
3964 for (auto &p : in->cap_snaps) {
3965 CapSnap &capsnap = p.second;
3966 // only start flushes that haven't been sent yet
3967 if (capsnap.flush_tid > 0)
3968 continue;
3969
3970 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3971 << " follows " << p.first
3972 << " size " << capsnap.size
3973 << " mtime " << capsnap.mtime
3974 << " dirty_data=" << capsnap.dirty_data
3975 << " writing=" << capsnap.writing
3976 << " on " << *in << dendl;
3977 if (capsnap.dirty_data || capsnap.writing)
3978 break;
3979
3980 capsnap.flush_tid = ++last_flush_tid;
3981 session->flushing_caps_tids.insert(capsnap.flush_tid);
3982 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3983 if (!in->flushing_cap_item.is_on_list())
3984 session->flushing_caps.push_back(&in->flushing_cap_item);
3985
3986 send_flush_snap(in, session, p.first, capsnap);
3987 }
3988 }
3989
3990 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3991 {
3992 ceph::condition_variable cond;
3993 ls.push_back(&cond);
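// client_lock is held by the caller; adopt it so the condvar can drop
// it while waiting, then release() so the guard doesn't unlock it again
// when it goes out of scope.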
3994 std::unique_lock l{client_lock, std::adopt_lock};
3995 cond.wait(l);
3996 l.release();
3997 ls.remove(&cond);
3998 }
3999
4000 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4001 {
4002 for (auto cond : ls) {
4003 cond->notify_all();
4004 }
4005 }
4006
4007 void Client::wait_on_context_list(list<Context*>& ls)
4008 {
4009 ceph::condition_variable cond;
4010 bool done = false;
4011 int r;
4012 ls.push_back(new C_Cond(cond, &done, &r));
4013 std::unique_lock l{client_lock, std::adopt_lock};
4014 cond.wait(l, [&done] { return done;});
4015 l.release();
4016 }
4017
4018 void Client::signal_context_list(list<Context*>& ls)
4019 {
4020 while (!ls.empty()) {
4021 ls.front()->complete(0);
4022 ls.pop_front();
4023 }
4024 }
4025
4026 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4027 {
4028 for (const auto &cap : s->caps) {
4029 auto &in = cap->inode;
4030 if (reconnect) {
4031 in.requested_max_size = 0;
4032 in.wanted_max_size = 0;
4033 } else {
4034 if (cap->gen < s->cap_gen) {
4035 // mds did not re-issue stale cap.
4036 cap->issued = cap->implemented = CEPH_CAP_PIN;
4037 // make sure mds knows what we want.
4038 if (in.caps_file_wanted() & ~cap->wanted)
4039 in.flags |= I_CAP_DROPPED;
4040 }
4041 }
4042 signal_cond_list(in.waitfor_caps);
4043 }
4044 }
4045
4046
4047 // flush dirty data (from objectcache)
4048
4049 class C_Client_CacheInvalidate : public Context {
4050 private:
4051 Client *client;
4052 vinodeno_t ino;
4053 int64_t offset, length;
4054 public:
4055 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4056 client(c), offset(off), length(len) {
4057 if (client->use_faked_inos())
4058 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4059 else
4060 ino = in->vino();
4061 }
4062 void finish(int r) override {
4063 // _async_invalidate takes the lock when it needs to, so call it back from outside the lock.
4064 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4065 client->_async_invalidate(ino, offset, length);
4066 }
4067 };
4068
4069 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4070 {
4071 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4072 if (!mref_reader.is_state_satisfied())
4073 return;
4074
4075 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4076 ino_invalidate_cb(callback_handle, ino, off, len);
4077 }
4078
4079 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4080
4081 if (ino_invalidate_cb)
4082 // we queue the invalidate, which calls the callback and decrements the ref
4083 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4084 }
4085
4086 void Client::_invalidate_inode_cache(Inode *in)
4087 {
4088 ldout(cct, 10) << __func__ << " " << *in << dendl;
4089
4090 // invalidate our userspace inode cache
4091 if (cct->_conf->client_oc) {
4092 objectcacher->release_set(&in->oset);
4093 if (!objectcacher->set_is_empty(&in->oset))
4094 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4095 }
4096
4097 _schedule_invalidate_callback(in, 0, 0);
4098 }
4099
4100 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4101 {
4102 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4103
4104 // invalidate our userspace inode cache
4105 if (cct->_conf->client_oc) {
4106 vector<ObjectExtent> ls;
4107 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4108 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4109 }
4110
4111 _schedule_invalidate_callback(in, off, len);
4112 }
4113
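// Release cached data when nothing holds a Fc reference (it fails if
// dirty data remains); returns true on success, which check_caps uses
// to stop advertising Fc/Fl as in use.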
4114 bool Client::_release(Inode *in)
4115 {
4116 ldout(cct, 20) << "_release " << *in << dendl;
4117 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4118 _invalidate_inode_cache(in);
4119 return true;
4120 }
4121 return false;
4122 }
4123
4124 bool Client::_flush(Inode *in, Context *onfinish)
4125 {
4126 ldout(cct, 10) << "_flush " << *in << dendl;
4127
4128 if (!in->oset.dirty_or_tx) {
4129 ldout(cct, 10) << " nothing to flush" << dendl;
4130 onfinish->complete(0);
4131 return true;
4132 }
4133
4134 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4135 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4136 objectcacher->purge_set(&in->oset);
4137 if (onfinish) {
4138 onfinish->complete(-CEPHFS_ENOSPC);
4139 }
4140 return true;
4141 }
4142
4143 return objectcacher->flush_set(&in->oset, onfinish);
4144 }
4145
4146 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4147 {
4148 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4149 if (!in->oset.dirty_or_tx) {
4150 ldout(cct, 10) << " nothing to flush" << dendl;
4151 return;
4152 }
4153
4154 C_SaferCond onflush("Client::_flush_range flock");
4155 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4156 offset, size, &onflush);
4157 if (!ret) {
4158 // wait for flush
4159 client_lock.unlock();
4160 onflush.wait();
4161 client_lock.lock();
4162 }
4163 }
4164
4165 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4166 {
4167 // std::scoped_lock l(client_lock);
4168 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4169 Inode *in = static_cast<Inode *>(oset->parent);
4170 ceph_assert(in);
4171 _flushed(in);
4172 }
4173
4174 void Client::_flushed(Inode *in)
4175 {
4176 ldout(cct, 10) << "_flushed " << *in << dendl;
4177
4178 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4179 }
4180
4181
4182
4183 // checks common to add_update_cap, handle_cap_grant
4184 void Client::check_cap_issue(Inode *in, unsigned issued)
4185 {
4186 unsigned had = in->caps_issued();
4187
4188 if ((issued & CEPH_CAP_FILE_CACHE) &&
4189 !(had & CEPH_CAP_FILE_CACHE))
4190 in->cache_gen++;
4191
4192 if ((issued & CEPH_CAP_FILE_SHARED) !=
4193 (had & CEPH_CAP_FILE_SHARED)) {
4194 if (issued & CEPH_CAP_FILE_SHARED)
4195 in->shared_gen++;
4196 if (in->is_dir())
4197 clear_dir_complete_and_ordered(in, true);
4198 }
4199 }
4200
4201 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4202 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4203 inodeno_t realm, int flags, const UserPerm& cap_perms)
4204 {
4205 if (!in->is_any_caps()) {
4206 ceph_assert(in->snaprealm == 0);
4207 in->snaprealm = get_snap_realm(realm);
4208 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4209 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4210 } else {
4211 ceph_assert(in->snaprealm);
4212 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4213 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4214 in->snaprealm_item.remove_myself();
4215 auto oldrealm = in->snaprealm;
4216 in->snaprealm = get_snap_realm(realm);
4217 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4218 put_snap_realm(oldrealm);
4219 }
4220 }
4221
4222 mds_rank_t mds = mds_session->mds_num;
4223 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4224 Cap &cap = capem.first->second;
4225 if (!capem.second) {
4226 if (cap.gen < mds_session->cap_gen)
4227 cap.issued = cap.implemented = CEPH_CAP_PIN;
4228
4229 /*
4230 * auth mds of the inode changed. we received the cap export
4231 * message, but still haven't received the cap import message.
4232 * handle_cap_export() updated the new auth MDS' cap.
4233 *
4234 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4235 * a message that was sent before the cap import message. So
4236 * don't remove caps.
4237 */
4238 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4239 if (&cap != in->auth_cap)
4240 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4241
4242 ceph_assert(cap.cap_id == cap_id);
4243 seq = cap.seq;
4244 mseq = cap.mseq;
4245 issued |= cap.issued;
4246 flags |= CEPH_CAP_FLAG_AUTH;
4247 }
4248 } else {
4249 inc_pinned_icaps();
4250 }
4251
4252 check_cap_issue(in, issued);
4253
4254 if (flags & CEPH_CAP_FLAG_AUTH) {
4255 if (in->auth_cap != &cap &&
4256 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4257 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4258 ldout(cct, 10) << __func__ << " changing auth cap: "
4259 << "add myself to new auth MDS' flushing caps list" << dendl;
4260 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4261 }
4262 in->auth_cap = &cap;
4263 }
4264 }
4265
4266 unsigned old_caps = cap.issued;
4267 cap.cap_id = cap_id;
4268 cap.issued = issued;
4269 cap.implemented |= issued;
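// a newer migration seq means the MDS' view of 'wanted' was reset;
// otherwise accumulate the wanted bits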
4270 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4271 cap.wanted = wanted;
4272 else
4273 cap.wanted |= wanted;
4274 cap.seq = seq;
4275 cap.issue_seq = seq;
4276 cap.mseq = mseq;
4277 cap.gen = mds_session->cap_gen;
4278 cap.latest_perms = cap_perms;
4279 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4280 << " from mds." << mds
4281 << " on " << *in
4282 << dendl;
4283
4284 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4285 // is a non-auth MDS revoking the newly granted caps?
4286 for (auto &p : in->caps) {
4287 if (&p.second == &cap)
4288 continue;
4289 if (p.second.implemented & ~p.second.issued & issued) {
4290 check_caps(in, CHECK_CAPS_NODELAY);
4291 break;
4292 }
4293 }
4294 }
4295
4296 if (issued & ~old_caps)
4297 signal_cond_list(in->waitfor_caps);
4298 }
4299
4300 void Client::remove_cap(Cap *cap, bool queue_release)
4301 {
4302 auto &in = cap->inode;
4303 MetaSession *session = cap->session;
4304 mds_rank_t mds = cap->session->mds_num;
4305
4306 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4307
4308 if (queue_release) {
4309 session->enqueue_cap_release(
4310 in.ino,
4311 cap->cap_id,
4312 cap->issue_seq,
4313 cap->mseq,
4314 cap_epoch_barrier);
4315 } else {
4316 dec_pinned_icaps();
4317 }
4318
4319
4320 if (in.auth_cap == cap) {
4321 if (in.flushing_cap_item.is_on_list()) {
4322 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4323 in.flushing_cap_item.remove_myself();
4324 }
4325 in.auth_cap = NULL;
4326 }
4327 size_t n = in.caps.erase(mds);
4328 ceph_assert(n == 1);
4329 cap = nullptr;
4330
4331 if (!in.is_any_caps()) {
4332 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4333 in.snaprealm_item.remove_myself();
4334 put_snap_realm(in.snaprealm);
4335 in.snaprealm = 0;
4336 }
4337 }
4338
4339 void Client::remove_all_caps(Inode *in)
4340 {
4341 while (!in->caps.empty())
4342 remove_cap(&in->caps.begin()->second, true);
4343 }
4344
4345 void Client::remove_session_caps(MetaSession *s, int err)
4346 {
4347 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4348
4349 while (s->caps.size()) {
4350 Cap *cap = *s->caps.begin();
4351 InodeRef in(&cap->inode);
4352 bool dirty_caps = false;
4353 if (in->auth_cap == cap) {
4354 dirty_caps = in->dirty_caps | in->flushing_caps;
4355 in->wanted_max_size = 0;
4356 in->requested_max_size = 0;
4357 if (in->has_any_filelocks())
4358 in->flags |= I_ERROR_FILELOCK;
4359 }
4360 auto caps = cap->implemented;
4361 if (cap->wanted | cap->issued)
4362 in->flags |= I_CAP_DROPPED;
4363 remove_cap(cap, false);
4364 in->cap_snaps.clear();
4365 if (dirty_caps) {
4366 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4367 if (in->flushing_caps) {
4368 num_flushing_caps--;
4369 in->flushing_cap_tids.clear();
4370 }
4371 in->flushing_caps = 0;
4372 in->mark_caps_clean();
4373 put_inode(in.get());
4374 }
4375 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4376 if (caps && !in->caps_issued_mask(caps, true)) {
4377 if (err == -CEPHFS_EBLOCKLISTED) {
4378 if (in->oset.dirty_or_tx) {
4379 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4380 in->set_async_err(err);
4381 }
4382 objectcacher->purge_set(&in->oset);
4383 } else {
4384 objectcacher->release_set(&in->oset);
4385 }
4386 _schedule_invalidate_callback(in.get(), 0, 0);
4387 }
4388
4389 signal_cond_list(in->waitfor_caps);
4390 }
4391 s->flushing_caps_tids.clear();
4392 sync_cond.notify_all();
4393 }
4394
4395 int Client::_do_remount(bool retry_on_error)
4396 {
4397 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4398
4399 errno = 0;
4400 int r = remount_cb(callback_handle);
4401 if (r == 0) {
4402 retries_on_invalidate = 0;
4403 } else {
4404 int e = errno;
4405 client_t whoami = get_nodeid();
4406 if (r == -1) {
4407 lderr(cct) <<
4408 "failed to remount (to trim kernel dentries): "
4409 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4410 } else {
4411 lderr(cct) <<
4412 "failed to remount (to trim kernel dentries): "
4413 "return code = " << r << dendl;
4414 }
4415 bool should_abort =
4416 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4417 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4418 !(retry_on_error && (++retries_on_invalidate < max_retries));
4419 if (should_abort && !is_unmounting()) {
4420 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4421 ceph_abort();
4422 }
4423 }
4424 return r;
4425 }
4426
4427 class C_Client_Remount : public Context {
4428 private:
4429 Client *client;
4430 public:
4431 explicit C_Client_Remount(Client *c) : client(c) {}
4432 void finish(int r) override {
4433 ceph_assert(r == 0);
4434 client->_do_remount(true);
4435 }
4436 };
4437
4438 void Client::_invalidate_kernel_dcache()
4439 {
4440 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4441 if (!mref_reader.is_state_satisfied())
4442 return;
4443
4444 if (can_invalidate_dentries) {
4445 if (dentry_invalidate_cb && root->dir) {
4446 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4447 p != root->dir->dentries.end();
4448 ++p) {
4449 if (p->second->inode)
4450 _schedule_invalidate_dentry_callback(p->second, false);
4451 }
4452 }
4453 } else if (remount_cb) {
4454 // Hacky:
4455 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4456 remount_finisher.queue(new C_Client_Remount(this));
4457 }
4458 }
4459
4460 void Client::_trim_negative_child_dentries(InodeRef& in)
4461 {
4462 if (!in->is_dir())
4463 return;
4464
4465 Dir* dir = in->dir;
4466 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4467 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4468 Dentry *dn = p->second;
4469 ++p;
4470 ceph_assert(!dn->inode);
4471 if (dn->lru_is_expireable())
4472 unlink(dn, true, false); // keep dir, drop dentry
4473 }
4474 if (dir->dentries.empty()) {
4475 close_dir(dir);
4476 }
4477 }
4478
4479 if (in->flags & I_SNAPDIR_OPEN) {
4480 InodeRef snapdir = open_snapdir(in.get());
4481 _trim_negative_child_dentries(snapdir);
4482 }
4483 }
4484
4485 class C_Client_CacheRelease : public Context {
4486 private:
4487 Client *client;
4488 vinodeno_t ino;
4489 public:
4490 C_Client_CacheRelease(Client *c, Inode *in) :
4491 client(c) {
4492 if (client->use_faked_inos())
4493 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4494 else
4495 ino = in->vino();
4496 }
4497 void finish(int r) override {
4498 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4499 client->_async_inode_release(ino);
4500 }
4501 };
4502
4503 void Client::_async_inode_release(vinodeno_t ino)
4504 {
4505 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4506 if (!mref_reader.is_state_satisfied())
4507 return;
4508
4509 ldout(cct, 10) << __func__ << " " << ino << dendl;
4510 ino_release_cb(callback_handle, ino);
4511 }
4512
4513 void Client::_schedule_ino_release_callback(Inode *in) {
4514
4515 if (ino_release_cb)
4516 // we queue the release, which calls the callback and decrements the ref
4517 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4518 }
4519
4520 void Client::trim_caps(MetaSession *s, uint64_t max)
4521 {
4522 mds_rank_t mds = s->mds_num;
4523 size_t caps_size = s->caps.size();
4524 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4525 << " caps " << caps_size << dendl;
4526
4527 uint64_t trimmed = 0;
4528 auto p = s->caps.begin();
4529 std::set<Dentry *> to_trim; /* this prevents caps other than the one
4530 * we're looking at from getting deleted during traversal. */
4531 while ((caps_size - trimmed) > max && !p.end()) {
4532 Cap *cap = *p;
4533 InodeRef in(&cap->inode);
4534
4535 // Increment p early because it will be invalidated if cap
4536 // is deleted inside remove_cap
4537 ++p;
4538
4539 if (in->caps.size() > 1 && cap != in->auth_cap) {
4540 int mine = cap->issued | cap->implemented;
4541 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4542 // disposable non-auth cap
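// i.e. nothing we're currently using depends on caps that only this
// non-auth MDS grants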
4543 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4544 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4545 cap = (remove_cap(cap, true), nullptr);
4546 trimmed++;
4547 }
4548 } else {
4549 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4550 _trim_negative_child_dentries(in);
4551 bool all = true;
4552 auto q = in->dentries.begin();
4553 while (q != in->dentries.end()) {
4554 Dentry *dn = *q;
4555 ++q;
4556 if (dn->lru_is_expireable()) {
4557 if (can_invalidate_dentries &&
4558 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
4559 // Only issue one of these per DN for inodes in root: handle
4560 // others more efficiently by calling for root-child DNs at
4561 // the end of this function.
4562 _schedule_invalidate_dentry_callback(dn, true);
4563 }
4564 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4565 to_trim.insert(dn);
4566 } else {
4567 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4568 all = false;
4569 }
4570 }
4571 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
4572 _schedule_ino_release_callback(in.get());
4573 }
4574 if (all && in->ino != CEPH_INO_ROOT) {
4575 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4576 trimmed++;
4577 }
4578 }
4579 }
4580 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4581 for (const auto &dn : to_trim) {
4582 trim_dentry(dn);
4583 }
4584 to_trim.clear();
4585
4586 caps_size = s->caps.size();
4587 if (caps_size > (size_t)max)
4588 _invalidate_kernel_dcache();
4589 }
4590
4591 void Client::force_session_readonly(MetaSession *s)
4592 {
4593 s->readonly = true;
4594 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4595 auto &in = (*p)->inode;
4596 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4597 signal_cond_list(in.waitfor_caps);
4598 }
4599 }
4600
4601 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4602 {
4603 MetaSession *session = in->auth_cap->session;
4604
4605 int flushing = in->dirty_caps;
4606 ceph_assert(flushing);
4607
4608 ceph_tid_t flush_tid = ++last_flush_tid;
4609 in->flushing_cap_tids[flush_tid] = flushing;
4610
4611 if (!in->flushing_caps) {
4612 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4613 num_flushing_caps++;
4614 } else {
4615 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4616 }
4617
4618 in->flushing_caps |= flushing;
4619 in->mark_caps_clean();
4620
4621 if (!in->flushing_cap_item.is_on_list())
4622 session->flushing_caps.push_back(&in->flushing_cap_item);
4623 session->flushing_caps_tids.insert(flush_tid);
4624
4625 *ptid = flush_tid;
4626 return flushing;
4627 }
4628
4629 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4630 {
4631 for (auto &p : in->cap_snaps) {
4632 CapSnap &capsnap = p.second;
4633 if (capsnap.flush_tid > 0) {
4634 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4635 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4636 }
4637 }
4638 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4639 it != in->flushing_cap_tids.end();
4640 ++it) {
4641 old_s->flushing_caps_tids.erase(it->first);
4642 new_s->flushing_caps_tids.insert(it->first);
4643 }
4644 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4645 }
4646
4647 /*
4648 * Flush all caps back to the MDS. Because the callers generally wait on the
4649 * result of this function (syncfs and umount cases), we set
4650 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4651 */
4652 void Client::flush_caps_sync()
4653 {
4654 ldout(cct, 10) << __func__ << dendl;
4655 xlist<Inode*>::iterator p = delayed_list.begin();
4656 while (!p.end()) {
4657 unsigned flags = CHECK_CAPS_NODELAY;
4658 Inode *in = *p;
4659
4660 ++p;
4661 delayed_list.pop_front();
4662 if (p.end() && dirty_list.empty())
4663 flags |= CHECK_CAPS_SYNCHRONOUS;
4664 check_caps(in, flags);
4665 }
4666
4667 // other caps, too
4668 p = dirty_list.begin();
4669 while (!p.end()) {
4670 unsigned flags = CHECK_CAPS_NODELAY;
4671 Inode *in = *p;
4672
4673 ++p;
4674 if (p.end())
4675 flags |= CHECK_CAPS_SYNCHRONOUS;
4676 check_caps(in, flags);
4677 }
4678 }
4679
4680 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4681 {
4682 while (in->flushing_caps) {
4683 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4684 ceph_assert(it != in->flushing_cap_tids.end());
4685 if (it->first > want)
4686 break;
4687 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4688 << ccap_string(it->second) << " want " << want
4689 << " last " << it->first << dendl;
4690 wait_on_list(in->waitfor_caps);
4691 }
4692 }
4693
4694 void Client::wait_sync_caps(ceph_tid_t want)
4695 {
4696 retry:
4697 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4698 << num_flushing_caps << " total flushing)" << dendl;
4699 for (auto &p : mds_sessions) {
4700 MetaSession *s = &p.second;
4701 if (s->flushing_caps_tids.empty())
4702 continue;
4703 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4704 if (oldest_tid <= want) {
4705 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4706 << " (want " << want << ")" << dendl;
4707 std::unique_lock l{client_lock, std::adopt_lock};
4708 sync_cond.wait(l);
4709 l.release();
4710 goto retry;
4711 }
4712 }
4713 }
4714
4715 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4716 {
4717 in->flags &= ~I_KICK_FLUSH;
4718
4719 Cap *cap = in->auth_cap;
4720 ceph_assert(cap->session == session);
4721
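// Find the newest capsnap flush (tids recorded with value 0). Cap
// flushes with older tids must carry FLAG_PENDING_CAPSNAP so the MDS
// applies them before the snap flush.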
4722 ceph_tid_t last_snap_flush = 0;
4723 for (auto p = in->flushing_cap_tids.rbegin();
4724 p != in->flushing_cap_tids.rend();
4725 ++p) {
4726 if (!p->second) {
4727 last_snap_flush = p->first;
4728 break;
4729 }
4730 }
4731
4732 int wanted = in->caps_wanted();
4733 int used = get_caps_used(in) | in->caps_dirty();
4734 auto it = in->cap_snaps.begin();
4735 for (auto& p : in->flushing_cap_tids) {
4736 if (p.second) {
4737 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4738 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4739 p.second, p.first);
4740 } else {
4741 ceph_assert(it != in->cap_snaps.end());
4742 ceph_assert(it->second.flush_tid == p.first);
4743 send_flush_snap(in, session, it->first, it->second);
4744 ++it;
4745 }
4746 }
4747 }
4748
4749 void Client::kick_flushing_caps(MetaSession *session)
4750 {
4751 mds_rank_t mds = session->mds_num;
4752 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4753
4754 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4755 Inode *in = *p;
4756 if (in->flags & I_KICK_FLUSH) {
4757 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4758 kick_flushing_caps(in, session);
4759 }
4760 }
4761 }
4762
4763 void Client::early_kick_flushing_caps(MetaSession *session)
4764 {
4765 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4766 Inode *in = *p;
4767 Cap *cap = in->auth_cap;
4768 ceph_assert(cap);
4769
4770 // if flushing caps were revoked, we re-send the cap flush in the client
4771 // reconnect stage. This guarantees that the MDS processes the cap flush
4772 // message before issuing the flushing caps to other clients.
4773 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4774 in->flags |= I_KICK_FLUSH;
4775 continue;
4776 }
4777
4778 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4779 << " to mds." << session->mds_num << dendl;
4780 // send_reconnect() will also reset these sequence numbers. Make sure the
4781 // sequence numbers in the cap flush message match the later reconnect message.
4782 cap->seq = 0;
4783 cap->issue_seq = 0;
4784 cap->mseq = 0;
4785 cap->issued = cap->implemented;
4786
4787 kick_flushing_caps(in, session);
4788 }
4789 }
4790
4791 void SnapRealm::build_snap_context()
4792 {
4793 set<snapid_t> snaps;
4794 snapid_t max_seq = seq;
4795
4796 // start with prior_parents?
4797 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4798 snaps.insert(prior_parent_snaps[i]);
4799
4800 // current parent's snaps
4801 if (pparent) {
4802 const SnapContext& psnapc = pparent->get_snap_context();
4803 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4804 if (psnapc.snaps[i] >= parent_since)
4805 snaps.insert(psnapc.snaps[i]);
4806 if (psnapc.seq > max_seq)
4807 max_seq = psnapc.seq;
4808 }
4809
4810 // my snaps
4811 for (unsigned i=0; i<my_snaps.size(); i++)
4812 snaps.insert(my_snaps[i]);
4813
4814 // ok!
4815 cached_snap_context.seq = max_seq;
4816 cached_snap_context.snaps.resize(0);
4817 cached_snap_context.snaps.reserve(snaps.size());
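// SnapContext expects snaps in descending order, so walk the set in reverse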
4818 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4819 cached_snap_context.snaps.push_back(*p);
4820 }
4821
4822 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4823 {
4824 list<SnapRealm*> q;
4825 q.push_back(realm);
4826
4827 while (!q.empty()) {
4828 realm = q.front();
4829 q.pop_front();
4830
4831 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4832 realm->invalidate_cache();
4833
4834 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4835 p != realm->pchildren.end();
4836 ++p)
4837 q.push_back(*p);
4838 }
4839 }
4840
4841 SnapRealm *Client::get_snap_realm(inodeno_t r)
4842 {
4843 SnapRealm *realm = snap_realms[r];
4844 if (!realm)
4845 snap_realms[r] = realm = new SnapRealm(r);
4846 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4847 realm->nref++;
4848 return realm;
4849 }
4850
4851 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4852 {
4853 if (snap_realms.count(r) == 0) {
4854 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4855 return NULL;
4856 }
4857 SnapRealm *realm = snap_realms[r];
4858 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4859 realm->nref++;
4860 return realm;
4861 }
4862
4863 void Client::put_snap_realm(SnapRealm *realm)
4864 {
4865 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4866 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4867 if (--realm->nref == 0) {
4868 snap_realms.erase(realm->ino);
4869 if (realm->pparent) {
4870 realm->pparent->pchildren.erase(realm);
4871 put_snap_realm(realm->pparent);
4872 }
4873 delete realm;
4874 }
4875 }
4876
4877 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4878 {
4879 if (realm->parent != parent) {
4880 ldout(cct, 10) << __func__ << " " << *realm
4881 << " " << realm->parent << " -> " << parent << dendl;
4882 realm->parent = parent;
4883 if (realm->pparent) {
4884 realm->pparent->pchildren.erase(realm);
4885 put_snap_realm(realm->pparent);
4886 }
4887 realm->pparent = get_snap_realm(parent);
4888 realm->pparent->pchildren.insert(realm);
4889 return true;
4890 }
4891 return false;
4892 }
4893
4894 static bool has_new_snaps(const SnapContext& old_snapc,
4895 const SnapContext& new_snapc)
4896 {
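// snaps[] is descending, so snaps[0] is the newest snapid; any snap
// newer than the old context's seq is new to us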
4897 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4898 }
4899
4900
4901 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4902 {
4903 SnapRealm *first_realm = NULL;
4904 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4905
4906 map<SnapRealm*, SnapContext> dirty_realms;
4907
4908 auto p = bl.cbegin();
4909 while (!p.end()) {
4910 SnapRealmInfo info;
4911 decode(info, p);
4912 SnapRealm *realm = get_snap_realm(info.ino());
4913
4914 bool invalidate = false;
4915
4916 if (info.seq() > realm->seq) {
4917 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4918 << dendl;
4919
4920 if (flush) {
4921 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4922 // flush me + children
4923 list<SnapRealm*> q;
4924 q.push_back(realm);
4925 while (!q.empty()) {
4926 SnapRealm *realm = q.front();
4927 q.pop_front();
4928
4929 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4930 p != realm->pchildren.end();
4931 ++p)
4932 q.push_back(*p);
4933
4934 if (dirty_realms.count(realm) == 0) {
4935 realm->nref++;
4936 dirty_realms[realm] = realm->get_snap_context();
4937 }
4938 }
4939 }
4940
4941 // update
4942 realm->seq = info.seq();
4943 realm->created = info.created();
4944 realm->parent_since = info.parent_since();
4945 realm->prior_parent_snaps = info.prior_parent_snaps;
4946 realm->my_snaps = info.my_snaps;
4947 invalidate = true;
4948 }
4949
4950 // _always_ verify parent
4951 if (adjust_realm_parent(realm, info.parent()))
4952 invalidate = true;
4953
4954 if (invalidate) {
4955 invalidate_snaprealm_and_children(realm);
4956 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4957 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4958 } else {
4959 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4960 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4961 }
4962
4963 if (!first_realm)
4964 first_realm = realm;
4965 else
4966 put_snap_realm(realm);
4967 }
4968
4969 for (auto &[realm, snapc] : dirty_realms) {
4970 // are there new snaps?
4971 if (has_new_snaps(snapc, realm->get_snap_context())) {
4972 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4973 for (auto&& in : realm->inodes_with_caps) {
4974 queue_cap_snap(in, snapc);
4975 }
4976 } else {
4977 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4978 }
4979 put_snap_realm(realm);
4980 }
4981
4982 if (realm_ret)
4983 *realm_ret = first_realm;
4984 else
4985 put_snap_realm(first_realm);
4986 }
4987
4988 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4989 {
4990 ldout(cct, 10) << __func__ << " " << *m << dendl;
4991 mds_rank_t mds = mds_rank_t(m->get_source().num());
4992
4993 std::scoped_lock cl(client_lock);
4994 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4995 if (!session) {
4996 return;
4997 }
4998
4999 got_mds_push(session);
5000
5001 map<Inode*, SnapContext> to_move;
5002 SnapRealm *realm = 0;
5003
5004 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
5005 ceph_assert(m->head.split);
5006 SnapRealmInfo info;
5007 auto p = m->bl.cbegin();
5008 decode(info, p);
5009 ceph_assert(info.ino() == m->head.split);
5010
5011 // flush, then move, the inodes.
5012 realm = get_snap_realm(info.ino());
5013 ldout(cct, 10) << " splitting off " << *realm << dendl;
5014 for (auto& ino : m->split_inos) {
5015 vinodeno_t vino(ino, CEPH_NOSNAP);
5016 if (inode_map.count(vino)) {
5017 Inode *in = inode_map[vino];
5018 if (!in->snaprealm || in->snaprealm == realm)
5019 continue;
5020 if (in->snaprealm->created > info.created()) {
5021 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5022 << *in->snaprealm << dendl;
5023 continue;
5024 }
5025 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5026
5027
5028 in->snaprealm_item.remove_myself();
5029 to_move[in] = in->snaprealm->get_snap_context();
5030 put_snap_realm(in->snaprealm);
5031 }
5032 }
5033
5034 // move child snaprealms, too
5035 for (auto& child_realm : m->split_realms) {
5036 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5037 SnapRealm *child = get_snap_realm_maybe(child_realm);
5038 if (!child)
5039 continue;
5040 adjust_realm_parent(child, realm->ino);
5041 put_snap_realm(child);
5042 }
5043 }
5044
5045 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5046
5047 if (realm) {
5048 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5049 Inode *in = p->first;
5050 in->snaprealm = realm;
5051 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5052 realm->nref++;
5053 // queue for snap writeback
5054 if (has_new_snaps(p->second, realm->get_snap_context()))
5055 queue_cap_snap(in, p->second);
5056 }
5057 put_snap_realm(realm);
5058 }
5059 }
5060
5061 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5062 {
5063 mds_rank_t mds = mds_rank_t(m->get_source().num());
5064
5065 std::scoped_lock cl(client_lock);
5066 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5067 if (!session) {
5068 return;
5069 }
5070
5071 got_mds_push(session);
5072
5073 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5074
5075 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5076 if (inode_map.count(vino)) {
5077 Inode *in = inode_map[vino];
5079
5080 if (in) {
5081 in->quota = m->quota;
5082 in->rstat = m->rstat;
5083 }
5084 }
5085 }
5086
5087 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5088 {
5089 mds_rank_t mds = mds_rank_t(m->get_source().num());
5090
5091 std::scoped_lock cl(client_lock);
5092 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5093 if (!session) {
5094 return;
5095 }
5096
5097 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5098 // Pause RADOS operations until we see the required epoch
5099 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5100 }
5101
5102 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5103 // Record the barrier so that we will transmit it to MDS when releasing
5104 set_cap_epoch_barrier(m->osd_epoch_barrier);
5105 }
5106
5107 got_mds_push(session);
5108
5109 Inode *in;
5110 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5111 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5112 in = it->second;
5113 } else {
5114 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5115 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5116 session->enqueue_cap_release(
5117 m->get_ino(),
5118 m->get_cap_id(),
5119 m->get_seq(),
5120 m->get_mseq(),
5121 cap_epoch_barrier);
5122 } else {
5123 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5124 }
5125
5126 // in case the mds is waiting on e.g. a revocation
5127 flush_cap_releases();
5128 return;
5129 }
5130
5131 switch (m->get_op()) {
5132 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
5133 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
5134 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
5135 }
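// IMPORT deliberately falls through: once handle_cap_import() has
// installed the cap, the grant handling below applies this message's
// cap state to it.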
5136
5137 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5138 Cap &cap = in->caps.at(mds);
5139
5140 switch (m->get_op()) {
5141 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
5142 case CEPH_CAP_OP_IMPORT:
5143 case CEPH_CAP_OP_REVOKE:
5144 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
5145 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
5146 }
5147 } else {
5148 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5149 return;
5150 }
5151 }
5152
5153 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5154 {
5155 mds_rank_t mds = session->mds_num;
5156
5157 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5158 << " IMPORT from mds." << mds << dendl;
5159
5160 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5161 Cap *cap = NULL;
5162 UserPerm cap_perms;
5163 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5164 cap = &it->second;
5165 cap_perms = cap->latest_perms;
5166 }
5167
5168 // add/update it
5169 SnapRealm *realm = NULL;
5170 update_snap_trace(m->snapbl, &realm);
5171
5172 int issued = m->get_caps();
5173 int wanted = m->get_wanted();
5174 add_update_cap(in, session, m->get_cap_id(),
5175 issued, wanted, m->get_seq(), m->get_mseq(),
5176 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5177
5178 if (cap && cap->cap_id == m->peer.cap_id) {
5179 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5180 }
5181
5182 if (realm)
5183 put_snap_realm(realm);
5184
5185 if (in->auth_cap && in->auth_cap->session == session) {
5186 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5187 in->requested_max_size > m->get_max_size()) {
5188 in->requested_max_size = 0;
5189 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5190 }
5191 // reflush any/all caps (if we are now the auth_cap)
5192 kick_flushing_caps(in, session);
5193 }
5194 }
5195
5196 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5197 {
5198 mds_rank_t mds = session->mds_num;
5199
5200 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5201 << " EXPORT from mds." << mds << dendl;
5202
5203 auto it = in->caps.find(mds);
5204 if (it != in->caps.end()) {
5205 Cap &cap = it->second;
5206 if (cap.cap_id == m->get_cap_id()) {
5207 if (m->peer.cap_id) {
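// the cap migrated to another MDS: merge what we held into the peer's
// existing cap if we have one, otherwise create it from the export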
5208 const auto peer_mds = mds_rank_t(m->peer.mds);
5209 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
5210 auto it = in->caps.find(peer_mds);
5211 if (it != in->caps.end()) {
5212 Cap &tcap = it->second;
5213 if (tcap.cap_id == m->peer.cap_id &&
5214 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5215 tcap.cap_id = m->peer.cap_id;
5216 tcap.seq = m->peer.seq - 1;
5217 tcap.issue_seq = tcap.seq;
5218 tcap.issued |= cap.issued;
5219 tcap.implemented |= cap.issued;
5220 if (&cap == in->auth_cap)
5221 in->auth_cap = &tcap;
5222 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5223 adjust_session_flushing_caps(in, session, tsession);
5224 }
5225 } else {
5226 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
5227 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5228 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5229 cap.latest_perms);
5230 }
5231 } else {
5232 if (cap.wanted | cap.issued)
5233 in->flags |= I_CAP_DROPPED;
5234 }
5235
5236 remove_cap(&cap, false);
5237 }
5238 }
5239 }
5240
5241 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5242 {
5243 mds_rank_t mds = session->mds_num;
5244 ceph_assert(in->caps.count(mds));
5245
5246 ldout(cct, 10) << __func__ << " on ino " << *in
5247 << " size " << in->size << " -> " << m->get_size()
5248 << dendl;
5249
5250 int issued;
5251 in->caps_issued(&issued);
5252 issued |= in->caps_dirty();
5253 update_inode_file_size(in, issued, m->get_size(),
5254 m->get_truncate_seq(), m->get_truncate_size());
5255 }
5256
5257 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5258 {
5259 ceph_tid_t flush_ack_tid = m->get_client_tid();
5260 int dirty = m->get_dirty();
5261 int cleaned = 0;
5262 int flushed = 0;
5263
5264 auto it = in->flushing_cap_tids.begin();
5265 if (it->first < flush_ack_tid) {
5266 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5267 << " got unexpected flush ack tid " << flush_ack_tid
5268 << " expected is " << it->first << dendl;
5269 }
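// Flush acks arrive in tid order: everything up to and including
// flush_ack_tid is now stable (entries with a zero value are capsnap
// flushes, skipped here); bits still pending in later tids stay dirty.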
5270 for (; it != in->flushing_cap_tids.end(); ) {
5271 if (!it->second) {
5272 // cap snap
5273 ++it;
5274 continue;
5275 }
5276 if (it->first == flush_ack_tid)
5277 cleaned = it->second;
5278 if (it->first <= flush_ack_tid) {
5279 session->flushing_caps_tids.erase(it->first);
5280 in->flushing_cap_tids.erase(it++);
5281 ++flushed;
5282 continue;
5283 }
5284 cleaned &= ~it->second;
5285 if (!cleaned)
5286 break;
5287 ++it;
5288 }
5289
5290 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5291 << " cleaned " << ccap_string(cleaned) << " on " << *in
5292 << " with " << ccap_string(dirty) << dendl;
5293
5294 if (flushed) {
5295 signal_cond_list(in->waitfor_caps);
5296 if (session->flushing_caps_tids.empty() ||
5297 *session->flushing_caps_tids.begin() > flush_ack_tid)
5298 sync_cond.notify_all();
5299 }
5300
5301 if (!dirty) {
5302 in->cap_dirtier_uid = -1;
5303 in->cap_dirtier_gid = -1;
5304 }
5305
5306 if (!cleaned) {
5307 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5308 } else {
5309 if (in->flushing_caps) {
5310 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5311 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5312 in->flushing_caps &= ~cleaned;
5313 if (in->flushing_caps == 0) {
5314 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5315 num_flushing_caps--;
5316 if (in->flushing_cap_tids.empty())
5317 in->flushing_cap_item.remove_myself();
5318 }
5319 if (!in->caps_dirty())
5320 put_inode(in);
5321 }
5322 }
5323 }
5324
5325
5326 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5327 {
5328 ceph_tid_t flush_ack_tid = m->get_client_tid();
5329 mds_rank_t mds = session->mds_num;
5330 ceph_assert(in->caps.count(mds));
5331 snapid_t follows = m->get_snap_follows();
5332
5333 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5334 auto& capsnap = it->second;
5335 if (flush_ack_tid != capsnap.flush_tid) {
5336 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5337 } else {
5338 InodeRef tmp_ref(in);
5339 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5340 << " on " << *in << dendl;
5341 session->flushing_caps_tids.erase(capsnap.flush_tid);
5342 in->flushing_cap_tids.erase(capsnap.flush_tid);
5343 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5344 in->flushing_cap_item.remove_myself();
5345 in->cap_snaps.erase(it);
5346
5347 signal_cond_list(in->waitfor_caps);
5348 if (session->flushing_caps_tids.empty() ||
5349 *session->flushing_caps_tids.begin() > flush_ack_tid)
5350 sync_cond.notify_all();
5351 }
5352 } else {
5353 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5354 << " on " << *in << dendl;
5355 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHSNAP acks back
5356 }
5357 }
5358
5359 class C_Client_DentryInvalidate : public Context {
5360 private:
5361 Client *client;
5362 vinodeno_t dirino;
5363 vinodeno_t ino;
5364 string name;
5365 public:
5366 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5367 client(c), name(dn->name) {
5368 if (client->use_faked_inos()) {
5369 dirino.ino = dn->dir->parent_inode->faked_ino;
5370 if (del)
5371 ino.ino = dn->inode->faked_ino;
5372 } else {
5373 dirino = dn->dir->parent_inode->vino();
5374 if (del)
5375 ino = dn->inode->vino();
5376 }
5377 if (!del)
5378 ino.ino = inodeno_t();
5379 }
5380 void finish(int r) override {
5381 // _async_dentry_invalidate is responsible for its own locking
5382 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5383 client->_async_dentry_invalidate(dirino, ino, name);
5384 }
5385 };
5386
5387 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5388 {
5389 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5390 if (!mref_reader.is_state_satisfied())
5391 return;
5392
5393 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5394 << " in dir " << dirino << dendl;
5395 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5396 }
5397
5398 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5399 {
5400 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5401 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5402 }
5403
5404 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5405 {
5406 int ref = in->get_nref();
5407 ldout(cct, 5) << __func__ << " in " << *in << dendl;
5408
5409 if (in->dir && !in->dir->dentries.empty()) {
5410 for (auto p = in->dir->dentries.begin();
5411 p != in->dir->dentries.end(); ) {
5412 Dentry *dn = p->second;
5413 ++p;
5414 /* rmsnap removes the whole subtree, so we need to trim inodes
5415 * recursively. We don't need to invalidate dentries recursively,
5416 * because invalidating a directory dentry effectively invalidates
5417 * the whole subtree */
5418 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5419 _try_to_trim_inode(dn->inode.get(), false);
5420
5421 if (dn->lru_is_expireable())
5422 unlink(dn, true, false); // keep dir, drop dentry
5423 }
5424 if (in->dir->dentries.empty()) {
5425 close_dir(in->dir);
5426 --ref;
5427 }
5428 }
5429
5430 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
5431 InodeRef snapdir = open_snapdir(in);
5432 _try_to_trim_inode(snapdir.get(), false);
5433 --ref;
5434 }
5435
5436 if (ref > 1) {
5437 auto q = in->dentries.begin();
5438 while (q != in->dentries.end()) {
5439 Dentry *dn = *q;
5440 ++q;
5441 if (in->ll_ref > 0 && sched_inval) {
5442 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5443 // so in->dentries doesn't always reflect the state of kernel's dcache.
5444 _schedule_invalidate_dentry_callback(dn, true);
5445 }
5446 unlink(dn, true, true);
5447 }
5448 }
5449 }
5450
5451 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5452 {
5453 mds_rank_t mds = session->mds_num;
5454 int used = get_caps_used(in);
5455 int wanted = in->caps_wanted();
5456
5457 const unsigned new_caps = m->get_caps();
5458 const bool was_stale = session->cap_gen > cap->gen;
5459 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5460 << " mds." << mds << " seq " << m->get_seq()
5461 << " caps now " << ccap_string(new_caps)
5462 << " was " << ccap_string(cap->issued)
5463 << (was_stale ? " (stale)" : "") << dendl;
5464
5465 if (was_stale)
5466 cap->issued = cap->implemented = CEPH_CAP_PIN;
5467 cap->seq = m->get_seq();
5468 cap->gen = session->cap_gen;
5469
5470 check_cap_issue(in, new_caps);
5471
5472 // update inode
5473 int issued;
5474 in->caps_issued(&issued);
5475 issued |= in->caps_dirty();
5476
5477 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5478 !(issued & CEPH_CAP_AUTH_EXCL)) {
5479 in->mode = m->head.mode;
5480 in->uid = m->head.uid;
5481 in->gid = m->head.gid;
5482 in->btime = m->btime;
5483 }
5484 bool deleted_inode = false;
5485 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5486 !(issued & CEPH_CAP_LINK_EXCL)) {
5487 in->nlink = m->head.nlink;
5488 if (in->nlink == 0 &&
5489 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5490 deleted_inode = true;
5491 }
5492 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5493 m->xattrbl.length() &&
5494 m->head.xattr_version > in->xattr_version) {
5495 auto p = m->xattrbl.cbegin();
5496 decode(in->xattrs, p);
5497 in->xattr_version = m->head.xattr_version;
5498 }
5499
5500 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5501 in->dirstat.nfiles = m->get_nfiles();
5502 in->dirstat.nsubdirs = m->get_nsubdirs();
5503 }
5504
5505 if (new_caps & CEPH_CAP_ANY_RD) {
5506 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5507 m->get_ctime(), m->get_mtime(), m->get_atime());
5508 }
5509
5510 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5511 in->layout = m->get_layout();
5512 update_inode_file_size(in, issued, m->get_size(),
5513 m->get_truncate_seq(), m->get_truncate_size());
5514 }
5515
5516 if (m->inline_version > in->inline_version) {
5517 in->inline_data = m->inline_data;
5518 in->inline_version = m->inline_version;
5519 }
5520
5521 /* always take a newer change attr */
5522 if (m->get_change_attr() > in->change_attr)
5523 in->change_attr = m->get_change_attr();
5524
5525 // max_size
5526 if (cap == in->auth_cap &&
5527 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5528 (m->get_max_size() != in->max_size)) {
5529 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5530 in->max_size = m->get_max_size();
5531 if (in->max_size > in->wanted_max_size) {
5532 in->wanted_max_size = 0;
5533 in->requested_max_size = 0;
5534 }
5535 }
5536
5537 bool check = false;
5538 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5539 (wanted & ~(cap->wanted | new_caps))) {
5540 // If the MDS is importing the cap, prior cap messages that update 'wanted'
5541 // may get dropped by the MDS (migrate seq mismatch).
5542 //
5543 // We don't send a cap message to update 'wanted' if the caps we want are
5544 // already issued. If the MDS revokes caps, the cap message that releases
5545 // them also tells the MDS what we want. But if caps were forcibly revoked
5546 // by the MDS (stale session), we may not have told the MDS what we want.
5547 check = true;
5548 }
5549
5550
5551 // update caps
5552 auto revoked = cap->issued & ~new_caps;
5553 if (revoked) {
5554 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5555 cap->issued = new_caps;
5556 cap->implemented |= new_caps;
5557
5558 // recall delegations if we're losing caps necessary for them
5559 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5560 in->recall_deleg(false);
5561 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5562 in->recall_deleg(true);
5563
5564 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5565 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5566 !_flush(in, new C_Client_FlushComplete(this, in))) {
5567 // waitin' for flush
5568 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5569 if (_release(in))
5570 check = true;
5571 } else {
5572 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5573 check = true;
5574 }
5575 } else if (cap->issued == new_caps) {
5576 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5577 } else {
5578 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5579 cap->issued = new_caps;
5580 cap->implemented |= new_caps;
5581
5582 if (cap == in->auth_cap) {
5583 // is a non-auth MDS revoking the newly granted caps?
5584 for (const auto &p : in->caps) {
5585 if (&p.second == cap)
5586 continue;
5587 if (p.second.implemented & ~p.second.issued & new_caps) {
5588 check = true;
5589 break;
5590 }
5591 }
5592 }
5593 }
5594
5595 if (check)
5596 check_caps(in, 0);
5597
5598 // wake up waiters
5599 if (new_caps)
5600 signal_cond_list(in->waitfor_caps);
5601
5602 // may drop inode's last ref
5603 if (deleted_inode)
5604 _try_to_trim_inode(in, true);
5605 }
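// A minimal sketch of the grant/revoke arithmetic used above (hypothetical
// cap masks; ccap_string() renders them like "pAsLsXsFs"):
//
//   int issued   = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_CACHE;
//   int new_caps = CEPH_CAP_FILE_SHARED;
//   int revoked  = issued & ~new_caps;  // FILE_CACHE: must be released/flushed
//   int granted  = new_caps & ~issued;  // 0: nothing newly granted here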
5606
5607 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5608 {
5609 if (perms.uid() == 0) {
5610 // For root, exec is only allowed if at least one exec bit is set
5611 if ((want & MAY_EXEC) && !(in->mode & S_IXUGO))
5612 return -CEPHFS_EACCES;
5613 return 0;
5614 }
5615
5616 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5617 int ret = _posix_acl_permission(in, perms, want);
5618 if (ret != -CEPHFS_EAGAIN)
5619 return ret;
5620 }
5621
5622 // check permissions before doing anything else
5623 if (!in->check_mode(perms, want))
5624 return -CEPHFS_EACCES;
5625 return 0;
5626 }
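// e.g. even a uid-0 caller is denied MAY_EXEC on a mode-0644 file (no exec
// bit set anywhere), mirroring the kernel's permission check for root.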
5627
5628 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5629 const UserPerm& perms)
5630 {
5631 int r = _getattr_for_perm(in, perms);
5632 if (r < 0)
5633 goto out;
5634
5635 r = 0;
5636 if (strncmp(name, "system.", 7) == 0) {
5637 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5638 r = -CEPHFS_EPERM;
5639 } else {
5640 r = inode_permission(in, perms, want);
5641 }
5642 out:
5643 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5644 return r;
5645 }
5646
5647 ostream& operator<<(ostream &out, const UserPerm& perm) {
5648 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5649 return out;
5650 }
5651
5652 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5653 const UserPerm& perms)
5654 {
5655 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5656 int r = _getattr_for_perm(in, perms);
5657 if (r < 0)
5658 goto out;
5659
5660 if (mask & CEPH_SETATTR_SIZE) {
5661 r = inode_permission(in, perms, MAY_WRITE);
5662 if (r < 0)
5663 goto out;
5664 }
5665
5666 r = -CEPHFS_EPERM;
5667 if (mask & CEPH_SETATTR_UID) {
5668 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5669 goto out;
5670 }
5671 if (mask & CEPH_SETATTR_GID) {
5672 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5673 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5674 goto out;
5675 }
5676
5677 if (mask & CEPH_SETATTR_MODE) {
5678 if (perms.uid() != 0 && perms.uid() != in->uid)
5679 goto out;
5680
5681 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5682 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5683 stx->stx_mode &= ~S_ISGID;
5684 }
5685
5686 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5687 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5688 if (perms.uid() != 0 && perms.uid() != in->uid) {
5689 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5690 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5691 check_mask |= CEPH_SETATTR_MTIME;
5692 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5693 check_mask |= CEPH_SETATTR_ATIME;
5694 if (check_mask & mask) {
5695 goto out;
5696 } else {
5697 r = inode_permission(in, perms, MAY_WRITE);
5698 if (r < 0)
5699 goto out;
5700 }
5701 }
5702 }
5703 r = 0;
5704 out:
5705 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5706 return r;
5707 }
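// Illustrative outcomes of the ownership/time checks above for a caller with
// uid 1000 operating on a file owned by uid 2000 (hypothetical ids):
//   CEPH_SETATTR_UID or CEPH_SETATTR_GID     -> -CEPHFS_EPERM
//   CEPH_SETATTR_MTIME (explicit timestamp)  -> -CEPHFS_EPERM
//   CEPH_SETATTR_MTIME_NOW                   -> allowed iff MAY_WRITE passes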
5708
5709 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5710 {
5711 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5712 unsigned want = 0;
5713
5714 if ((flags & O_ACCMODE) == O_WRONLY)
5715 want = MAY_WRITE;
5716 else if ((flags & O_ACCMODE) == O_RDWR)
5717 want = MAY_READ | MAY_WRITE;
5718 else if ((flags & O_ACCMODE) == O_RDONLY)
5719 want = MAY_READ;
5720 if (flags & O_TRUNC)
5721 want |= MAY_WRITE;
5722
5723 int r = 0;
5724 switch (in->mode & S_IFMT) {
5725 case S_IFLNK:
5726 r = -CEPHFS_ELOOP;
5727 goto out;
5728 case S_IFDIR:
5729 if (want & MAY_WRITE) {
5730 r = -CEPHFS_EISDIR;
5731 goto out;
5732 }
5733 break;
5734 }
5735
5736 r = _getattr_for_perm(in, perms);
5737 if (r < 0)
5738 goto out;
5739
5740 r = inode_permission(in, perms, want);
5741 out:
5742 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5743 return r;
5744 }
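// The O_ACCMODE -> MAY_* mapping above, at a glance:
//   O_RDONLY -> MAY_READ, O_WRONLY -> MAY_WRITE, O_RDWR -> MAY_READ|MAY_WRITE,
//   and O_TRUNC always adds MAY_WRITE -- so e.g. open(..., O_RDONLY|O_TRUNC)
//   on a file the caller cannot write fails with -CEPHFS_EACCES.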
5745
5746 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5747 {
5748 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5749 int r = _getattr_for_perm(dir, perms);
5750 if (r < 0)
5751 goto out;
5752
5753 r = inode_permission(dir, perms, MAY_EXEC);
5754 out:
5755 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5756 return r;
5757 }
5758
5759 int Client::may_create(Inode *dir, const UserPerm& perms)
5760 {
5761 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5762 int r = _getattr_for_perm(dir, perms);
5763 if (r < 0)
5764 goto out;
5765
5766 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5767 out:
5768 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5769 return r;
5770 }
5771
5772 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5773 {
5774 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5775 int r = _getattr_for_perm(dir, perms);
5776 if (r < 0)
5777 goto out;
5778
5779 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5780 if (r < 0)
5781 goto out;
5782
5783 /* 'name == NULL' means rmsnap w/o permission checks */
5784 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5785 InodeRef otherin;
5786 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5787 if (r < 0)
5788 goto out;
5789 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5790 r = -CEPHFS_EPERM;
5791 }
5792 out:
5793 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5794 return r;
5795 }
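// The S_ISVTX branch above implements sticky-bit (/tmp-style) semantics: in
// a sticky directory, a non-root caller may delete an entry only if it owns
// the directory or the entry itself.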
5796
5797 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5798 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5799
5800 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5801 if (!mref_reader.is_state_satisfied())
5802 return -CEPHFS_ENOTCONN;
5803
5804 filepath path(relpath);
5805 string name = path.last_dentry();
5806 path.pop_dentry();
5807 InodeRef dir;
5808
5809 std::scoped_lock lock(client_lock);
5810 int r = path_walk(path, &dir, perms);
5811 if (r < 0)
5812 return r;
5813 if (cct->_conf->client_permissions) {
5814 int r = may_delete(dir.get(), name.c_str(), perms);
5815 if (r < 0)
5816 return r;
5817 }
5818
5819 return 0;
5820 }
5821
5822 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5823 {
5824 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5825 int r = _getattr_for_perm(in, perms);
5826 if (r < 0)
5827 goto out;
5828
5829 if (perms.uid() == 0 || perms.uid() == in->uid) {
5830 r = 0;
5831 goto out;
5832 }
5833
5834 r = -CEPHFS_EPERM;
5835 if (!S_ISREG(in->mode))
5836 goto out;
5837
5838 if (in->mode & S_ISUID)
5839 goto out;
5840
5841 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5842 goto out;
5843
5844 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5845 out:
5846 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5847 return r;
5848 }
5849
5850 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5851 {
5852 int mask = CEPH_STAT_CAP_MODE;
5853 bool force = false;
5854 if (acl_type != NO_ACL) {
5855 mask |= CEPH_STAT_CAP_XATTR;
5856 force = in->xattr_version == 0;
5857 }
5858 return _getattr(in, mask, perms, force);
5859 }
5860
5861 vinodeno_t Client::_get_vino(Inode *in)
5862 {
5863 /* The caller must hold the client lock */
5864 return vinodeno_t(in->ino, in->snapid);
5865 }
5866
5867 /**
5868 * Resolve an MDS spec to a list of MDS daemon GIDs.
5869 *
5870 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5871 * It may be '*', in which case it matches all GIDs.
5872 *
5873 * If no error is returned, the `targets` vector will be populated with at least
5874 * one MDS.
5875 */
5876 int Client::resolve_mds(
5877 const std::string &mds_spec,
5878 std::vector<mds_gid_t> *targets)
5879 {
5880 ceph_assert(fsmap);
5881 ceph_assert(targets != nullptr);
5882
5883 mds_role_t role;
5884 CachedStackStringStream css;
5885 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5886 if (role_r == 0) {
5887 // We got a role, resolve it to a GID
5888 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5889 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5890 << role << "' aka " << info.human_name() << dendl;
5891 targets->push_back(info.global_id);
5892 return 0;
5893 }
5894
5895 std::string strtol_err;
5896 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5897 if (strtol_err.empty()) {
5898 // It is a possible GID
5899 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5900 if (fsmap->gid_exists(mds_gid)) {
5901 auto& info = fsmap->get_info_gid(mds_gid);
5902 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5903 << info.human_name() << dendl;
5904 targets->push_back(mds_gid);
5905 return 0;
5906 } else {
5907 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5908 << dendl;
5909 lderr(cct) << "FSMap: " << *fsmap << dendl;
5910 return -CEPHFS_ENOENT;
5911 }
5912 } else if (mds_spec == "*") {
5913 // It is a wildcard: use all MDSs
5914 const auto& mds_info = fsmap->get_mds_info();
5915
5916 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5917 if (mds_info.empty()) {
5918 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5919 lderr(cct) << "FSMap: " << *fsmap << dendl;
5920 return -CEPHFS_ENOENT;
5921 }
5922
5923 for (const auto& [gid, info] : mds_info) {
5924 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
5925 targets->push_back(gid);
5926 }
5927 return 0;
5928 } else {
5929 // It did not parse as an integer, it is not a wildcard, it must be a name
5930 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5931 if (mds_gid == 0) {
5932 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
5933 lderr(cct) << "FSMap: " << *fsmap << dendl;
5934 return -CEPHFS_ENOENT;
5935 } else {
5936 auto& info = fsmap->get_info_gid(mds_gid);
5937 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
5938 << "' to " << info.human_name() << dendl;
5939 targets->push_back(mds_gid);
5940 }
5941 return 0;
5942 }
5943 }
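// Examples of specs resolve_mds() accepts (daemon names/ids hypothetical):
//   "0"      -> rank 0 of the default filesystem
//   "myfs:0" -> rank 0 of filesystem "myfs"
//   "4115"   -> the daemon with GID 4115
//   "a"      -> the daemon named "a" (via find_mds_gid_by_name)
//   "*"      -> every daemon in the FSMap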
5944
5945
5946 /**
5947 * Authenticate with mon and establish global ID
5948 */
5949 int Client::authenticate()
5950 {
5951 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5952
5953 if (monclient->is_authenticated()) {
5954 return 0;
5955 }
5956
5957 client_lock.unlock();
5958 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5959 client_lock.lock();
5960 if (r < 0) {
5961 return r;
5962 }
5963
5964 whoami = monclient->get_global_id();
5965 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5966
5967 return 0;
5968 }
5969
5970 int Client::fetch_fsmap(bool user)
5971 {
5972 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5973
5974 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5975 // rather than MDSMap because no single MDSMap contains all the daemons, and
5976 // a `tell` can address any daemon.
5977 version_t fsmap_latest;
5978 bs::error_code ec;
5979 do {
5980 client_lock.unlock();
5981 std::tie(fsmap_latest, std::ignore) =
5982 monclient->get_version("fsmap", ca::use_blocked[ec]);
5983 client_lock.lock();
5984 } while (ec == bs::errc::resource_unavailable_try_again);
5985
5986 if (ec) {
5987 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
5988 return ceph::from_error_code(ec);
5989 }
5990
5991 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5992
5993 if (user) {
5994 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5995 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5996 monclient->renew_subs();
5997 wait_on_list(waiting_for_fsmap);
5998 }
5999 ceph_assert(fsmap_user);
6000 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
6001 } else {
6002 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
6003 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
6004 monclient->renew_subs();
6005 wait_on_list(waiting_for_fsmap);
6006 }
6007 ceph_assert(fsmap);
6008 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
6009 }
6010 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
6011 << fsmap_latest << dendl;
6012 return 0;
6013 }
6014
6015 /**
6016 * Send a command to one or more MDS daemons.
6017 *
6018 * @mds_spec one of ID, rank, GID, "*"
6019 */
6020 int Client::mds_command(
6021 const std::string &mds_spec,
6022 const vector<string>& cmd,
6023 const bufferlist& inbl,
6024 bufferlist *outbl,
6025 string *outs,
6026 Context *onfinish)
6027 {
6028 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6029 if (!iref_reader.is_state_satisfied())
6030 return -CEPHFS_ENOTCONN;
6031
6032 std::unique_lock cl(client_lock);
6033
6034 int r;
6035 r = authenticate();
6036 if (r < 0) {
6037 return r;
6038 }
6039
6040 r = fetch_fsmap(false);
6041 if (r < 0) {
6042 return r;
6043 }
6044
6045 // Look up MDS target(s) of the command
6046 std::vector<mds_gid_t> targets;
6047 r = resolve_mds(mds_spec, &targets);
6048 if (r < 0) {
6049 return r;
6050 }
6051
6052 // If daemons are laggy, we won't send them commands. If all
6053 // are laggy then we fail.
6054 std::vector<mds_gid_t> non_laggy;
6055 for (const auto& gid : targets) {
6056 const auto info = fsmap->get_info_gid(gid);
6057 if (!info.laggy()) {
6058 non_laggy.push_back(gid);
6059 }
6060 }
6061 if (non_laggy.empty()) {
6062 *outs = "All targeted MDS daemons are laggy";
6063 return -CEPHFS_ENOENT;
6064 }
6065
6066 if (metadata.empty()) {
6067 // We are called on an unmounted client, so metadata
6068 // won't be initialized yet.
6069 populate_metadata("");
6070 }
6071
6072 // Send commands to targets
6073 C_GatherBuilder gather(cct, onfinish);
6074 for (const auto& target_gid : non_laggy) {
6075 const auto info = fsmap->get_info_gid(target_gid);
6076
6077 // Open a connection to the target MDS
6078 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6079
6080 cl.unlock();
6081 {
6082 std::scoped_lock cmd_lock(command_lock);
6083 // Generate MDSCommandOp state
6084 auto &op = command_table.start_command();
6085
6086 op.on_finish = gather.new_sub();
6087 op.cmd = cmd;
6088 op.outbl = outbl;
6089 op.outs = outs;
6090 op.inbl = inbl;
6091 op.mds_gid = target_gid;
6092 op.con = conn;
6093
6094 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6095 << " tid=" << op.tid << cmd << dendl;
6096
6097 // Construct and send MCommand
6098 MessageRef m = op.get_message(monclient->get_fsid());
6099 conn->send_message2(std::move(m));
6100 }
6101 cl.lock();
6102 }
6103 gather.activate();
6104
6105 return 0;
6106 }
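// A minimal caller sketch (hedged: `client` is a hypothetical Client
// instance, the JSON command string is illustrative, and C_SaferCond comes
// from common/Cond.h, which is already included):
//
//   bufferlist inbl, outbl;
//   std::string outs;
//   C_SaferCond cond;
//   int r = client->mds_command("*", {"{\"prefix\": \"session ls\"}"},
//                               inbl, &outbl, &outs, &cond);
//   if (r == 0)
//     r = cond.wait();  // fires once all targeted MDSs have replied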
6107
6108 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6109 {
6110 ceph_tid_t const tid = m->get_tid();
6111
6112 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6113
6114 std::scoped_lock cmd_lock(command_lock);
6115 if (!command_table.exists(tid)) {
6116 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6117 return;
6118 }
6119
6120 auto &op = command_table.get_command(tid);
6121 if (op.outbl) {
6122 *op.outbl = m->get_data();
6123 }
6124 if (op.outs) {
6125 *op.outs = m->rs;
6126 }
6127
6128 if (op.on_finish) {
6129 op.on_finish->complete(m->r);
6130 }
6131
6132 command_table.erase(tid);
6133 }
6134
6135 // -------------------
6136 // MOUNT
6137
6138 int Client::subscribe_mdsmap(const std::string &fs_name)
6139 {
6140 int r = authenticate();
6141 if (r < 0) {
6142 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6143 return r;
6144 }
6145
6146 std::string resolved_fs_name;
6147 if (fs_name.empty()) {
6148 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6149 if (resolved_fs_name.empty())
6150 // Try the backwards compatibility fs name option
6151 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6152 } else {
6153 resolved_fs_name = fs_name;
6154 }
6155
6156 std::string want = "mdsmap";
6157 if (!resolved_fs_name.empty()) {
6158 r = fetch_fsmap(true);
6159 if (r < 0)
6160 return r;
6161 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6162 if (fscid == FS_CLUSTER_ID_NONE) {
6163 return -CEPHFS_ENOENT;
6164 }
6165
6166 std::ostringstream oss;
6167 oss << want << "." << fscid;
6168 want = oss.str();
6169 }
6170 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6171
6172 monclient->sub_want(want, 0, 0);
6173 monclient->renew_subs();
6174
6175 return 0;
6176 }
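// e.g. if resolved_fs_name "cephfs" maps to fscid 1, the subscription key
// built above is "mdsmap.1"; with no fs name configured it stays "mdsmap".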
6177
6178 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6179 bool require_mds, const std::string &fs_name)
6180 {
6181 ceph_assert(is_initialized());
6182
6183 /*
6184 * Make sure that _unmount() waits until this mount()
6185 * is done.
6186 */
6187 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6188 if (!mref_writer.is_first_writer()) // already mounting or mounted
6189 return 0;
6190
6191 std::unique_lock cl(client_lock);
6192
6193 int r = subscribe_mdsmap(fs_name);
6194 if (r < 0) {
6195 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6196 return r;
6197 }
6198
6199 start_tick_thread(); // start tick thread
6200
6201 if (require_mds) {
6202 while (1) {
6203 auto availability = mdsmap->is_cluster_available();
6204 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6205 // Error out
6206 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6207 return CEPH_FUSE_NO_MDS_UP;
6208 } else if (availability == MDSMap::AVAILABLE) {
6209 // Continue to mount
6210 break;
6211 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6212 // Else, wait. MDSMonitor will update the map to bring
6213 // us to a conclusion eventually.
6214 wait_on_list(waiting_for_mdsmap);
6215 } else {
6216 // Unexpected value!
6217 ceph_abort();
6218 }
6219 }
6220 }
6221
6222 populate_metadata(mount_root.empty() ? "/" : mount_root);
6223
6224 filepath fp(CEPH_INO_ROOT);
6225 if (!mount_root.empty()) {
6226 fp = filepath(mount_root.c_str());
6227 }
6228 while (true) {
6229 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6230 req->set_filepath(fp);
6231 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6232 int res = make_request(req, perms);
6233 if (res < 0) {
6234 if (res == -CEPHFS_EACCES && root) {
6235 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6236 break;
6237 }
6238 return res;
6239 }
6240
6241 if (fp.depth())
6242 fp.pop_dentry();
6243 else
6244 break;
6245 }
6246
6247 ceph_assert(root);
6248 _ll_get(root.get());
6249
6250 // trace?
6251 if (!cct->_conf->client_trace.empty()) {
6252 traceout.open(cct->_conf->client_trace.c_str());
6253 if (traceout.is_open()) {
6254 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6255 } else {
6256 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6257 }
6258 }
6259
6260 /*
6261 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6262 ldout(cct, 3) << "op: struct stat st;" << dendl;
6263 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6264 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6265 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6266 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6267 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6268 ldout(cct, 3) << "op: int fd;" << dendl;
6269 */
6270
6271 mref_writer.update_state(CLIENT_MOUNTED);
6272 return 0;
6273 }
6274
6275 // UNMOUNT
6276
6277 void Client::_close_sessions()
6278 {
6279 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6280 if (it->second.state == MetaSession::STATE_REJECTED)
6281 mds_sessions.erase(it++);
6282 else
6283 ++it;
6284 }
6285
6286 while (!mds_sessions.empty()) {
6287 // send session closes!
6288 for (auto &p : mds_sessions) {
6289 if (p.second.state != MetaSession::STATE_CLOSING) {
6290 _close_mds_session(&p.second);
6291 mds_ranks_closing.insert(p.first);
6292 }
6293 }
6294
6295 // wait for sessions to close
6296 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6297 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6298 << timo << "s)" << dendl;
6299 std::unique_lock l{client_lock, std::adopt_lock};
6300 if (!timo) {
6301 mount_cond.wait(l);
6302 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6303 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6304 while (!mds_ranks_closing.empty()) {
6305 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6306 // this prunes entry from mds_sessions and mds_ranks_closing
6307 _closed_mds_session(&session, -CEPHFS_ETIMEDOUT);
6308 }
6309 }
6310
6311 mds_ranks_closing.clear();
6312 l.release();
6313 }
6314 }
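// e.g. with client_shutdown_timeout = 30 the wait above gives each round of
// session closes 30s before force-closing stragglers with -CEPHFS_ETIMEDOUT;
// a timeout of 0 waits indefinitely.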
6315
6316 void Client::flush_mdlog_sync()
6317 {
6318 if (mds_requests.empty())
6319 return;
6320 for (auto &p : mds_sessions) {
6321 flush_mdlog(&p.second);
6322 }
6323 }
6324
6325 void Client::flush_mdlog(MetaSession *session)
6326 {
6327 // Only send this to Luminous or newer MDS daemons; older daemons
6328 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6329 const uint64_t features = session->con->get_features();
6330 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6331 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6332 session->con->send_message2(std::move(m));
6333 }
6334 }
6335
6336
6337 void Client::_abort_mds_sessions(int err)
6338 {
6339 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6340 auto req = p->second;
6341 ++p;
6342 // unsafe requests will be removed during close session below.
6343 if (req->got_unsafe)
6344 continue;
6345
6346 req->abort(err);
6347 if (req->caller_cond) {
6348 req->kick = true;
6349 req->caller_cond->notify_all();
6350 }
6351 }
6352
6353 // Process aborts on any requests that were on this waitlist.
6354 // Any requests that were on a waiting_for_open session waitlist
6355 // will get kicked during close session below.
6356 signal_cond_list(waiting_for_mdsmap);
6357
6358 // Force-close all sessions
6359 while(!mds_sessions.empty()) {
6360 auto& session = mds_sessions.begin()->second;
6361 _closed_mds_session(&session, err);
6362 }
6363 }
6364
6365 void Client::_unmount(bool abort)
6366 {
6367 /*
6368 * We are unmounting the client.
6369 *
6370 * Just set the state to CLIENT_UNMOUNTING to block and fail
6371 * any newly arriving "readers", then wait for all the in-flight
6372 * "readers" to finish.
6373 */
6374 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6375 if (!mref_writer.is_first_writer())
6376 return;
6377 mref_writer.wait_readers_done();
6378
6379 std::unique_lock lock{client_lock};
6380
6381 if (abort || blocklisted) {
6382 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6383 } else {
6384 ldout(cct, 2) << "unmounting" << dendl;
6385 }
6386
6387 deleg_timeout = 0;
6388
6389 if (abort) {
6390 mount_aborted = true;
6391 // Abort all mds sessions
6392 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6393
6394 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6395 } else {
6396 // flush the mdlog for pending requests, if any
6397 flush_mdlog_sync();
6398 }
6399
6400 mount_cond.wait(lock, [this] {
6401 if (!mds_requests.empty()) {
6402 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6403 << dendl;
6404 }
6405 return mds_requests.empty();
6406 });
6407
6408 cwd.reset();
6409 root.reset();
6410
6411 // clean up any unclosed files
6412 while (!fd_map.empty()) {
6413 Fh *fh = fd_map.begin()->second;
6414 fd_map.erase(fd_map.begin());
6415 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6416 _release_fh(fh);
6417 }
6418
6419 while (!ll_unclosed_fh_set.empty()) {
6420 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6421 Fh *fh = *it;
6422 ll_unclosed_fh_set.erase(fh);
6423 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6424 _release_fh(fh);
6425 }
6426
6427 while (!opened_dirs.empty()) {
6428 dir_result_t *dirp = *opened_dirs.begin();
6429 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6430 _closedir(dirp);
6431 }
6432
6433 _ll_drop_pins();
6434
6435 if (cct->_conf->client_oc) {
6436 // flush/release all buffered data
6437 std::list<InodeRef> anchor;
6438 for (auto& p : inode_map) {
6439 Inode *in = p.second;
6440 if (!in) {
6441 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6442 ceph_assert(in);
6443 }
6444
6445 // prevent inode from getting freed
6446 anchor.emplace_back(in);
6447
6448 if (abort || blocklisted) {
6449 objectcacher->purge_set(&in->oset);
6450 } else if (!in->caps.empty()) {
6451 _release(in);
6452 _flush(in, new C_Client_FlushComplete(this, in));
6453 }
6454 }
6455 }
6456
6457 if (abort || blocklisted) {
6458 for (auto p = dirty_list.begin(); !p.end(); ) {
6459 Inode *in = *p;
6460 ++p;
6461 if (in->dirty_caps) {
6462 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6463 in->mark_caps_clean();
6464 put_inode(in);
6465 }
6466 }
6467 } else {
6468 flush_caps_sync();
6469 wait_sync_caps(last_flush_tid);
6470 }
6471
6472 // empty lru cache
6473 trim_cache();
6474
6475 delay_put_inodes();
6476
6477 while (lru.lru_get_size() > 0 ||
6478 !inode_map.empty()) {
6479 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6480 << "+" << inode_map.size() << " items"
6481 << ", waiting (for caps to release?)"
6482 << dendl;
6483
6484 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6485 r == std::cv_status::timeout) {
6486 dump_cache(NULL);
6487 }
6488 }
6489 ceph_assert(lru.lru_get_size() == 0);
6490 ceph_assert(inode_map.empty());
6491
6492 // stop tracing
6493 if (!cct->_conf->client_trace.empty()) {
6494 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6495 traceout.close();
6496 }
6497
6498 // stop the tick thread
6499 tick_thread_stopped = true;
6500 upkeep_cond.notify_one();
6501
6502 _close_sessions();
6503
6504 mref_writer.update_state(CLIENT_UNMOUNTED);
6505
6506 ldout(cct, 2) << "unmounted." << dendl;
6507 }
6508
6509 void Client::unmount()
6510 {
6511 _unmount(false);
6512 }
6513
6514 void Client::abort_conn()
6515 {
6516 _unmount(true);
6517 }
6518
6519 void Client::flush_cap_releases()
6520 {
6521 uint64_t nr_caps = 0;
6522
6523 // send any cap releases
6524 for (auto &p : mds_sessions) {
6525 auto &session = p.second;
6526 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6527 p.first)) {
6528 nr_caps += session.release->caps.size();
6529 if (cct->_conf->client_inject_release_failure) {
6530 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6531 } else {
6532 session.con->send_message2(std::move(session.release));
6533 }
6534 session.release.reset();
6535 }
6536 }
6537
6538 if (nr_caps > 0) {
6539 dec_pinned_icaps(nr_caps);
6540 }
6541 }
6542
6543 void Client::renew_and_flush_cap_releases()
6544 {
6545 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6546
6547 if (!mount_aborted && mdsmap->get_epoch()) {
6548 // renew caps?
6549 utime_t el = ceph_clock_now() - last_cap_renew;
6550 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6551 renew_caps();
6552
6553 flush_cap_releases();
6554 }
6555 }
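// e.g. with an MDS session timeout of 60s, caps are renewed once more than
// 20s (timeout / 3) have passed since the last renewal.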
6556
6557 void Client::tick()
6558 {
6559 ldout(cct, 20) << "tick" << dendl;
6560
6561 utime_t now = ceph_clock_now();
6562
6563 /*
6564 * If mount() has not finished, abort requests stuck past the mount timeout.
6565 */
6566 if (is_mounting() && !mds_requests.empty()) {
6567 MetaRequest *req = mds_requests.begin()->second;
6568
6569 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6570 req->abort(-CEPHFS_ETIMEDOUT);
6571 if (req->caller_cond) {
6572 req->kick = true;
6573 req->caller_cond->notify_all();
6574 }
6575 signal_cond_list(waiting_for_mdsmap);
6576 for (auto &p : mds_sessions) {
6577 signal_context_list(p.second.waiting_for_open);
6578 }
6579 }
6580 }
6581
6582 renew_and_flush_cap_releases();
6583
6584 // delayed caps
6585 xlist<Inode*>::iterator p = delayed_list.begin();
6586 while (!p.end()) {
6587 Inode *in = *p;
6588 ++p;
6589 if (!mount_aborted && in->hold_caps_until > now)
6590 break;
6591 delayed_list.pop_front();
6592 if (!mount_aborted)
6593 check_caps(in, CHECK_CAPS_NODELAY);
6594 }
6595
6596 if (!mount_aborted)
6597 collect_and_send_metrics();
6598
6599 delay_put_inodes(is_unmounting());
6600 trim_cache(true);
6601
6602 if (blocklisted && (is_mounted() || is_unmounting()) &&
6603 last_auto_reconnect + 30 * 60 < now &&
6604 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6605 messenger->client_reset();
6606 fd_gen++; // invalidate open files
6607 blocklisted = false;
6608 _kick_stale_sessions();
6609 last_auto_reconnect = now;
6610 }
6611 }
6612
6613 void Client::start_tick_thread()
6614 {
6615 upkeeper = std::thread([this]() {
6616 using time = ceph::coarse_mono_time;
6617 using sec = std::chrono::seconds;
6618
6619 auto last_tick = time::min();
6620
6621 std::unique_lock cl(client_lock);
6622 while (!tick_thread_stopped) {
6623 auto now = clock::now();
6624 auto since = now - last_tick;
6625
6626 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6627 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6628
6629 auto interval = std::max(t_interval, d_interval);
6630 if (likely(since >= interval*.90)) {
6631 tick();
6632 last_tick = clock::now();
6633 } else {
6634 interval -= since;
6635 }
6636
6637 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6638 if (!tick_thread_stopped)
6639 upkeep_cond.wait_for(cl, interval);
6640 }
6641 });
6642 }
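// Cadence sketch: with client_tick_interval = 20s (an example value), tick()
// runs when at least 18s (90% of the interval) have elapsed since the last
// tick; otherwise the thread sleeps only for the remainder.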
6643
6644 void Client::collect_and_send_metrics() {
6645 ldout(cct, 20) << __func__ << dendl;
6646
6647 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6648
6649 // right now, we only track and send global metrics. it's sufficient
6650 // to send these metrics to MDS rank 0.
6651 collect_and_send_global_metrics();
6652 }
6653
6654 void Client::collect_and_send_global_metrics() {
6655 ldout(cct, 20) << __func__ << dendl;
6656 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6657
6658 if (!have_open_session((mds_rank_t)0)) {
6659 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6660 << dendl;
6661 return;
6662 }
6663 auto session = _get_or_open_mds_session((mds_rank_t)0);
6664 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6665 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6666 return;
6667 }
6668
6669 ClientMetricMessage metric;
6670 std::vector<ClientMetricMessage> message;
6671
6672 // read latency
6673 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6674 message.push_back(metric);
6675
6676 // write latency
6677 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6678 message.push_back(metric);
6679
6680 // metadata latency
6681 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6682 message.push_back(metric);
6683
6684 // cap hit ratio -- nr_caps is unused right now
6685 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6686 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6687 message.push_back(metric);
6688
6689 // dentry lease hit ratio
6690 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6691 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6692 message.push_back(metric);
6693
6694 // opened files
6695 {
6696 auto [opened_files, total_inodes] = get_opened_files_rates();
6697 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6698 }
6699 message.push_back(metric);
6700
6701 // pinned i_caps
6702 {
6703 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6704 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6705 }
6706 message.push_back(metric);
6707
6708 // opened inodes
6709 {
6710 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6711 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6712 }
6713 message.push_back(metric);
6714
6715 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6716 }
6717
6718 void Client::renew_caps()
6719 {
6720 ldout(cct, 10) << "renew_caps()" << dendl;
6721 last_cap_renew = ceph_clock_now();
6722
6723 for (auto &p : mds_sessions) {
6724 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6725 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6726 renew_caps(&p.second);
6727 }
6728 }
6729
6730 void Client::renew_caps(MetaSession *session)
6731 {
6732 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6733 session->last_cap_renew_request = ceph_clock_now();
6734 uint64_t seq = ++session->cap_renew_seq;
6735 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6736 }
6737
6738
6739 // ===============================================================
6740 // high level (POSIXy) interface
6741
6742 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6743 InodeRef *target, const UserPerm& perms)
6744 {
6745 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6746 MetaRequest *req = new MetaRequest(op);
6747 filepath path;
6748 dir->make_nosnap_relative_path(path);
6749 path.push_dentry(name);
6750 req->set_filepath(path);
6751 req->set_inode(dir);
6752 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6753 mask |= DEBUG_GETATTR_CAPS;
6754 req->head.args.getattr.mask = mask;
6755
6756 ldout(cct, 10) << __func__ << " on " << path << dendl;
6757
6758 int r = make_request(req, perms, target);
6759 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6760 return r;
6761 }
6762
6763 bool Client::_dentry_valid(const Dentry *dn)
6764 {
6765 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6766
6767 // is dn lease valid?
6768 utime_t now = ceph_clock_now();
6769 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6770 mds_sessions.count(dn->lease_mds)) {
6771 MetaSession &s = mds_sessions.at(dn->lease_mds);
6772 if (s.cap_ttl > now && s.cap_gen == dn->lease_gen) {
6773 dlease_hit();
6774 return true;
6775 }
6776
6777 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6778 << " vs lease_gen " << dn->lease_gen << dendl;
6779 }
6780
6781 dlease_miss();
6782 return false;
6783 }
6784
6785 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6786 const UserPerm& perms, std::string* alternate_name)
6787 {
6788 int r = 0;
6789 Dentry *dn = NULL;
6790 bool did_lookup_request = false;
6791 // can only request shared caps
6792 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6793
6794 if (dname == "..") {
6795 if (dir->dentries.empty()) {
6796 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6797 filepath path(dir->ino);
6798 req->set_filepath(path);
6799
6800 InodeRef tmptarget;
6801 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6802
6803 if (r == 0) {
6804 *target = std::move(tmptarget);
6805 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6806 } else {
6807 *target = dir;
6808 }
6809 }
6810 else
6811 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6812 goto done;
6813 }
6814
6815 if (dname == ".") {
6816 *target = dir;
6817 goto done;
6818 }
6819
6820 if (!dir->is_dir()) {
6821 r = -CEPHFS_ENOTDIR;
6822 goto done;
6823 }
6824
6825 if (dname.length() > NAME_MAX) {
6826 r = -CEPHFS_ENAMETOOLONG;
6827 goto done;
6828 }
6829
6830 if (dname == cct->_conf->client_snapdir &&
6831 dir->snapid == CEPH_NOSNAP) {
6832 *target = open_snapdir(dir);
6833 goto done;
6834 }
6835
6836 relookup:
6837 if (dir->dir &&
6838 dir->dir->dentries.count(dname)) {
6839 dn = dir->dir->dentries[dname];
6840
6841 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6842 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6843
6844 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6845 if (_dentry_valid(dn)) {
6846 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6847 // make trim_caps() behave.
6848 dir->try_touch_cap(dn->lease_mds);
6849 goto hit_dn;
6850 }
6851 // dir shared caps?
6852 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6853 if (dn->cap_shared_gen == dir->shared_gen &&
6854 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6855 goto hit_dn;
6856 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6857 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6858 << *dir << " dn '" << dname << "'" << dendl;
6859 return -CEPHFS_ENOENT;
6860 }
6861 }
6862 } else {
6863 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6864 }
6865 } else {
6866 // can we conclude ENOENT locally?
6867 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6868 (dir->flags & I_COMPLETE)) {
6869 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6870 return -CEPHFS_ENOENT;
6871 }
6872 }
6873
6874 if (did_lookup_request) {
6875 r = 0;
6876 goto done;
6877 }
6878 r = _do_lookup(dir, dname, mask, target, perms);
6879 did_lookup_request = true;
6880 if (r == 0) {
6881 /* complete lookup to get dentry for alternate_name */
6882 goto relookup;
6883 } else {
6884 goto done;
6885 }
6886
6887 hit_dn:
6888 if (dn->inode) {
6889 *target = dn->inode;
6890 if (alternate_name)
6891 *alternate_name = dn->alternate_name;
6892 } else {
6893 r = -CEPHFS_ENOENT;
6894 }
6895 touch_dn(dn);
6896 goto done;
6897
6898 done:
6899 if (r < 0)
6900 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6901 else
6902 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6903 return r;
6904 }
6905
6906 int Client::get_or_create(Inode *dir, const char* name,
6907 Dentry **pdn, bool expect_null)
6908 {
6909 // lookup
6910 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6911 dir->open_dir();
6912 if (dir->dir->dentries.count(name)) {
6913 Dentry *dn = dir->dir->dentries[name];
6914 if (_dentry_valid(dn)) {
6915 if (expect_null)
6916 return -CEPHFS_EEXIST;
6917 }
6918 *pdn = dn;
6919 } else {
6920 // otherwise link up a new one
6921 *pdn = link(dir->dir, name, NULL, NULL);
6922 }
6923
6924 // success
6925 return 0;
6926 }
6927
6928 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
6929 {
6930 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6931 if (!mref_reader.is_state_satisfied())
6932 return -CEPHFS_ENOTCONN;
6933
6934 ldout(cct, 10) << __func__ << ": " << path << dendl;
6935
6936 std::scoped_lock lock(client_lock);
6937
6938 return path_walk(path, wdr, perms, followsym);
6939 }
6940
6941 int Client::path_walk(const filepath& origpath, InodeRef *end,
6942 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
6943 {
6944 walk_dentry_result wdr;
6945 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
6946 *end = std::move(wdr.in);
6947 return rc;
6948 }
6949
6950 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
6951 bool followsym, int mask, InodeRef dirinode)
6952 {
6953 filepath path = origpath;
6954 InodeRef cur;
6955 std::string alternate_name;
6956 if (origpath.absolute())
6957 cur = root;
6958 else if (!dirinode)
6959 cur = cwd;
6960 else {
6961 cur = dirinode;
6962 }
6963 ceph_assert(cur);
6964
6965 ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
6966 ldout(cct, 10) << __func__ << " " << path << dendl;
6967
6968 int symlinks = 0;
6969
6970 unsigned i=0;
6971 while (i < path.depth() && cur) {
6972 int caps = 0;
6973 const string &dname = path[i];
6974 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6975 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6976 InodeRef next;
6977 if (cct->_conf->client_permissions) {
6978 int r = may_lookup(cur.get(), perms);
6979 if (r < 0)
6980 return r;
6981 caps = CEPH_CAP_AUTH_SHARED;
6982 }
6983
6984 /* Get extra requested caps on the last component */
6985 if (i == (path.depth() - 1))
6986 caps |= mask;
6987 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
6988 if (r < 0)
6989 return r;
6990 // only follow a trailing symlink if followsym is set. always follow
6991 // intermediate 'directory' symlinks.
6992 if (next && next->is_symlink()) {
6993 symlinks++;
6994 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6995 if (symlinks > MAXSYMLINKS) {
6996 return -CEPHFS_ELOOP;
6997 }
6998
6999 if (i < path.depth() - 1) {
7000 // dir symlink
7001 // replace consumed components of path with symlink dir target
7002 filepath resolved(next->symlink.c_str());
7003 resolved.append(path.postfixpath(i + 1));
7004 path = resolved;
7005 i = 0;
7006 if (next->symlink[0] == '/') {
7007 cur = root;
7008 }
7009 continue;
7010 } else if (followsym) {
7011 if (next->symlink[0] == '/') {
7012 path = next->symlink.c_str();
7013 i = 0;
7014 // reset position
7015 cur = root;
7016 } else {
7017 filepath more(next->symlink.c_str());
7018 // we need to remove the symlink component from the path
7019 // before adding the target that the symlink points to; remain
7020 // at the same position in the path.
7021 path.pop_dentry();
7022 path.append(more);
7023 }
7024 continue;
7025 }
7026 }
7027 cur.swap(next);
7028 i++;
7029 }
7030 if (!cur)
7031 return -CEPHFS_ENOENT;
7032 if (result) {
7033 result->in = std::move(cur);
7034 result->alternate_name = std::move(alternate_name);
7035 }
7036 return 0;
7037 }
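// Symlink handling above, by example: walking "a/b/c" where "b" is a symlink
// to "/x" rewrites the remaining path to "/x/c" and restarts from the root;
// a trailing symlink ("c" itself) is followed only when followsym is true,
// and more than MAXSYMLINKS expansions fail with -CEPHFS_ELOOP.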
7038
7039
7040 // namespace ops
7041
7042 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7043 {
7044 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7045 if (!mref_reader.is_state_satisfied())
7046 return -CEPHFS_ENOTCONN;
7047
7048 tout(cct) << "link" << std::endl;
7049 tout(cct) << relexisting << std::endl;
7050 tout(cct) << relpath << std::endl;
7051
7052 filepath existing(relexisting);
7053
7054 InodeRef in, dir;
7055
7056 std::scoped_lock lock(client_lock);
7057 int r = path_walk(existing, &in, perm, true);
7058 if (r < 0)
7059 return r;
7060 if (std::string(relpath) == "/") {
7061 r = -CEPHFS_EEXIST;
7062 return r;
7063 }
7064 filepath path(relpath);
7065 string name = path.last_dentry();
7066 path.pop_dentry();
7067
7068 r = path_walk(path, &dir, perm, true);
7069 if (r < 0)
7070 return r;
7071 if (cct->_conf->client_permissions) {
7072 if (S_ISDIR(in->mode)) {
7073 r = -CEPHFS_EPERM;
7074 return r;
7075 }
7076 r = may_hardlink(in.get(), perm);
7077 if (r < 0)
7078 return r;
7079 r = may_create(dir.get(), perm);
7080 if (r < 0)
7081 return r;
7082 }
7083 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7084 return r;
7085 }
7086
7087 int Client::unlink(const char *relpath, const UserPerm& perm)
7088 {
7089 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7090 }
7091
7092 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7093 {
7094 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7095 if (!mref_reader.is_state_satisfied()) {
7096 return -CEPHFS_ENOTCONN;
7097 }
7098
7099 tout(cct) << __func__ << std::endl;
7100 tout(cct) << dirfd << std::endl;
7101 tout(cct) << relpath << std::endl;
7102 tout(cct) << flags << std::endl;
7103
7104 if (std::string(relpath) == "/") {
7105 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7106 }
7107
7108 filepath path(relpath);
7109 string name = path.last_dentry();
7110 path.pop_dentry();
7111 InodeRef dir;
7112
7113 std::scoped_lock lock(client_lock);
7114
7115 InodeRef dirinode;
7116 int r = get_fd_inode(dirfd, &dirinode);
7117 if (r < 0) {
7118 return r;
7119 }
7120
7121 r = path_walk(path, &dir, perm, true, 0, dirinode);
7122 if (r < 0) {
7123 return r;
7124 }
7125 if (cct->_conf->client_permissions) {
7126 r = may_delete(dir.get(), name.c_str(), perm);
7127 if (r < 0) {
7128 return r;
7129 }
7130 }
7131 if (flags & AT_REMOVEDIR) {
7132 r = _rmdir(dir.get(), name.c_str(), perm);
7133 } else {
7134 r = _unlink(dir.get(), name.c_str(), perm);
7135 }
7136 return r;
7137 }
7138
7139 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7140 {
7141 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7142 if (!mref_reader.is_state_satisfied())
7143 return -CEPHFS_ENOTCONN;
7144
7145 tout(cct) << __func__ << std::endl;
7146 tout(cct) << relfrom << std::endl;
7147 tout(cct) << relto << std::endl;
7148
7149 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7150 return -CEPHFS_EBUSY;
7151
7152 filepath from(relfrom);
7153 filepath to(relto);
7154 string fromname = from.last_dentry();
7155 from.pop_dentry();
7156 string toname = to.last_dentry();
7157 to.pop_dentry();
7158
7159 InodeRef fromdir, todir;
7160
7161 std::scoped_lock lock(client_lock);
7162 int r = path_walk(from, &fromdir, perm);
7163 if (r < 0)
7164 goto out;
7165 r = path_walk(to, &todir, perm);
7166 if (r < 0)
7167 goto out;
7168
7169 if (cct->_conf->client_permissions) {
7170 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7171 if (r < 0)
7172 return r;
7173 r = may_delete(todir.get(), toname.c_str(), perm);
7174 if (r < 0 && r != -CEPHFS_ENOENT)
7175 return r;
7176 }
7177 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7178 out:
7179 return r;
7180 }
7181
7182 // dirs
7183
7184 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7185 {
7186 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7187 }
7188
7189 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7190 std::string alternate_name)
7191 {
7192 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7193 if (!mref_reader.is_state_satisfied())
7194 return -CEPHFS_ENOTCONN;
7195
7196 tout(cct) << __func__ << std::endl;
7197 tout(cct) << dirfd << std::endl;
7198 tout(cct) << relpath << std::endl;
7199 tout(cct) << mode << std::endl;
7200 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7201
7202 if (std::string(relpath) == "/") {
7203 return -CEPHFS_EEXIST;
7204 }
7205
7206 filepath path(relpath);
7207 string name = path.last_dentry();
7208 path.pop_dentry();
7209 InodeRef dir;
7210
7211 std::scoped_lock lock(client_lock);
7212
7213 InodeRef dirinode;
7214 int r = get_fd_inode(dirfd, &dirinode);
7215 if (r < 0) {
7216 return r;
7217 }
7218
7219 r = path_walk(path, &dir, perm, true, 0, dirinode);
7220 if (r < 0) {
7221 return r;
7222 }
7223 if (cct->_conf->client_permissions) {
7224 r = may_create(dir.get(), perm);
7225 if (r < 0) {
7226 return r;
7227 }
7228 }
7229 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7230 }
7231
7232 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7233 {
7234 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7235 if (!mref_reader.is_state_satisfied())
7236 return -CEPHFS_ENOTCONN;
7237
7238 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7239 tout(cct) << __func__ << std::endl;
7240 tout(cct) << relpath << std::endl;
7241 tout(cct) << mode << std::endl;
7242
7243 // get through the existing parts of the path
7244 filepath path(relpath);
7245 unsigned int i;
7246 int r = 0, caps = 0;
7247 InodeRef cur, next;
7248
7249 std::scoped_lock lock(client_lock);
7250 cur = cwd;
7251 for (i=0; i<path.depth(); ++i) {
7252 if (cct->_conf->client_permissions) {
7253 r = may_lookup(cur.get(), perms);
7254 if (r < 0)
7255 break;
7256 caps = CEPH_CAP_AUTH_SHARED;
7257 }
7258 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7259 if (r < 0)
7260 break;
7261 cur.swap(next);
7262 }
7263 if (r != -CEPHFS_ENOENT) return r;
7264 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7265 // make a new directory at each remaining level
7266 for (; i<path.depth(); ++i) {
7267 if (cct->_conf->client_permissions) {
7268 r = may_create(cur.get(), perms);
7269 if (r < 0)
7270 return r;
7271 }
7272 // make the new dir
7273 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7274
7275 // check proper creation/existence
7276 if (r == -CEPHFS_EEXIST && i < path.depth() - 1) {
7277 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7278 }
7279 if (r < 0)
7280 return r;
7281 // move to the new dir and continue
7282 cur.swap(next);
7283 ldout(cct, 20) << __func__ << ": successfully created directory "
7284 << filepath(cur->ino).get_path() << dendl;
7285 }
7286 return 0;
7287 }
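// mkdirs() behaves like `mkdir -p`: mkdirs("a/b/c", 0755, perms) skips the
// components that already exist and creates the rest one level at a time,
// tolerating a racing creator via the -CEPHFS_EEXIST re-lookup above.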
7288
7289 int Client::rmdir(const char *relpath, const UserPerm& perms)
7290 {
7291 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7292 }
7293
7294 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7295 {
7296 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7297 if (!mref_reader.is_state_satisfied())
7298 return -CEPHFS_ENOTCONN;
7299
7300 tout(cct) << __func__ << std::endl;
7301 tout(cct) << relpath << std::endl;
7302 tout(cct) << mode << std::endl;
7303 tout(cct) << rdev << std::endl;
7304
7305 if (std::string(relpath) == "/")
7306 return -CEPHFS_EEXIST;
7307
7308 filepath path(relpath);
7309 string name = path.last_dentry();
7310 path.pop_dentry();
7311 InodeRef dir;
7312
7313 std::scoped_lock lock(client_lock);
7314 int r = path_walk(path, &dir, perms);
7315 if (r < 0)
7316 return r;
7317 if (cct->_conf->client_permissions) {
7318 int r = may_create(dir.get(), perms);
7319 if (r < 0)
7320 return r;
7321 }
7322 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7323 }
7324
7325 // symlinks
7326
7327 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7328 {
7329 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7330 }
7331
7332 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7333 std::string alternate_name)
7334 {
7335 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7336 if (!mref_reader.is_state_satisfied()) {
7337 return -CEPHFS_ENOTCONN;
7338 }
7339
7340 tout(cct) << __func__ << std::endl;
7341 tout(cct) << target << std::endl;
7342 tout(cct) << dirfd << std::endl;
7343 tout(cct) << relpath << std::endl;
7344
7345 if (std::string(relpath) == "/") {
7346 return -CEPHFS_EEXIST;
7347 }
7348
7349 filepath path(relpath);
7350 string name = path.last_dentry();
7351 path.pop_dentry();
7352 InodeRef dir;
7353
7354 std::scoped_lock lock(client_lock);
7355
7356 InodeRef dirinode;
7357 int r = get_fd_inode(dirfd, &dirinode);
7358 if (r < 0) {
7359 return r;
7360 }
7361 r = path_walk(path, &dir, perms, true, 0, dirinode);
7362 if (r < 0) {
7363 return r;
7364 }
7365 if (cct->_conf->client_permissions) {
7366 int r = may_create(dir.get(), perms);
7367 if (r < 0) {
7368 return r;
7369 }
7370 }
7371 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7372 }
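
// Usage sketch for the *at variant: with CEPHFS_AT_FDCWD the path is
// resolved against the client's cwd (same as symlink()); with a
// directory fd it is resolved relative to that directory, mirroring
// POSIX symlinkat(2). Assuming `dirfd` was obtained from an earlier
// open() of a directory:
//
//   int r = client.symlinkat("../target", dirfd, "link_name", perms);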
7373
7374 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7375 {
7376 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7377 }
7378
7379 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7380 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7381 if (!mref_reader.is_state_satisfied()) {
7382 return -CEPHFS_ENOTCONN;
7383 }
7384
7385 tout(cct) << __func__ << std::endl;
7386 tout(cct) << dirfd << std::endl;
7387 tout(cct) << relpath << std::endl;
7388
7389 InodeRef dirinode;
7390 std::scoped_lock lock(client_lock);
7391 int r = get_fd_inode(dirfd, &dirinode);
7392 if (r < 0) {
7393 return r;
7394 }
7395
7396 InodeRef in;
7397 filepath path(relpath);
7398 r = path_walk(path, &in, perms, false, 0, dirinode);
7399 if (r < 0) {
7400 return r;
7401 }
7402
7403 return _readlink(in.get(), buf, size);
7404 }
7405
7406 int Client::_readlink(Inode *in, char *buf, size_t size)
7407 {
7408 if (!in->is_symlink())
7409 return -CEPHFS_EINVAL;
7410
7411 // copy into buf (at most size bytes)
7412 int r = in->symlink.length();
7413 if (r > (int)size)
7414 r = size;
7415 memcpy(buf, in->symlink.c_str(), r);
7416 return r;
7417 }
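
// Note that, like POSIX readlink(2), _readlink() copies at most `size`
// bytes and does not NUL-terminate the buffer; the return value is the
// number of bytes copied. A caller sketch that adds the terminator:
//
//   char buf[PATH_MAX];
//   int n = client.readlink("mylink", buf, sizeof(buf) - 1, perms);
//   if (n >= 0)
//     buf[n] = '\0';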
7418
7419
7420 // inode stuff
7421
7422 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7423 {
7424 bool yes = in->caps_issued_mask(mask, true);
7425
7426 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7427 if (yes && !force)
7428 return 0;
7429
7430 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7431 filepath path;
7432 in->make_nosnap_relative_path(path);
7433 req->set_filepath(path);
7434 req->set_inode(in);
7435 req->head.args.getattr.mask = mask;
7436
7437 int res = make_request(req, perms);
7438 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7439 return res;
7440 }
7441
7442 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7443 const UserPerm& perms, InodeRef *inp)
7444 {
7445 int issued = in->caps_issued();
7446
7447 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7448 ccap_string(issued) << dendl;
7449
7450 if (in->snapid != CEPH_NOSNAP) {
7451 return -CEPHFS_EROFS;
7452 }
7453 if ((mask & CEPH_SETATTR_SIZE) &&
7454 (uint64_t)stx->stx_size > in->size &&
7455 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7456 perms)) {
7457 return -CEPHFS_EDQUOT;
7458 }
7459
7460 // make the change locally?
7461 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7462 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7463 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7464 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7465 << in->cap_dirtier_gid << ", forcing sync setattr"
7466 << dendl;
7467 /*
7468 * This works because we implicitly flush the caps as part of the
7469 * request, so the cap update check will happen with the writeback
7470 * cap context, and then the setattr check will happen with the
7471 * caller's context.
7472 *
7473 * In reality this pattern is likely pretty rare (different users
7474 * setattr'ing the same file). If that turns out not to be the
7475 * case later, we can build a more complex pipelined cap writeback
7476 * infrastructure...
7477 */
7478 if (!mask)
7479 mask |= CEPH_SETATTR_CTIME;
7480 goto force_request;
7481 }
7482
7483 if (!mask) {
7484 // caller just needs us to bump the ctime
7485 in->ctime = ceph_clock_now();
7486 in->cap_dirtier_uid = perms.uid();
7487 in->cap_dirtier_gid = perms.gid();
7488 if (issued & CEPH_CAP_AUTH_EXCL)
7489 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7490 else if (issued & CEPH_CAP_FILE_EXCL)
7491 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7492 else if (issued & CEPH_CAP_XATTR_EXCL)
7493 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7494 else
7495 mask |= CEPH_SETATTR_CTIME;
7496 }
7497
7498 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7499 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7500
7501 mask &= ~CEPH_SETATTR_KILL_SGUID;
7502
7503 if (mask & CEPH_SETATTR_UID) {
7504 in->ctime = ceph_clock_now();
7505 in->cap_dirtier_uid = perms.uid();
7506 in->cap_dirtier_gid = perms.gid();
7507 in->uid = stx->stx_uid;
7508 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7509 mask &= ~CEPH_SETATTR_UID;
7510 kill_sguid = true;
7511 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7512 }
7513 if (mask & CEPH_SETATTR_GID) {
7514 in->ctime = ceph_clock_now();
7515 in->cap_dirtier_uid = perms.uid();
7516 in->cap_dirtier_gid = perms.gid();
7517 in->gid = stx->stx_gid;
7518 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7519 mask &= ~CEPH_SETATTR_GID;
7520 kill_sguid = true;
7521 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7522 }
7523
7524 if (mask & CEPH_SETATTR_MODE) {
7525 in->ctime = ceph_clock_now();
7526 in->cap_dirtier_uid = perms.uid();
7527 in->cap_dirtier_gid = perms.gid();
7528 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7529 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7530 mask &= ~CEPH_SETATTR_MODE;
7531 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7532 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7533 /* Must squash any setuid/setgid bits with an ownership change */
7534 in->mode &= ~(S_ISUID|S_ISGID);
7535 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7536 }
7537
7538 if (mask & CEPH_SETATTR_BTIME) {
7539 in->ctime = ceph_clock_now();
7540 in->cap_dirtier_uid = perms.uid();
7541 in->cap_dirtier_gid = perms.gid();
7542 in->btime = utime_t(stx->stx_btime);
7543 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7544 mask &= ~CEPH_SETATTR_BTIME;
7545 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7546 }
7547 } else if (mask & CEPH_SETATTR_SIZE) {
7548 /* If we don't have Ax, then we must ask the server to clear the setuid/setgid bits on truncate */
7549 mask |= CEPH_SETATTR_KILL_SGUID;
7550 }
7551
7552 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7553 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7554 if (mask & CEPH_SETATTR_MTIME)
7555 in->mtime = utime_t(stx->stx_mtime);
7556 if (mask & CEPH_SETATTR_ATIME)
7557 in->atime = utime_t(stx->stx_atime);
7558 in->ctime = ceph_clock_now();
7559 in->cap_dirtier_uid = perms.uid();
7560 in->cap_dirtier_gid = perms.gid();
7561 in->time_warp_seq++;
7562 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7563 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7564 }
7565 }
7566 if (!mask) {
7567 in->change_attr++;
7568 return 0;
7569 }
7570
7571 force_request:
7572 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7573
7574 filepath path;
7575
7576 in->make_nosnap_relative_path(path);
7577 req->set_filepath(path);
7578 req->set_inode(in);
7579
7580 if (mask & CEPH_SETATTR_KILL_SGUID) {
7581 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7582 }
7583 if (mask & CEPH_SETATTR_MODE) {
7584 req->head.args.setattr.mode = stx->stx_mode;
7585 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7586 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7587 }
7588 if (mask & CEPH_SETATTR_UID) {
7589 req->head.args.setattr.uid = stx->stx_uid;
7590 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7591 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7592 }
7593 if (mask & CEPH_SETATTR_GID) {
7594 req->head.args.setattr.gid = stx->stx_gid;
7595 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7596 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7597 }
7598 if (mask & CEPH_SETATTR_BTIME) {
7599 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7600 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7601 }
7602 if (mask & CEPH_SETATTR_MTIME) {
7603 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7604 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7605 CEPH_CAP_FILE_WR;
7606 }
7607 if (mask & CEPH_SETATTR_ATIME) {
7608 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7609 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7610 CEPH_CAP_FILE_WR;
7611 }
7612 if (mask & CEPH_SETATTR_SIZE) {
7613 if ((uint64_t)stx->stx_size < mdsmap->get_max_filesize()) {
7614 req->head.args.setattr.size = stx->stx_size;
7615 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7616 } else { //too big!
7617 put_request(req);
7618 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7619 return -CEPHFS_EFBIG;
7620 }
7621 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7622 CEPH_CAP_FILE_WR;
7623 }
7624 req->head.args.setattr.mask = mask;
7625
7626 req->regetattr_mask = mask;
7627
7628 int res = make_request(req, perms, inp);
7629 ldout(cct, 10) << "_setattr result=" << res << dendl;
7630 return res;
7631 }
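
// Sketch of how callers drive _do_setattr(): the CEPH_SETATTR_* bits in
// `mask` select which ceph_statx fields to apply. Changes covered by
// exclusive caps are made locally above; whatever remains in `mask` is
// sent to the MDS as a CEPH_MDS_OP_SETATTR request. For example, a
// combined chmod+truncate could be expressed as:
//
//   struct ceph_statx stx = {};
//   stx.stx_mode = 0640;
//   stx.stx_size = 0;
//   int r = _do_setattr(in, &stx, CEPH_SETATTR_MODE | CEPH_SETATTR_SIZE,
//                       perms, nullptr);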
7632
7633 /* Note that we only care about attrs that setattr cares about */
7634 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7635 {
7636 stx->stx_size = st->st_size;
7637 stx->stx_mode = st->st_mode;
7638 stx->stx_uid = st->st_uid;
7639 stx->stx_gid = st->st_gid;
7640 #ifdef __APPLE__
7641 stx->stx_mtime = st->st_mtimespec;
7642 stx->stx_atime = st->st_atimespec;
7643 #elif __WIN32
7644 stx->stx_mtime.tv_sec = st->st_mtime;
7645 stx->stx_atime.tv_sec = st->st_atime;
7646 #else
7647 stx->stx_mtime = st->st_mtim;
7648 stx->stx_atime = st->st_atim;
7649 #endif
7650 }
7651
7652 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7653 const UserPerm& perms, InodeRef *inp)
7654 {
7655 int ret = _do_setattr(in, stx, mask, perms, inp);
7656 if (ret < 0)
7657 return ret;
7658 if (mask & CEPH_SETATTR_MODE)
7659 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7660 return ret;
7661 }
7662
7663 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7664 const UserPerm& perms)
7665 {
7666 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7667 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7668 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7669 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7670 if (cct->_conf->client_permissions) {
7671 int r = may_setattr(in.get(), stx, mask, perms);
7672 if (r < 0)
7673 return r;
7674 }
7675 return __setattrx(in.get(), stx, mask, perms);
7676 }
7677
7678 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7679 const UserPerm& perms)
7680 {
7681 struct ceph_statx stx;
7682
7683 stat_to_statx(attr, &stx);
7684 mask &= ~CEPH_SETATTR_BTIME;
7685
7686 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7687 mask &= ~CEPH_SETATTR_UID;
7688 }
7689 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7690 mask &= ~CEPH_SETATTR_GID;
7691 }
7692
7693 return _setattrx(in, &stx, mask, perms);
7694 }
7695
7696 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7697 const UserPerm& perms)
7698 {
7699 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7700 if (!mref_reader.is_state_satisfied())
7701 return -CEPHFS_ENOTCONN;
7702
7703 tout(cct) << __func__ << std::endl;
7704 tout(cct) << relpath << std::endl;
7705 tout(cct) << mask << std::endl;
7706
7707 filepath path(relpath);
7708 InodeRef in;
7709
7710 std::scoped_lock lock(client_lock);
7711 int r = path_walk(path, &in, perms);
7712 if (r < 0)
7713 return r;
7714 return _setattr(in, attr, mask, perms);
7715 }
7716
7717 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7718 const UserPerm& perms, int flags)
7719 {
7720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7721 if (!mref_reader.is_state_satisfied())
7722 return -CEPHFS_ENOTCONN;
7723
7724 tout(cct) << __func__ << std::endl;
7725 tout(cct) << relpath << std::endl;
7726 tout(cct) << mask << std::endl;
7727
7728 filepath path(relpath);
7729 InodeRef in;
7730
7731 std::scoped_lock lock(client_lock);
7732 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7733 if (r < 0)
7734 return r;
7735 return _setattrx(in, stx, mask, perms);
7736 }
7737
7738 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7739 {
7740 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7741 if (!mref_reader.is_state_satisfied())
7742 return -CEPHFS_ENOTCONN;
7743
7744 tout(cct) << __func__ << std::endl;
7745 tout(cct) << fd << std::endl;
7746 tout(cct) << mask << std::endl;
7747
7748 std::scoped_lock lock(client_lock);
7749 Fh *f = get_filehandle(fd);
7750 if (!f)
7751 return -CEPHFS_EBADF;
7752 #if defined(__linux__) && defined(O_PATH)
7753 if (f->flags & O_PATH)
7754 return -CEPHFS_EBADF;
7755 #endif
7756 return _setattr(f->inode, attr, mask, perms);
7757 }
7758
7759 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7760 {
7761 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7762 if (!mref_reader.is_state_satisfied())
7763 return -CEPHFS_ENOTCONN;
7764
7765 tout(cct) << __func__ << std::endl;
7766 tout(cct) << fd << std::endl;
7767 tout(cct) << mask << std::endl;
7768
7769 std::scoped_lock lock(client_lock);
7770 Fh *f = get_filehandle(fd);
7771 if (!f)
7772 return -CEPHFS_EBADF;
7773 #if defined(__linux__) && defined(O_PATH)
7774 if (f->flags & O_PATH)
7775 return -CEPHFS_EBADF;
7776 #endif
7777 return _setattrx(f->inode, stx, mask, perms);
7778 }
7779
7780 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7781 frag_info_t *dirstat, int mask)
7782 {
7783 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7784 if (!mref_reader.is_state_satisfied())
7785 return -CEPHFS_ENOTCONN;
7786
7787 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7788 tout(cct) << "stat" << std::endl;
7789 tout(cct) << relpath << std::endl;
7790
7791 filepath path(relpath);
7792 InodeRef in;
7793
7794 std::scoped_lock lock(client_lock);
7795 int r = path_walk(path, &in, perms, true, mask);
7796 if (r < 0)
7797 return r;
7798 r = _getattr(in, mask, perms);
7799 if (r < 0) {
7800 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7801 return r;
7802 }
7803 fill_stat(in, stbuf, dirstat);
7804 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7805 return r;
7806 }
7807
7808 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7809 {
7810 unsigned mask = 0;
7811
7812 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7813 if (flags & AT_NO_ATTR_SYNC)
7814 goto out;
7815
7816 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7817 mask |= CEPH_CAP_PIN;
7818 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7819 mask |= CEPH_CAP_AUTH_SHARED;
7820 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7821 mask |= CEPH_CAP_LINK_SHARED;
7822 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7823 mask |= CEPH_CAP_FILE_SHARED;
7824 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7825 mask |= CEPH_CAP_XATTR_SHARED;
7826 out:
7827 return mask;
7828 }
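
// Illustration: asking for CEPH_STATX_MODE and CEPH_STATX_SIZE yields
// CEPH_CAP_PIN | CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_SHARED, while
// passing AT_NO_ATTR_SYNC yields 0 and the attributes are served purely
// from whatever is already cached:
//
//   unsigned m = statx_to_mask(0, CEPH_STATX_MODE | CEPH_STATX_SIZE);
//   // m == CEPH_CAP_PIN | CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_SHARED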
7829
7830 int Client::statx(const char *relpath, struct ceph_statx *stx,
7831 const UserPerm& perms,
7832 unsigned int want, unsigned int flags)
7833 {
7834 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
7835 }
7836
7837 int Client::lstat(const char *relpath, struct stat *stbuf,
7838 const UserPerm& perms, frag_info_t *dirstat, int mask)
7839 {
7840 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7841 if (!mref_reader.is_state_satisfied())
7842 return -CEPHFS_ENOTCONN;
7843
7844 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7845 tout(cct) << __func__ << std::endl;
7846 tout(cct) << relpath << std::endl;
7847
7848 filepath path(relpath);
7849 InodeRef in;
7850
7851 std::scoped_lock lock(client_lock);
7852 // don't follow symlinks
7853 int r = path_walk(path, &in, perms, false, mask);
7854 if (r < 0)
7855 return r;
7856 r = _getattr(in, mask, perms);
7857 if (r < 0) {
7858 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7859 return r;
7860 }
7861 fill_stat(in, stbuf, dirstat);
7862 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7863 return r;
7864 }
7865
7866 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7867 {
7868 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7869 << " mode 0" << oct << in->mode << dec
7870 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7871 memset(st, 0, sizeof(struct stat));
7872 if (use_faked_inos())
7873 st->st_ino = in->faked_ino;
7874 else
7875 st->st_ino = in->ino;
7876 st->st_dev = in->snapid;
7877 st->st_mode = in->mode;
7878 st->st_rdev = in->rdev;
7879 if (in->is_dir()) {
7880 switch (in->nlink) {
7881 case 0:
7882 st->st_nlink = 0; /* dir is unlinked */
7883 break;
7884 case 1:
7885 st->st_nlink = 1 /* parent dentry */
7886 + 1 /* <dir>/. */
7887 + in->dirstat.nsubdirs; /* one <subdir>/.. back-reference per subdir */
7888 break;
7889 default:
7890 ceph_abort();
7891 }
7892 } else {
7893 st->st_nlink = in->nlink;
7894 }
7895 st->st_uid = in->uid;
7896 st->st_gid = in->gid;
7897 if (in->ctime > in->mtime) {
7898 stat_set_ctime_sec(st, in->ctime.sec());
7899 stat_set_ctime_nsec(st, in->ctime.nsec());
7900 } else {
7901 stat_set_ctime_sec(st, in->mtime.sec());
7902 stat_set_ctime_nsec(st, in->mtime.nsec());
7903 }
7904 stat_set_atime_sec(st, in->atime.sec());
7905 stat_set_atime_nsec(st, in->atime.nsec());
7906 stat_set_mtime_sec(st, in->mtime.sec());
7907 stat_set_mtime_nsec(st, in->mtime.nsec());
7908 if (in->is_dir()) {
7909 if (cct->_conf->client_dirsize_rbytes)
7910 st->st_size = in->rstat.rbytes;
7911 else
7912 st->st_size = in->dirstat.size();
7913 // The Windows "stat" structure provides just a subset of the fields that are
7914 // available on Linux.
7915 #ifndef _WIN32
7916 st->st_blocks = 1;
7917 #endif
7918 } else {
7919 st->st_size = in->size;
7920 #ifndef _WIN32
7921 st->st_blocks = (in->size + 511) >> 9;
7922 #endif
7923 }
7924 #ifndef _WIN32
7925 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7926 #endif
7927
7928 if (dirstat)
7929 *dirstat = in->dirstat;
7930 if (rstat)
7931 *rstat = in->rstat;
7932
7933 return in->caps_issued();
7934 }
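
// Worked example of the directory nlink rule above: a live directory is
// linked once from its parent dentry and once from its own "." entry,
// plus one ".." back-reference per subdirectory, so a directory holding
// three subdirectories reports st_nlink == 1 + 1 + 3 == 5, just as a
// local POSIX filesystem would.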
7935
7936 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7937 {
7938 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7939 << " mode 0" << oct << in->mode << dec
7940 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7941 memset(stx, 0, sizeof(struct ceph_statx));
7942
7943 /*
7944 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7945 * so that all bits are set.
7946 */
7947 if (!mask)
7948 mask = ~0;
7949
7950 /* These are always considered to be available */
7951 stx->stx_dev = in->snapid;
7952 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7953
7954 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7955 stx->stx_mode = S_IFMT & in->mode;
7956 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7957 stx->stx_rdev = in->rdev;
7958 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7959
7960 if (mask & CEPH_CAP_AUTH_SHARED) {
7961 stx->stx_uid = in->uid;
7962 stx->stx_gid = in->gid;
7963 stx->stx_mode = in->mode;
7964 in->btime.to_timespec(&stx->stx_btime);
7965 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7966 }
7967
7968 if (mask & CEPH_CAP_LINK_SHARED) {
7969 if (in->is_dir()) {
7970 switch (in->nlink) {
7971 case 0:
7972 stx->stx_nlink = 0; /* dir is unlinked */
7973 break;
7974 case 1:
7975 stx->stx_nlink = 1 /* parent dentry */
7976 + 1 /* <dir>/. */
7977 + in->dirstat.nsubdirs; /* one <subdir>/.. back-reference per subdir */
7978 break;
7979 default:
7980 ceph_abort();
7981 }
7982 } else {
7983 stx->stx_nlink = in->nlink;
7984 }
7985 stx->stx_mask |= CEPH_STATX_NLINK;
7986 }
7987
7988 if (mask & CEPH_CAP_FILE_SHARED) {
7989
7990 in->atime.to_timespec(&stx->stx_atime);
7991 in->mtime.to_timespec(&stx->stx_mtime);
7992
7993 if (in->is_dir()) {
7994 if (cct->_conf->client_dirsize_rbytes)
7995 stx->stx_size = in->rstat.rbytes;
7996 else
7997 stx->stx_size = in->dirstat.size();
7998 stx->stx_blocks = 1;
7999 } else {
8000 stx->stx_size = in->size;
8001 stx->stx_blocks = (in->size + 511) >> 9;
8002 }
8003 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
8004 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
8005 }
8006
8007 /* Change time and change_attr both require all shared caps to view */
8008 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
8009 stx->stx_version = in->change_attr;
8010 if (in->ctime > in->mtime)
8011 in->ctime.to_timespec(&stx->stx_ctime);
8012 else
8013 in->mtime.to_timespec(&stx->stx_ctime);
8014 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
8015 }
8016
8017 }
8018
8019 void Client::touch_dn(Dentry *dn)
8020 {
8021 lru.lru_touch(dn);
8022 }
8023
8024 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8025 {
8026 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
8027 }
8028
8029 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8030 {
8031 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8032 if (!mref_reader.is_state_satisfied())
8033 return -CEPHFS_ENOTCONN;
8034
8035 tout(cct) << __func__ << std::endl;
8036 tout(cct) << fd << std::endl;
8037 tout(cct) << mode << std::endl;
8038
8039 std::scoped_lock lock(client_lock);
8040 Fh *f = get_filehandle(fd);
8041 if (!f)
8042 return -CEPHFS_EBADF;
8043 #if defined(__linux__) && defined(O_PATH)
8044 if (f->flags & O_PATH)
8045 return -CEPHFS_EBADF;
8046 #endif
8047 struct stat attr;
8048 attr.st_mode = mode;
8049 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8050 }
8051
8052 int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8053 const UserPerm& perms) {
8054 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8055 if (!mref_reader.is_state_satisfied()) {
8056 return -CEPHFS_ENOTCONN;
8057 }
8058
8059 tout(cct) << __func__ << std::endl;
8060 tout(cct) << dirfd << std::endl;
8061 tout(cct) << relpath << std::endl;
8062 tout(cct) << mode << std::endl;
8063 tout(cct) << flags << std::endl;
8064
8065 filepath path(relpath);
8066 InodeRef in;
8067 InodeRef dirinode;
8068
8069 std::scoped_lock lock(client_lock);
8070 int r = get_fd_inode(dirfd, &dirinode);
8071 if (r < 0) {
8072 return r;
8073 }
8074
8075 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8076 if (r < 0) {
8077 return r;
8078 }
8079 struct stat attr;
8080 attr.st_mode = mode;
8081 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8082 }
8083
8084 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8085 {
8086 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8087 }
8088
8089 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8090 const UserPerm& perms)
8091 {
8092 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
8093 }
8094
8095 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8096 {
8097 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8098 if (!mref_reader.is_state_satisfied())
8099 return -CEPHFS_ENOTCONN;
8100
8101 tout(cct) << __func__ << std::endl;
8102 tout(cct) << fd << std::endl;
8103 tout(cct) << new_uid << std::endl;
8104 tout(cct) << new_gid << std::endl;
8105
8106 std::scoped_lock lock(client_lock);
8107 Fh *f = get_filehandle(fd);
8108 if (!f)
8109 return -CEPHFS_EBADF;
8110 #if defined(__linux__) && defined(O_PATH)
8111 if (f->flags & O_PATH)
8112 return -CEPHFS_EBADF;
8113 #endif
8114 struct stat attr;
8115 attr.st_uid = new_uid;
8116 attr.st_gid = new_gid;
8117 int mask = 0;
8118 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8119 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8120 return _setattr(f->inode, &attr, mask, perms);
8121 }
8122
8123 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8124 const UserPerm& perms)
8125 {
8126 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8127 }
8128
8129 int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8130 int flags, const UserPerm& perms) {
8131 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8132 if (!mref_reader.is_state_satisfied()) {
8133 return -CEPHFS_ENOTCONN;
8134 }
8135
8136 tout(cct) << __func__ << std::endl;
8137 tout(cct) << dirfd << std::endl;
8138 tout(cct) << relpath << std::endl;
8139 tout(cct) << new_uid << std::endl;
8140 tout(cct) << new_gid << std::endl;
8141 tout(cct) << flags << std::endl;
8142
8143 filepath path(relpath);
8144 InodeRef in;
8145 InodeRef dirinode;
8146
8147 std::scoped_lock lock(client_lock);
8148 int r = get_fd_inode(dirfd, &dirinode);
8149 if (r < 0) {
8150 return r;
8151 }
8152
8153 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8154 if (r < 0) {
8155 return r;
8156 }
8157 struct stat attr;
8158 attr.st_uid = new_uid;
8159 attr.st_gid = new_gid;
8160 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8161 }
8162
8163 static void attr_set_atime_and_mtime(struct stat *attr,
8164 const utime_t &atime,
8165 const utime_t &mtime)
8166 {
8167 stat_set_atime_sec(attr, atime.tv.tv_sec);
8168 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8169 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8170 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8171 }
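
// The helper above only stages the two timestamps in a struct stat;
// pairing it with CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME in a _setattr()
// call is what actually applies them. A sketch:
//
//   struct stat attr;
//   attr_set_atime_and_mtime(&attr, utime_t(1, 0), utime_t(2, 0));
//   _setattr(in, &attr, CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME, perms);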
8172
8173 // for [l]utime(), invoke the timeval variant as the timespec
8174 // variants are not yet implemented. for futime[s](), invoke
8175 // the timespec variant.
8176 int Client::utime(const char *relpath, struct utimbuf *buf,
8177 const UserPerm& perms)
8178 {
8179 struct timeval tv[2];
8180 tv[0].tv_sec = buf->actime;
8181 tv[0].tv_usec = 0;
8182 tv[1].tv_sec = buf->modtime;
8183 tv[1].tv_usec = 0;
8184
8185 return utimes(relpath, tv, perms);
8186 }
8187
8188 int Client::lutime(const char *relpath, struct utimbuf *buf,
8189 const UserPerm& perms)
8190 {
8191 struct timeval tv[2];
8192 tv[0].tv_sec = buf->actime;
8193 tv[0].tv_usec = 0;
8194 tv[1].tv_sec = buf->modtime;
8195 tv[1].tv_usec = 0;
8196
8197 return lutimes(relpath, tv, perms);
8198 }
8199
8200 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8201 {
8202 struct timespec ts[2];
8203 ts[0].tv_sec = buf->actime;
8204 ts[0].tv_nsec = 0;
8205 ts[1].tv_sec = buf->modtime;
8206 ts[1].tv_nsec = 0;
8207
8208 return futimens(fd, ts, perms);
8209 }
8210
8211 int Client::utimes(const char *relpath, struct timeval times[2],
8212 const UserPerm& perms)
8213 {
8214 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8215 if (!mref_reader.is_state_satisfied())
8216 return -CEPHFS_ENOTCONN;
8217
8218 tout(cct) << __func__ << std::endl;
8219 tout(cct) << relpath << std::endl;
8220 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8221 << std::endl;
8222 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8223 << std::endl;
8224
8225 filepath path(relpath);
8226 InodeRef in;
8227
8228 std::scoped_lock lock(client_lock);
8229 int r = path_walk(path, &in, perms);
8230 if (r < 0)
8231 return r;
8232 struct stat attr;
8233 utime_t atime(times[0]);
8234 utime_t mtime(times[1]);
8235
8236 attr_set_atime_and_mtime(&attr, atime, mtime);
8237 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8238 }
8239
8240 int Client::lutimes(const char *relpath, struct timeval times[2],
8241 const UserPerm& perms)
8242 {
8243 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8244 if (!mref_reader.is_state_satisfied())
8245 return -CEPHFS_ENOTCONN;
8246
8247 tout(cct) << __func__ << std::endl;
8248 tout(cct) << relpath << std::endl;
8249 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8250 << std::endl;
8251 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8252 << std::endl;
8253
8254 filepath path(relpath);
8255 InodeRef in;
8256
8257 std::scoped_lock lock(client_lock);
8258 int r = path_walk(path, &in, perms, false);
8259 if (r < 0)
8260 return r;
8261 struct stat attr;
8262 utime_t atime(times[0]);
8263 utime_t mtime(times[1]);
8264
8265 attr_set_atime_and_mtime(&attr, atime, mtime);
8266 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8267 }
8268
8269 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8270 {
8271 struct timespec ts[2];
8272 ts[0].tv_sec = times[0].tv_sec;
8273 ts[0].tv_nsec = times[0].tv_usec * 1000;
8274 ts[1].tv_sec = times[1].tv_sec;
8275 ts[1].tv_nsec = times[1].tv_usec * 1000;
8276
8277 return futimens(fd, ts, perms);
8278 }
8279
8280 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8281 {
8282 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8283 if (!mref_reader.is_state_satisfied())
8284 return -CEPHFS_ENOTCONN;
8285
8286 tout(cct) << __func__ << std::endl;
8287 tout(cct) << fd << std::endl;
8288 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8289 << std::endl;
8290 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8291 << std::endl;
8292
8293 std::scoped_lock lock(client_lock);
8294 Fh *f = get_filehandle(fd);
8295 if (!f)
8296 return -CEPHFS_EBADF;
8297 #if defined(__linux__) && defined(O_PATH)
8298 if (f->flags & O_PATH)
8299 return -CEPHFS_EBADF;
8300 #endif
8301 struct stat attr;
8302 utime_t atime(times[0]);
8303 utime_t mtime(times[1]);
8304
8305 attr_set_atime_and_mtime(&attr, atime, mtime);
8306 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8307 }
8308
8309 int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
8310 const UserPerm& perms) {
8311 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8312 if (!mref_reader.is_state_satisfied()) {
8313 return -CEPHFS_ENOTCONN;
8314 }
8315
8316 tout(cct) << __func__ << std::endl;
8317 tout(cct) << dirfd << std::endl;
8318 tout(cct) << relpath << std::endl;
8319 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8320 << std::endl;
8321 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8322 << std::endl;
8323 tout(cct) << flags << std::endl;
8324
8325 filepath path(relpath);
8326 InodeRef in;
8327 InodeRef dirinode;
8328
8329 std::scoped_lock lock(client_lock);
8330 int r = get_fd_inode(dirfd, &dirinode);
8331 if (r < 0) {
8332 return r;
8333 }
8334
8335 #if defined(__linux__) && defined(O_PATH)
8336 if (flags & O_PATH) {
8337 return -CEPHFS_EBADF;
8338 }
8339 #endif
8340
8341 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8342 if (r < 0) {
8343 return r;
8344 }
8345 struct stat attr;
8346 utime_t atime(times[0]);
8347 utime_t mtime(times[1]);
8348
8349 attr_set_atime_and_mtime(&attr, atime, mtime);
8350 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8351 }
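
// Usage sketch: as with POSIX utimensat(2), AT_SYMLINK_NOFOLLOW makes
// the call act on a symlink itself rather than its target; times[0] is
// the atime and times[1] the mtime:
//
//   struct timespec ts[2] = {{1600000000, 0}, {1600000000, 0}};
//   int r = client.utimensat(CEPHFS_AT_FDCWD, "mylink", ts,
//                            AT_SYMLINK_NOFOLLOW, perms);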
8352
8353 int Client::flock(int fd, int operation, uint64_t owner)
8354 {
8355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8356 if (!mref_reader.is_state_satisfied())
8357 return -CEPHFS_ENOTCONN;
8358
8359 tout(cct) << __func__ << std::endl;
8360 tout(cct) << fd << std::endl;
8361 tout(cct) << operation << std::endl;
8362 tout(cct) << owner << std::endl;
8363
8364 std::scoped_lock lock(client_lock);
8365 Fh *f = get_filehandle(fd);
8366 if (!f)
8367 return -CEPHFS_EBADF;
8368
8369 return _flock(f, operation, owner);
8370 }
8371
8372 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8373 {
8374 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8375 if (!mref_reader.is_state_satisfied())
8376 return -CEPHFS_ENOTCONN;
8377
8378 tout(cct) << __func__ << std::endl;
8379 tout(cct) << relpath << std::endl;
8380
8381 filepath path(relpath);
8382 InodeRef in;
8383
8384 std::scoped_lock lock(client_lock);
8385 int r = path_walk(path, &in, perms, true);
8386 if (r < 0)
8387 return r;
8388 if (cct->_conf->client_permissions) {
8389 int r = may_open(in.get(), O_RDONLY, perms);
8390 if (r < 0)
8391 return r;
8392 }
8393 r = _opendir(in.get(), dirpp, perms);
8394 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
8395 if (r != -CEPHFS_ENOTDIR)
8396 tout(cct) << (uintptr_t)*dirpp << std::endl;
8397 return r;
8398 }
8399
8400 int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8401 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8402 if (!mref_reader.is_state_satisfied()) {
8403 return -CEPHFS_ENOTCONN;
8404 }
8405
8406 tout(cct) << __func__ << std::endl;
8407 tout(cct) << dirfd << std::endl;
8408
8409 InodeRef dirinode;
8410 std::scoped_lock locker(client_lock);
8411 int r = get_fd_inode(dirfd, &dirinode);
8412 if (r < 0) {
8413 return r;
8414 }
8415
8416 if (cct->_conf->client_permissions) {
8417 r = may_open(dirinode.get(), O_RDONLY, perms);
8418 if (r < 0) {
8419 return r;
8420 }
8421 }
8422 r = _opendir(dirinode.get(), dirpp, perms);
8423 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
8424 if (r != -CEPHFS_ENOTDIR) {
8425 tout(cct) << (uintptr_t)*dirpp << std::endl;
8426 }
8427 return r;
8428 }
8429
8430 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8431 {
8432 if (!in->is_dir())
8433 return -CEPHFS_ENOTDIR;
8434 *dirpp = new dir_result_t(in, perms);
8435 opened_dirs.insert(*dirpp);
8436 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8437 return 0;
8438 }
8439
8440
8441 int Client::closedir(dir_result_t *dir)
8442 {
8443 tout(cct) << __func__ << std::endl;
8444 tout(cct) << (uintptr_t)dir << std::endl;
8445
8446 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8447 std::scoped_lock lock(client_lock);
8448 _closedir(dir);
8449 return 0;
8450 }
8451
8452 void Client::_closedir(dir_result_t *dirp)
8453 {
8454 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8455
8456 if (dirp->inode) {
8457 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8458 dirp->inode.reset();
8459 }
8460 _readdir_drop_dirp_buffer(dirp);
8461 opened_dirs.erase(dirp);
8462 delete dirp;
8463 }
8464
8465 void Client::rewinddir(dir_result_t *dirp)
8466 {
8467 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8468
8469 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8470 if (!mref_reader.is_state_satisfied())
8471 return;
8472
8473 std::scoped_lock lock(client_lock);
8474 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8475 _readdir_drop_dirp_buffer(d);
8476 d->reset();
8477 }
8478
8479 loff_t Client::telldir(dir_result_t *dirp)
8480 {
8481 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8482 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8483 return d->offset;
8484 }
8485
8486 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8487 {
8488 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8489
8490 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8491 if (!mref_reader.is_state_satisfied())
8492 return;
8493
8494 std::scoped_lock lock(client_lock);
8495
8496 if (offset == dirp->offset)
8497 return;
8498
8499 if (offset > dirp->offset)
8500 dirp->release_count = 0; // don't mark the dir complete after a forward seek
8501 else
8502 dirp->ordered_count = 0; // disable filling readdir cache
8503
8504 if (dirp->hash_order()) {
8505 if (dirp->offset > offset) {
8506 _readdir_drop_dirp_buffer(dirp);
8507 dirp->reset();
8508 }
8509 } else {
8510 if (offset == 0 ||
8511 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8512 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8513 _readdir_drop_dirp_buffer(dirp);
8514 dirp->reset();
8515 }
8516 }
8517
8518 dirp->offset = offset;
8519 }
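
// Sketch of the telldir()/seekdir() round trip: the offset is opaque to
// the caller, and seeking backwards (or across a frag boundary) drops
// the buffered frag above so that it is re-fetched from the MDS:
//
//   loff_t pos = client.telldir(dirp);
//   // ... consume some entries ...
//   client.seekdir(dirp, pos);   // rewind to the remembered position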
8520
8521
8522 //struct dirent {
8523 // ino_t d_ino; /* inode number */
8524 // off_t d_off; /* offset to the next dirent */
8525 // unsigned short d_reclen; /* length of this record */
8526 // unsigned char d_type; /* type of file */
8527 // char d_name[256]; /* filename */
8528 //};
8529 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8530 {
8531 strncpy(de->d_name, name, 255);
8532 de->d_name[255] = '\0';
8533 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8534 de->d_ino = ino;
8535 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8536 de->d_off = next_off;
8537 #endif
8538 de->d_reclen = 1;
8539 de->d_type = IFTODT(type);
8540 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8541 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8542 #endif
8543 }
8544
8545 void Client::_readdir_next_frag(dir_result_t *dirp)
8546 {
8547 frag_t fg = dirp->buffer_frag;
8548
8549 if (fg.is_rightmost()) {
8550 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8551 dirp->set_end();
8552 return;
8553 }
8554
8555 // advance
8556 fg = fg.next();
8557 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8558
8559 if (dirp->hash_order()) {
8560 // keep last_name
8561 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8562 if (dirp->offset < new_offset) // don't decrease offset
8563 dirp->offset = new_offset;
8564 } else {
8565 dirp->last_name.clear();
8566 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8567 _readdir_rechoose_frag(dirp);
8568 }
8569 }
8570
8571 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8572 {
8573 ceph_assert(dirp->inode);
8574
8575 if (dirp->hash_order())
8576 return;
8577
8578 frag_t cur = frag_t(dirp->offset_high());
8579 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8580 if (fg != cur) {
8581 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8582 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8583 dirp->last_name.clear();
8584 dirp->next_offset = 2;
8585 }
8586 }
8587
8588 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8589 {
8590 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8591 dirp->buffer.clear();
8592 }
8593
8594 int Client::_readdir_get_frag(dir_result_t *dirp)
8595 {
8596 ceph_assert(dirp);
8597 ceph_assert(dirp->inode);
8598
8599 // get the current frag.
8600 frag_t fg;
8601 if (dirp->hash_order())
8602 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8603 else
8604 fg = frag_t(dirp->offset_high());
8605
8606 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8607 << " offset " << hex << dirp->offset << dec << dendl;
8608
8609 int op = CEPH_MDS_OP_READDIR;
8610 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8611 op = CEPH_MDS_OP_LSSNAP;
8612
8613 InodeRef& diri = dirp->inode;
8614
8615 MetaRequest *req = new MetaRequest(op);
8616 filepath path;
8617 diri->make_nosnap_relative_path(path);
8618 req->set_filepath(path);
8619 req->set_inode(diri.get());
8620 req->head.args.readdir.frag = fg;
8621 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8622 if (dirp->last_name.length()) {
8623 req->path2.set_path(dirp->last_name);
8624 } else if (dirp->hash_order()) {
8625 req->head.args.readdir.offset_hash = dirp->offset_high();
8626 }
8627 req->dirp = dirp;
8628
8629 bufferlist dirbl;
8630 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8631
8632 if (res == -CEPHFS_EAGAIN) {
8633 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8634 _readdir_rechoose_frag(dirp);
8635 return _readdir_get_frag(dirp);
8636 }
8637
8638 if (res == 0) {
8639 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8640 << " size " << dirp->buffer.size() << dendl;
8641 } else {
8642 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8643 dirp->set_end();
8644 }
8645
8646 return res;
8647 }
8648
8649 struct dentry_off_lt {
8650 bool operator()(const Dentry* dn, int64_t off) const {
8651 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8652 }
8653 };
8654
8655 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8656 int caps, bool getref)
8657 {
8658 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8659 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8660 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8661 << dendl;
8662 Dir *dir = dirp->inode->dir;
8663
8664 if (!dir) {
8665 ldout(cct, 10) << " dir is empty" << dendl;
8666 dirp->set_end();
8667 return 0;
8668 }
8669
8670 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8671 dir->readdir_cache.end(),
8672 dirp->offset, dentry_off_lt());
8673
8674 string dn_name;
8675 while (true) {
8676 int mask = caps;
8677 if (!dirp->inode->is_complete_and_ordered())
8678 return -CEPHFS_EAGAIN;
8679 if (pd == dir->readdir_cache.end())
8680 break;
8681 Dentry *dn = *pd;
8682 if (dn->inode == NULL) {
8683 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8684 ++pd;
8685 continue;
8686 }
8687 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8688 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8689 ++pd;
8690 continue;
8691 }
8692
8693 int idx = pd - dir->readdir_cache.begin();
8694 if (dn->inode->is_dir()) {
8695 mask |= CEPH_STAT_RSTAT;
8696 }
8697 int r = _getattr(dn->inode, mask, dirp->perms);
8698 if (r < 0)
8699 return r;
8700
8701 // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8702 pd = dir->readdir_cache.begin() + idx;
8703 if (pd >= dir->readdir_cache.end() || *pd != dn)
8704 return -CEPHFS_EAGAIN;
8705
8706 struct ceph_statx stx;
8707 struct dirent de;
8708 fill_statx(dn->inode, caps, &stx);
8709
8710 uint64_t next_off = dn->offset + 1;
8711 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8712 ++pd;
8713 if (pd == dir->readdir_cache.end())
8714 next_off = dir_result_t::END;
8715
8716 Inode *in = NULL;
8717 if (getref) {
8718 in = dn->inode.get();
8719 _ll_get(in);
8720 }
8721
8722 dn_name = dn->name; // fill in name while we have lock
8723
8724 client_lock.unlock();
8725 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8726 client_lock.lock();
8727 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8728 << " = " << r << dendl;
8729 if (r < 0) {
8730 return r;
8731 }
8732
8733 dirp->offset = next_off;
8734 if (dirp->at_end())
8735 dirp->next_offset = 2;
8736 else
8737 dirp->next_offset = dirp->offset_low();
8738 dirp->last_name = dn_name; // we successfully returned this one; update!
8739 dirp->release_count = 0; // last_name no longer matches cache index
8740 if (r > 0)
8741 return r;
8742 }
8743
8744 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8745 dirp->set_end();
8746 return 0;
8747 }
8748
8749 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8750 unsigned want, unsigned flags, bool getref)
8751 {
8752 int caps = statx_to_mask(flags, want);
8753
8754 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8755 if (!mref_reader.is_state_satisfied())
8756 return -CEPHFS_ENOTCONN;
8757
8758 std::unique_lock cl(client_lock);
8759
8760 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8761
8762 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8763 << dec << " at_end=" << dirp->at_end()
8764 << " hash_order=" << dirp->hash_order() << dendl;
8765
8766 struct dirent de;
8767 struct ceph_statx stx;
8768 memset(&de, 0, sizeof(de));
8769 memset(&stx, 0, sizeof(stx));
8770
8771 InodeRef& diri = dirp->inode;
8772
8773 if (dirp->at_end())
8774 return 0;
8775
8776 if (dirp->offset == 0) {
8777 ldout(cct, 15) << " including ." << dendl;
8778 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8779 uint64_t next_off = 1;
8780
8781 int r;
8782 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
8783 if (r < 0)
8784 return r;
8785
8786 fill_statx(diri, caps, &stx);
8787 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8788
8789 Inode *inode = NULL;
8790 if (getref) {
8791 inode = diri.get();
8792 _ll_get(inode);
8793 }
8794
8795 cl.unlock();
8796 r = cb(p, &de, &stx, next_off, inode);
8797 cl.lock();
8798 if (r < 0)
8799 return r;
8800
8801 dirp->offset = next_off;
8802 if (r > 0)
8803 return r;
8804 }
8805 if (dirp->offset == 1) {
8806 ldout(cct, 15) << " including .." << dendl;
8807 uint64_t next_off = 2;
8808 InodeRef in;
8809 if (diri->dentries.empty())
8810 in = diri;
8811 else
8812 in = diri->get_first_parent()->dir->parent_inode;
8813
8814 int r;
8815 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
8816 if (r < 0)
8817 return r;
8818
8819 fill_statx(in, caps, &stx);
8820 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8821
8822 Inode *inode = NULL;
8823 if (getref) {
8824 inode = in.get();
8825 _ll_get(inode);
8826 }
8827
8828 cl.unlock();
8829 r = cb(p, &de, &stx, next_off, inode);
8830 cl.lock();
8831 if (r < 0)
8832 return r;
8833
8834 dirp->offset = next_off;
8835 if (r > 0)
8836 return r;
8837 }
8838
8839 // can we read from our cache?
8840 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8841 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8842 << dirp->inode->is_complete_and_ordered()
8843 << " issued " << ccap_string(dirp->inode->caps_issued())
8844 << dendl;
8845 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8846 dirp->inode->is_complete_and_ordered() &&
8847 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8848 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8849 if (err != -CEPHFS_EAGAIN)
8850 return err;
8851 }
8852
8853 while (1) {
8854 if (dirp->at_end())
8855 return 0;
8856
8857 bool check_caps = true;
8858 if (!dirp->is_cached()) {
8859 int r = _readdir_get_frag(dirp);
8860 if (r)
8861 return r;
8862 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8863 // different from the requested one (our dirfragtree was outdated).
8864 check_caps = false;
8865 }
8866 frag_t fg = dirp->buffer_frag;
8867
8868 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8869 << " offset " << hex << dirp->offset << dendl;
8870
8871 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8872 dirp->offset, dir_result_t::dentry_off_lt());
8873 it != dirp->buffer.end();
8874 ++it) {
8875 dir_result_t::dentry &entry = *it;
8876
8877 uint64_t next_off = entry.offset + 1;
8878
8879 int r;
8880 if (check_caps) {
8881 int mask = caps;
8882 if (entry.inode->is_dir()) {
8883 mask |= CEPH_STAT_RSTAT;
8884 }
8885 r = _getattr(entry.inode, mask, dirp->perms);
8886 if (r < 0)
8887 return r;
8888 }
8889
8890 fill_statx(entry.inode, caps, &stx);
8891 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8892
8893 Inode *inode = NULL;
8894 if (getref) {
8895 inode = entry.inode.get();
8896 _ll_get(inode);
8897 }
8898
8899 cl.unlock();
8900 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8901 cl.lock();
8902
8903 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8904 << " = " << r << dendl;
8905 if (r < 0)
8906 return r;
8907
8908 dirp->offset = next_off;
8909 if (r > 0)
8910 return r;
8911 }
8912
8913 if (dirp->next_offset > 2) {
8914 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8915 _readdir_drop_dirp_buffer(dirp);
8916 continue; // more!
8917 }
8918
8919 if (!fg.is_rightmost()) {
8920 // next frag!
8921 _readdir_next_frag(dirp);
8922 continue;
8923 }
8924
8925 if (diri->shared_gen == dirp->start_shared_gen &&
8926 diri->dir_release_count == dirp->release_count) {
8927 if (diri->dir_ordered_count == dirp->ordered_count) {
8928 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8929 if (diri->dir) {
8930 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8931 diri->dir->readdir_cache.resize(dirp->cache_index);
8932 }
8933 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8934 } else {
8935 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8936 diri->flags |= I_COMPLETE;
8937 }
8938 }
8939
8940 dirp->set_end();
8941 return 0;
8942 }
8943 ceph_abort();
8944 return 0;
8945 }
8946
8947
8948 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8949 {
8950 return readdirplus_r(d, de, 0, 0, 0, NULL);
8951 }
8952
8953 /*
8954 * readdirplus_r
8955 *
8956 * returns
8957 * 1 if we got a dirent
8958 * 0 for end of directory
8959 * <0 on error
8960 */
8961
8962 struct single_readdir {
8963 struct dirent *de;
8964 struct ceph_statx *stx;
8965 Inode *inode;
8966 bool full;
8967 };
8968
8969 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8970 struct ceph_statx *stx, off_t off,
8971 Inode *in)
8972 {
8973 single_readdir *c = static_cast<single_readdir *>(p);
8974
8975 if (c->full)
8976 return -1; // already filled this dirent
8977
8978 *c->de = *de;
8979 if (c->stx)
8980 *c->stx = *stx;
8981 c->inode = in;
8982 c->full = true;
8983 return 1;
8984 }
8985
8986 struct dirent *Client::readdir(dir_result_t *d)
8987 {
8988 int ret;
8989 auto& de = d->de;
8990 single_readdir sr;
8991 sr.de = &de;
8992 sr.stx = NULL;
8993 sr.inode = NULL;
8994 sr.full = false;
8995
8996 // our callback fills the dirent and sets sr.full=true on first
8997 // call, and returns -1 the second time around.
8998 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8999 if (ret < -1) {
9000 errno = -ret; // this sucks.
9001 return (dirent *) NULL;
9002 }
9003 if (sr.full) {
9004 return &de;
9005 }
9006 return (dirent *) NULL;
9007 }
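
// A minimal caller loop, assuming a mounted Client `client`: readdir()
// returns NULL both at end-of-directory and on error, with errno left 0
// in the former case (when cleared beforehand):
//
//   dir_result_t *dirp;
//   if (client.opendir(".", &dirp, perms) == 0) {
//     errno = 0;
//     while (struct dirent *de = client.readdir(dirp))
//       std::cout << de->d_name << "\n";
//     client.closedir(dirp);
//   }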
9008
9009 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9010 struct ceph_statx *stx, unsigned want,
9011 unsigned flags, Inode **out)
9012 {
9013 single_readdir sr;
9014 sr.de = de;
9015 sr.stx = stx;
9016 sr.inode = NULL;
9017 sr.full = false;
9018
9019 // our callback fills the dirent and sets sr.full=true on first
9020 // call, and returns -1 the second time around.
9021 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9022 if (r < -1)
9023 return r;
9024 if (out)
9025 *out = sr.inode;
9026 if (sr.full)
9027 return 1;
9028 return 0;
9029 }
9030
9031
9032 /* getdents */
9033 struct getdents_result {
9034 char *buf;
9035 int buflen;
9036 int pos;
9037 bool fullent;
9038 };
9039
9040 static int _readdir_getdent_cb(void *p, struct dirent *de,
9041 struct ceph_statx *stx, off_t off, Inode *in)
9042 {
9043 struct getdents_result *c = static_cast<getdents_result *>(p);
9044
9045 int dlen;
9046 if (c->fullent)
9047 dlen = sizeof(*de);
9048 else
9049 dlen = strlen(de->d_name) + 1;
9050
9051 if (c->pos + dlen > c->buflen)
9052 return -1; // doesn't fit
9053
9054 if (c->fullent) {
9055 memcpy(c->buf + c->pos, de, sizeof(*de));
9056 } else {
9057 memcpy(c->buf + c->pos, de->d_name, dlen);
9058 }
9059 c->pos += dlen;
9060 return 0;
9061 }
9062
9063 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9064 {
9065 getdents_result gr;
9066 gr.buf = buf;
9067 gr.buflen = buflen;
9068 gr.fullent = fullent;
9069 gr.pos = 0;
9070
9071 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9072
9073 if (r < 0) { // some error
9074 if (r == -1) { // buffer ran out of space
9075 if (gr.pos) { // but we got some entries already!
9076 return gr.pos;
9077 } // or we need a larger buffer
9078 return -CEPHFS_ERANGE;
9079 } else { // actual error, return it
9080 return r;
9081 }
9082 }
9083 return gr.pos;
9084 }
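
// Note on the buffer semantics: the callback's -1 is not an error code,
// only a "buffer full" signal, so a partial byte count is returned when
// at least one entry was copied and -CEPHFS_ERANGE only when the very
// first entry did not fit. A sketch with full dirent records,
// assuming the public getdents() wrapper:
//
//   char buf[4096];
//   int n = client.getdents(dirp, buf, sizeof(buf));
//   for (int off = 0; n > 0 && off < n; off += sizeof(struct dirent)) {
//     struct dirent *de = (struct dirent *)(buf + off);
//     // ... use de->d_name ...
//   }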
9085
9086
9087 /* getdir */
9088 struct getdir_result {
9089 list<string> *contents;
9090 int num;
9091 };
9092
9093 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9094 {
9095 getdir_result *r = static_cast<getdir_result *>(p);
9096
9097 r->contents->push_back(de->d_name);
9098 r->num++;
9099 return 0;
9100 }
9101
9102 int Client::getdir(const char *relpath, list<string>& contents,
9103 const UserPerm& perms)
9104 {
9105 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9106 tout(cct) << "getdir" << std::endl;
9107 tout(cct) << relpath << std::endl;
9108
9109 dir_result_t *d;
9110 int r = opendir(relpath, &d, perms);
9111 if (r < 0)
9112 return r;
9113
9114 getdir_result gr;
9115 gr.contents = &contents;
9116 gr.num = 0;
9117 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9118
9119 closedir(d);
9120
9121 if (r < 0)
9122 return r;
9123 return gr.num;
9124 }
9125
9126
9127 /****** file i/o **********/
9128
9129 // common parts for open and openat. call with client_lock locked.
9130 int Client::create_and_open(std::optional<int> dirfd, const char *relpath, int flags,
9131 const UserPerm& perms, mode_t mode, int stripe_unit,
9132 int stripe_count, int object_size, const char *data_pool,
9133 std::string alternate_name) {
9134 ceph_assert(ceph_mutex_is_locked(client_lock));
9135 int cflags = ceph_flags_sys2wire(flags);
9136 tout(cct) << cflags << std::endl;
9137
9138 Fh *fh = NULL;
9139
9140 #if defined(__linux__) && defined(O_PATH)
9141 /* When O_PATH is specified, flags other than O_DIRECTORY
9142 * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
9143 * function in the kernel (fs/open.c). */
9144 if (flags & O_PATH)
9145 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9146 #endif
9147
9148 filepath path(relpath);
9149 InodeRef in;
9150 bool created = false;
9151 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9152 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9153 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9154
9155 InodeRef dirinode = nullptr;
9156 if (dirfd) {
9157 int r = get_fd_inode(*dirfd, &dirinode);
9158 if (r < 0) {
9159 return r;
9160 }
9161 }
9162
9163 int r = path_walk(path, &in, perms, followsym, mask, dirinode);
9164 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9165 return -CEPHFS_EEXIST;
9166
9167 #if defined(__linux__) && defined(O_PATH)
9168 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9169 #else
9170 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9171 #endif
9172 return -CEPHFS_ELOOP;
9173
9174 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9175 filepath dirpath = path;
9176 string dname = dirpath.last_dentry();
9177 dirpath.pop_dentry();
9178 InodeRef dir;
9179 r = path_walk(dirpath, &dir, perms, true,
9180 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9181 if (r < 0) {
9182 goto out;
9183 }
9184 if (cct->_conf->client_permissions) {
9185 r = may_create(dir.get(), perms);
9186 if (r < 0)
9187 goto out;
9188 }
9189 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9190 stripe_count, object_size, data_pool, &created, perms,
9191 std::move(alternate_name));
9192 }
9193 if (r < 0)
9194 goto out;
9195
9196 if (!created) {
9197 // posix says we can only check permissions of existing files
9198 if (cct->_conf->client_permissions) {
9199 r = may_open(in.get(), flags, perms);
9200 if (r < 0)
9201 goto out;
9202 }
9203 }
9204
9205 if (!fh)
9206 r = _open(in.get(), flags, mode, &fh, perms);
9207 if (r >= 0) {
9208 // allocate an integer file descriptor
9209 ceph_assert(fh);
9210 r = get_fd();
9211 ceph_assert(fd_map.count(r) == 0);
9212 fd_map[r] = fh;
9213 }
9214
9215 out:
9216 return r;
9217 }
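
// create_and_open() backs both open() and openat() below. A typical
// create-or-fail call through the public interface might look like:
//
//   int fd = client.open("dir/file", O_WRONLY | O_CREAT | O_EXCL, perms,
//                        0644);
//   if (fd >= 0) {
//     // ... write via client.write(fd, ...) ...
//     client.close(fd);
//   }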
9218
9219 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9220 mode_t mode, int stripe_unit, int stripe_count,
9221 int object_size, const char *data_pool, std::string alternate_name)
9222 {
9223 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9224 stripe_count, object_size, data_pool, alternate_name);
9225 }
9226
9227 int Client::_openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9228 mode_t mode, std::string alternate_name) {
9229 return create_and_open(dirfd, relpath, flags, perms, mode, 0, 0, 0, NULL, alternate_name);
9230 }
9231
9232 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9233 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9234 const char *data_pool, std::string alternate_name) {
9235 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9236 if (!mref_reader.is_state_satisfied()) {
9237 return -CEPHFS_ENOTCONN;
9238 }
9239
9240 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9241 tout(cct) << dirfd << std::endl;
9242 tout(cct) << relpath << std::endl;
9243 tout(cct) << flags << std::endl;
9244 tout(cct) << mode << std::endl;
9245
9246 std::scoped_lock locker(client_lock);
9247 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9248 object_size, data_pool, alternate_name);
9249
9250 tout(cct) << r << std::endl;
9251 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9252 return r;
9253 }
9254
9255 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9256 const UserPerm& perms)
9257 {
9258 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9259
9260 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9261 if (!mref_reader.is_state_satisfied())
9262 return -CEPHFS_ENOTCONN;
9263
9264 std::scoped_lock lock(client_lock);
9265 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9266 filepath path(ino);
9267 req->set_filepath(path);
9268
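// LOOKUPHASH addresses the target by its ino (filepath) plus the parent
// dir ino and the rjenkins hash of the dentry name (filepath2).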
9269 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9270 char f[30];
9271 sprintf(f, "%u", h);
9272 filepath path2(dirino);
9273 path2.push_dentry(string(f));
9274 req->set_filepath2(path2);
9275
9276 int r = make_request(req, perms, NULL, NULL,
9277 rand() % mdsmap->get_num_in_mds());
9278 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9279 return r;
9280 }
9281
9282
9283 /**
9284 * Load inode into local cache.
9285 *
9286 * If the inode pointer is non-NULL, also take a reference on
9287 * the resulting Inode object in one operation, so that the caller
9288 * can safely assume the inode will still be there after return.
9289 */
9290 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9291 {
9292 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9293
9294 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9295 if (!mref_reader.is_state_satisfied())
9296 return -CEPHFS_ENOTCONN;
9297
9298 if (is_reserved_vino(vino))
9299 return -CEPHFS_ESTALE;
9300
9301 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9302 filepath path(vino.ino);
9303 req->set_filepath(path);
9304
9305 /*
9306 * The MDS expects either a "real" snapid here or 0. The special value
9307 * carveouts for the snapid are all at the end of the range so we can
9308 * just look for any snapid below this value.
9309 */
9310 if (vino.snapid < CEPH_NOSNAP)
9311 req->head.args.lookupino.snapid = vino.snapid;
9312
9313 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9314 if (r == 0 && inode != NULL) {
9315 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9316 ceph_assert(p != inode_map.end());
9317 *inode = p->second;
9318 _ll_get(*inode);
9319 }
9320 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9321 return r;
9322 }
9323
9324 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9325 {
9326 vinodeno_t vino(ino, CEPH_NOSNAP);
9327 std::scoped_lock lock(client_lock);
9328 return _lookup_vino(vino, perms, inode);
9329 }
9330
9331 /**
9332 * Find the parent inode of `ino` and insert it into
9333 * our cache. Conditionally also set `parent` to a referenced
9334 * Inode* if caller provides non-NULL value.
9335 */
9336 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9337 {
9338 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9339
9340 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9341 filepath path(ino->ino);
9342 req->set_filepath(path);
9343
9344 InodeRef target;
9345 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9346 // Give caller a reference to the parent ino if they provided a pointer.
9347 if (parent != NULL) {
9348 if (r == 0) {
9349 *parent = target.get();
9350 _ll_get(*parent);
9351 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9352 } else {
9353 *parent = NULL;
9354 }
9355 }
9356 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9357 return r;
9358 }
9359
9360 /**
9361 * Populate the parent dentry for `ino`, provided it is
9362 * a child of `parent`.
9363 */
9364 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9365 {
9366 ceph_assert(parent->is_dir());
9367 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9368
9369 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9370 if (!mref_reader.is_state_satisfied())
9371 return -CEPHFS_ENOTCONN;
9372
9373 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9374 req->set_filepath2(filepath(parent->ino));
9375 req->set_filepath(filepath(ino->ino));
9376 req->set_inode(ino);
9377
9378 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9379 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9380 return r;
9381 }
9382
9383 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9384 {
9385 std::scoped_lock lock(client_lock);
9386 return _lookup_name(ino, parent, perms);
9387 }
9388
9389 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9390 {
9391 ceph_assert(in);
9392 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9393
9394 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9395
9396 if (in->snapid != CEPH_NOSNAP) {
9397 in->snap_cap_refs++;
9398 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9399 << ccap_string(in->caps_issued()) << dendl;
9400 }
9401
9402 const auto& conf = cct->_conf;
9403 f->readahead.set_trigger_requests(1);
9404 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9405 uint64_t max_readahead = Readahead::NO_LIMIT;
9406 if (conf->client_readahead_max_bytes) {
9407 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9408 }
9409 if (conf->client_readahead_max_periods) {
9410 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9411 }
9412 f->readahead.set_max_readahead_size(max_readahead);
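// Align readahead with the file layout: whole stripe periods and stripe
// units map cleanly onto the underlying RADOS objects.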
9413 vector<uint64_t> alignments;
9414 alignments.push_back(in->layout.get_period());
9415 alignments.push_back(in->layout.stripe_unit);
9416 f->readahead.set_alignments(alignments);
9417
9418 return f;
9419 }
9420
9421 int Client::_release_fh(Fh *f)
9422 {
9423 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9424 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9425 Inode *in = f->inode.get();
9426 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9427
9428 in->unset_deleg(f);
9429
9430 if (in->snapid == CEPH_NOSNAP) {
9431 if (in->put_open_ref(f->mode)) {
9432 _flush(in, new C_Client_FlushComplete(this, in));
9433 check_caps(in, 0);
9434 }
9435 } else {
9436 ceph_assert(in->snap_cap_refs > 0);
9437 in->snap_cap_refs--;
9438 }
9439
9440 _release_filelocks(f);
9441
9442 // Finally, read any async err (i.e. from flushes)
9443 int err = f->take_async_err();
9444 if (err != 0) {
9445 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9446 << cpp_strerror(err) << dendl;
9447 } else {
9448 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9449 }
9450
9451 _put_fh(f);
9452
9453 return err;
9454 }
9455
9456 void Client::_put_fh(Fh *f)
9457 {
9458 int left = f->put();
9459 if (!left) {
9460 delete f;
9461 }
9462 }
9463
9464 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9465 const UserPerm& perms)
9466 {
9467 if (in->snapid != CEPH_NOSNAP &&
9468 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9469 return -CEPHFS_EROFS;
9470 }
9471
9472 // use normalized flags to generate cmode
9473 int cflags = ceph_flags_sys2wire(flags);
9474 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9475 cflags |= CEPH_O_LAZY;
9476
9477 int cmode = ceph_flags_to_mode(cflags);
9478 int want = ceph_caps_for_mode(cmode);
9479 int result = 0;
9480
9481 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9482
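// Fast path: no truncate requested and we already hold the caps this open
// mode needs; just update wanted caps instead of sending an OPEN to the MDS.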
9483 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9484 // update wanted?
9485 check_caps(in, CHECK_CAPS_NODELAY);
9486 } else {
9487
9488 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9489 filepath path;
9490 in->make_nosnap_relative_path(path);
9491 req->set_filepath(path);
9492 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9493 req->head.args.open.mode = mode;
9494 req->head.args.open.pool = -1;
9495 if (cct->_conf->client_debug_getattr_caps)
9496 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9497 else
9498 req->head.args.open.mask = 0;
9499 req->head.args.open.old_size = in->size; // for O_TRUNC
9500 req->set_inode(in);
9501 result = make_request(req, perms);
9502
9503 /*
9504 * NFS expects that delegations will be broken on a conflicting open,
9505 * not just when there is actual conflicting access to the file. SMB leases
9506 * and oplocks also have similar semantics.
9507 *
9508 * Ensure that clients that have delegations enabled will wait on minimal
9509 * caps during open, just to ensure that other clients holding delegations
9510 * return theirs first.
9511 */
9512 if (deleg_timeout && result == 0) {
9513 int need = 0, have;
9514
9515 if (cmode & CEPH_FILE_MODE_WR)
9516 need |= CEPH_CAP_FILE_WR;
9517 if (cmode & CEPH_FILE_MODE_RD)
9518 need |= CEPH_CAP_FILE_RD;
9519
9520 Fh fh(in, flags, cmode, fd_gen, perms);
9521 result = get_caps(&fh, need, want, &have, -1);
9522 if (result < 0) {
9523 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9524 " . Denying open: " <<
9525 cpp_strerror(result) << dendl;
9526 } else {
9527 put_cap_ref(in, need);
9528 }
9529 }
9530 }
9531
9532 // success?
9533 if (result >= 0) {
9534 if (fhp)
9535 *fhp = _create_fh(in, flags, cmode, perms);
9536 } else {
9537 in->put_open_ref(cmode);
9538 }
9539
9540 trim_cache();
9541
9542 return result;
9543 }
9544
9545 int Client::_renew_caps(Inode *in)
9546 {
9547 int wanted = in->caps_file_wanted();
9548 if (in->is_any_caps() &&
9549 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9550 check_caps(in, CHECK_CAPS_NODELAY);
9551 return 0;
9552 }
9553
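// Translate the wanted file caps back into open flags and replay an
// OPEN request to the MDS to re-establish our capabilities.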
9554 int flags = 0;
9555 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9556 flags = O_RDWR;
9557 else if (wanted & CEPH_CAP_FILE_RD)
9558 flags = O_RDONLY;
9559 else if (wanted & CEPH_CAP_FILE_WR)
9560 flags = O_WRONLY;
9561
9562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9563 filepath path;
9564 in->make_nosnap_relative_path(path);
9565 req->set_filepath(path);
9566 req->head.args.open.flags = flags;
9567 req->head.args.open.pool = -1;
9568 if (cct->_conf->client_debug_getattr_caps)
9569 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9570 else
9571 req->head.args.open.mask = 0;
9572 req->set_inode(in);
9573
9574 // duplicate in case the Cap goes away; not sure if that race is a concern?
9575 const UserPerm *pperm = in->get_best_perms();
9576 UserPerm perms;
9577 if (pperm != NULL)
9578 perms = *pperm;
9579 int ret = make_request(req, perms);
9580 return ret;
9581 }
9582
9583 int Client::_close(int fd)
9584 {
9585 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9586 tout(cct) << "close" << std::endl;
9587 tout(cct) << fd << std::endl;
9588
9589 Fh *fh = get_filehandle(fd);
9590 if (!fh)
9591 return -CEPHFS_EBADF;
9592 int err = _release_fh(fh);
9593 fd_map.erase(fd);
9594 put_fd(fd);
9595 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9596 return err;
9597 }
9598
9599 int Client::close(int fd) {
9600 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9601 if (!mref_reader.is_state_satisfied())
9602 return -CEPHFS_ENOTCONN;
9603
9604 std::scoped_lock lock(client_lock);
9605 return _close(fd);
9606 }
9607
9608 // ------------
9609 // read, write
9610
9611 loff_t Client::lseek(int fd, loff_t offset, int whence)
9612 {
9613 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9614 if (!mref_reader.is_state_satisfied())
9615 return -CEPHFS_ENOTCONN;
9616
9617 tout(cct) << "lseek" << std::endl;
9618 tout(cct) << fd << std::endl;
9619 tout(cct) << offset << std::endl;
9620 tout(cct) << whence << std::endl;
9621
9622 std::scoped_lock lock(client_lock);
9623 Fh *f = get_filehandle(fd);
9624 if (!f)
9625 return -CEPHFS_EBADF;
9626 #if defined(__linux__) && defined(O_PATH)
9627 if (f->flags & O_PATH)
9628 return -CEPHFS_EBADF;
9629 #endif
9630 return _lseek(f, offset, whence);
9631 }
9632
9633 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9634 {
9635 Inode *in = f->inode.get();
9636 bool whence_check = false;
9637 loff_t pos = -1;
9638
9639 switch (whence) {
9640 case SEEK_END:
9641 whence_check = true;
9642 break;
9643
9644 #ifdef SEEK_DATA
9645 case SEEK_DATA:
9646 whence_check = true;
9647 break;
9648 #endif
9649
9650 #ifdef SEEK_HOLE
9651 case SEEK_HOLE:
9652 whence_check = true;
9653 break;
9654 #endif
9655 }
9656
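// SEEK_END, SEEK_DATA and SEEK_HOLE are all computed relative to the
// file size, so refresh it from the MDS before deriving the position.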
9657 if (whence_check) {
9658 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9659 if (r < 0)
9660 return r;
9661 }
9662
9663 switch (whence) {
9664 case SEEK_SET:
9665 pos = offset;
9666 break;
9667
9668 case SEEK_CUR:
9669 pos = f->pos + offset;
9670 break;
9671
9672 case SEEK_END:
9673 pos = in->size + offset;
9674 break;
9675
9676 #ifdef SEEK_DATA
9677 case SEEK_DATA:
9678 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9679 return -CEPHFS_ENXIO;
9680 pos = offset;
9681 break;
9682 #endif
9683
9684 #ifdef SEEK_HOLE
9685 case SEEK_HOLE:
9686 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9687 return -CEPHFS_ENXIO;
9688 pos = in->size;
9689 break;
9690 #endif
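// Note: SEEK_DATA/SEEK_HOLE are approximations here; without extent
// mapping, the whole file is treated as data with a single hole at EOF.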
9691
9692 default:
9693 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9694 return -CEPHFS_EINVAL;
9695 }
9696
9697 if (pos < 0) {
9698 return -CEPHFS_EINVAL;
9699 } else {
9700 f->pos = pos;
9701 }
9702
9703 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9704 return f->pos;
9705 }
9706
9707
9708 void Client::lock_fh_pos(Fh *f)
9709 {
9710 ldout(cct, 10) << __func__ << " " << f << dendl;
9711
9712 if (f->pos_locked || !f->pos_waiters.empty()) {
9713 ceph::condition_variable cond;
9714 f->pos_waiters.push_back(&cond);
9715 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9716 std::unique_lock l{client_lock, std::adopt_lock};
9717 cond.wait(l, [f, me=&cond] {
9718 return !f->pos_locked && f->pos_waiters.front() == me;
9719 });
9720 l.release();
9721 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9722 ceph_assert(f->pos_waiters.front() == &cond);
9723 f->pos_waiters.pop_front();
9724 }
9725
9726 f->pos_locked = true;
9727 }
9728
9729 void Client::unlock_fh_pos(Fh *f)
9730 {
9731 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9732
9733 ldout(cct, 10) << __func__ << " " << f << dendl;
9734 f->pos_locked = false;
9735 if (!f->pos_waiters.empty()) {
9736 // only wake up the oldest waiter
9737 auto cond = f->pos_waiters.front();
9738 cond->notify_one();
9739 }
9740 }
9741
9742 int Client::uninline_data(Inode *in, Context *onfinish)
9743 {
9744 if (!in->inline_data.length()) {
9745 onfinish->complete(0);
9746 return 0;
9747 }
9748
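// Inline data migrates into the file's first RADOS object, which is
// named <ino in hex>.00000000.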
9749 char oid_buf[32];
9750 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9751 object_t oid = oid_buf;
9752
9753 ObjectOperation create_ops;
9754 create_ops.create(false);
9755
9756 objecter->mutate(oid,
9757 OSDMap::file_to_object_locator(in->layout),
9758 create_ops,
9759 in->snaprealm->get_snap_context(),
9760 ceph::real_clock::now(),
9761 0,
9762 NULL);
9763
9764 bufferlist inline_version_bl;
9765 encode(in->inline_version, inline_version_bl);
9766
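// Guard the uninline write with a cmpxattr: it only applies if our
// inline_version is greater than the one recorded on the object, so
// racing uninline attempts are safe.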
9767 ObjectOperation uninline_ops;
9768 uninline_ops.cmpxattr("inline_version",
9769 CEPH_OSD_CMPXATTR_OP_GT,
9770 CEPH_OSD_CMPXATTR_MODE_U64,
9771 inline_version_bl);
9772 bufferlist inline_data = in->inline_data;
9773 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9774 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9775
9776 objecter->mutate(oid,
9777 OSDMap::file_to_object_locator(in->layout),
9778 uninline_ops,
9779 in->snaprealm->get_snap_context(),
9780 ceph::real_clock::now(),
9781 0,
9782 onfinish);
9783
9784 return 0;
9785 }
9786
9787 //
9788
9789 // blocking osd interface
9790
9791 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9792 {
9793 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9794 if (!mref_reader.is_state_satisfied())
9795 return -CEPHFS_ENOTCONN;
9796
9797 tout(cct) << "read" << std::endl;
9798 tout(cct) << fd << std::endl;
9799 tout(cct) << size << std::endl;
9800 tout(cct) << offset << std::endl;
9801
9802 std::unique_lock lock(client_lock);
9803 Fh *f = get_filehandle(fd);
9804 if (!f)
9805 return -CEPHFS_EBADF;
9806 #if defined(__linux__) && defined(O_PATH)
9807 if (f->flags & O_PATH)
9808 return -CEPHFS_EBADF;
9809 #endif
9810 bufferlist bl;
9811 /* We can't return more bytes read than INT_MAX, so clamp size to that */
9812 size = std::min(size, (loff_t)INT_MAX);
9813 int r = _read(f, offset, size, &bl);
9814 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9815 if (r >= 0) {
9816 lock.unlock();
9817 bl.begin().copy(bl.length(), buf);
9818 r = bl.length();
9819 }
9820 return r;
9821 }
9822
9823 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9824 {
9825 if (iovcnt < 0)
9826 return -CEPHFS_EINVAL;
9827 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9828 }
9829
9830 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9831 {
9832 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9833
9834 int want, have = 0;
9835 bool movepos = false;
9836 std::unique_ptr<C_SaferCond> onuninline;
9837 int64_t rc = 0;
9838 const auto& conf = cct->_conf;
9839 Inode *in = f->inode.get();
9840 utime_t lat;
9841 utime_t start = ceph_clock_now();
9842
9843 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9844 return -CEPHFS_EBADF;
9845 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9846
9847 if (offset < 0) {
9848 lock_fh_pos(f);
9849 offset = f->pos;
9850 movepos = true;
9851 }
9852 loff_t start_pos = offset;
9853
9854 if (in->inline_version == 0) {
9855 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9856 if (r < 0) {
9857 rc = r;
9858 goto done;
9859 }
9860 ceph_assert(in->inline_version > 0);
9861 }
9862
9863 retry:
9864 if (f->mode & CEPH_FILE_MODE_LAZY)
9865 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9866 else
9867 want = CEPH_CAP_FILE_CACHE;
9868 {
9869 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
9870 if (r < 0) {
9871 rc = r;
9872 goto done;
9873 }
9874 }
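// O_DIRECT must bypass the object cacher: drop the cache-related caps
// from 'have' so the sync read path is taken below.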
9875 if (f->flags & O_DIRECT)
9876 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9877
9878 if (in->inline_version < CEPH_INLINE_NONE) {
9879 if (!(have & CEPH_CAP_FILE_CACHE)) {
9880 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9881 uninline_data(in, onuninline.get());
9882 } else {
9883 uint32_t len = in->inline_data.length();
9884 uint64_t endoff = offset + size;
9885 if (endoff > in->size)
9886 endoff = in->size;
9887
9888 if (offset < len) {
9889 if (endoff <= len) {
9890 bl->substr_of(in->inline_data, offset, endoff - offset);
9891 } else {
9892 bl->substr_of(in->inline_data, offset, len - offset);
9893 bl->append_zero(endoff - len);
9894 }
9895 rc = endoff - offset;
9896 } else if ((uint64_t)offset < endoff) {
9897 bl->append_zero(endoff - offset);
9898 rc = endoff - offset;
9899 } else {
9900 rc = 0;
9901 }
9902 goto success;
9903 }
9904 }
9905
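// With FILE_CACHE or LAZYIO caps (and caching enabled), read through
// the object cacher; otherwise read synchronously from the OSDs.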
9906 if (!conf->client_debug_force_sync_read &&
9907 conf->client_oc &&
9908 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9909
9910 if (f->flags & O_RSYNC) {
9911 _flush_range(in, offset, size);
9912 }
9913 rc = _read_async(f, offset, size, bl);
9914 if (rc < 0)
9915 goto done;
9916 } else {
9917 if (f->flags & O_DIRECT)
9918 _flush_range(in, offset, size);
9919
9920 bool checkeof = false;
9921 rc = _read_sync(f, offset, size, bl, &checkeof);
9922 if (rc < 0)
9923 goto done;
9924 if (checkeof) {
9925 offset += rc;
9926 size -= rc;
9927
9928 put_cap_ref(in, CEPH_CAP_FILE_RD);
9929 have = 0;
9930 // reverify size
9931 {
9932 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9933 if (r < 0) {
9934 rc = r;
9935 goto done;
9936 }
9937 }
9938
9939 // eof? short read.
9940 if ((uint64_t)offset < in->size)
9941 goto retry;
9942 }
9943 }
9944
9945 success:
9946 ceph_assert(rc >= 0);
9947 if (movepos) {
9948 // adjust fd pos
9949 f->pos = start_pos + rc;
9950 }
9951
9952 lat = ceph_clock_now();
9953 lat -= start;
9954 logger->tinc(l_c_read, lat);
9955
9956 done:
9957 // done!
9958
9959 if (onuninline) {
9960 client_lock.unlock();
9961 int ret = onuninline->wait();
9962 client_lock.lock();
9963 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
9964 in->inline_data.clear();
9965 in->inline_version = CEPH_INLINE_NONE;
9966 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9967 check_caps(in, 0);
9968 } else
9969 rc = ret;
9970 }
9971 if (have) {
9972 put_cap_ref(in, CEPH_CAP_FILE_RD);
9973 }
9974 if (movepos) {
9975 unlock_fh_pos(f);
9976 }
9977 return rc;
9978 }
9979
9980 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9981 client(c), f(f) {
9982 f->get();
9983 f->readahead.inc_pending();
9984 }
9985
9986 Client::C_Readahead::~C_Readahead() {
9987 f->readahead.dec_pending();
9988 client->_put_fh(f);
9989 }
9990
9991 void Client::C_Readahead::finish(int r) {
9992 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9993 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9994 }
9995
9996 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9997 {
9998 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9999
10000 const auto& conf = cct->_conf;
10001 Inode *in = f->inode.get();
10002
10003 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10004
10005 // trim read based on file size?
10006 if (off >= in->size)
10007 return 0;
10008 if (len == 0)
10009 return 0;
10010 if (off + len > in->size) {
10011 len = in->size - off;
10012 }
10013
10014 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
10015 << " max_bytes=" << f->readahead.get_max_readahead_size()
10016 << " max_periods=" << conf->client_readahead_max_periods << dendl;
10017
10018 // read (and possibly block)
10019 int r = 0;
10020 C_SaferCond onfinish("Client::_read_async flock");
10021 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10022 off, len, bl, 0, &onfinish);
10023 if (r == 0) {
10024 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
10025 client_lock.unlock();
10026 r = onfinish.wait();
10027 client_lock.lock();
10028 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
10029 }
10030
10031 if(f->readahead.get_min_readahead_size() > 0) {
10032 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
10033 if (readahead_extent.second > 0) {
10034 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
10035 << " (caller wants " << off << "~" << len << ")" << dendl;
10036 Context *onfinish2 = new C_Readahead(this, f);
10037 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10038 readahead_extent.first, readahead_extent.second,
10039 NULL, 0, onfinish2);
10040 if (r2 == 0) {
10041 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
10042 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10043 } else {
10044 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
10045 delete onfinish2;
10046 }
10047 }
10048 }
10049
10050 return r;
10051 }
10052
10053 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
10054 bool *checkeof)
10055 {
10056 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10057
10058 Inode *in = f->inode.get();
10059 uint64_t pos = off;
10060 int left = len;
10061 int read = 0;
10062
10063 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10064
10065 // Returns 0 on success (read complete), 1 to continue the loop, and < 0 on error.
10066 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
10067 int r = onfinish.wait();
10068
10069 // if we get ENOENT from OSD, assume 0 bytes returned
10070 if (r == -CEPHFS_ENOENT)
10071 r = 0;
10072 if (r < 0)
10073 return r;
10074
10075 if (tbl.length()) {
10076 r = tbl.length();
10077
10078 read += r;
10079 pos += r;
10080 left -= r;
10081 bl->claim_append(tbl);
10082 }
10083 // short read?
10084 if (r >= 0 && r < wanted) {
10085 if (pos < in->size) {
10086 // zero up to known EOF
10087 int64_t some = in->size - pos;
10088 if (some > left)
10089 some = left;
10090 auto z = buffer::ptr_node::create(some);
10091 z->zero();
10092 bl->push_back(std::move(z));
10093 read += some;
10094 pos += some;
10095 left -= some;
10096 if (left == 0)
10097 return 0;
10098 }
10099
10100 *checkeof = true;
10101 return 0;
10102 }
10103 return 1;
10104 };
10105
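// Read chunk by chunk, dropping client_lock while waiting on each OSD
// completion so other client work can make progress.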
10106 while (left > 0) {
10107 C_SaferCond onfinish("Client::_read_sync flock");
10108 bufferlist tbl;
10109
10110 int wanted = left;
10111 filer->read_trunc(in->ino, &in->layout, in->snapid,
10112 pos, left, &tbl, 0,
10113 in->truncate_size, in->truncate_seq,
10114 &onfinish);
10115 client_lock.unlock();
10116 int r = wait_and_copy(onfinish, tbl, wanted);
10117 client_lock.lock();
10118 if (!r)
10119 return read;
10120 if (r < 0)
10121 return r;
10122 }
10123 return read;
10124 }
10125
10126 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10127 {
10128 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10129 if (!mref_reader.is_state_satisfied())
10130 return -CEPHFS_ENOTCONN;
10131
10132 tout(cct) << "write" << std::endl;
10133 tout(cct) << fd << std::endl;
10134 tout(cct) << size << std::endl;
10135 tout(cct) << offset << std::endl;
10136
10137 std::scoped_lock lock(client_lock);
10138 Fh *fh = get_filehandle(fd);
10139 if (!fh)
10140 return -CEPHFS_EBADF;
10141 #if defined(__linux__) && defined(O_PATH)
10142 if (fh->flags & O_PATH)
10143 return -CEPHFS_EBADF;
10144 #endif
10145 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10146 size = std::min(size, (loff_t)INT_MAX);
10147 int r = _write(fh, offset, size, buf, NULL, false);
10148 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10149 return r;
10150 }
10151
10152 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10153 {
10154 if (iovcnt < 0)
10155 return -CEPHFS_EINVAL;
10156 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10157 }
10158
10159 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10160 unsigned iovcnt, int64_t offset, bool write,
10161 bool clamp_to_int, std::unique_lock<ceph::mutex> &cl)
10162 {
10163 #if defined(__linux__) && defined(O_PATH)
10164 if (fh->flags & O_PATH)
10165 return -CEPHFS_EBADF;
10166 #endif
10167 loff_t totallen = 0;
10168 for (unsigned i = 0; i < iovcnt; i++) {
10169 totallen += iov[i].iov_len;
10170 }
10171
10172 /*
10173 * Some of the API functions take 64-bit size values, but only return
10174 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10175 * we don't do I/Os larger than the values we can return.
10176 */
10177 if (clamp_to_int) {
10178 totallen = std::min(totallen, (loff_t)INT_MAX);
10179 }
10180 if (write) {
10181 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10182 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10183 return w;
10184 } else {
10185 bufferlist bl;
10186 int64_t r = _read(fh, offset, totallen, &bl);
10187 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10188 if (r <= 0)
10189 return r;
10190
10191 cl.unlock();
10192 auto iter = bl.cbegin();
10193 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10194 /*
10195 * This piece of code aims to handle the case that bufferlist
10196 * does not have enough data to fill in the iov
10197 */
10198 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10199 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10200 resid -= round_size;
10201 /* iter is self-updating */
10202 }
10203 cl.lock();
10204 return r;
10205 }
10206 }
10207
10208 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10209 {
10210 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10211 if (!mref_reader.is_state_satisfied())
10212 return -CEPHFS_ENOTCONN;
10213
10214 tout(cct) << fd << std::endl;
10215 tout(cct) << offset << std::endl;
10216
10217 std::unique_lock cl(client_lock);
10218 Fh *fh = get_filehandle(fd);
10219 if (!fh)
10220 return -CEPHFS_EBADF;
10221 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true, cl);
10222 }
10223
10224 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10225 const struct iovec *iov, int iovcnt)
10226 {
10227 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10228
10229 uint64_t fpos = 0;
10230
10231 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
10232 return -CEPHFS_EFBIG;
10233
10234 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10235 Inode *in = f->inode.get();
10236
10237 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10238 return -CEPHFS_ENOSPC;
10239 }
10240
10241 ceph_assert(in->snapid == CEPH_NOSNAP);
10242
10243 // was Fh opened as writeable?
10244 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10245 return -CEPHFS_EBADF;
10246
10247 // use/adjust fd pos?
10248 if (offset < 0) {
10249 lock_fh_pos(f);
10250 /*
10251 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10252 * change out from under us.
10253 */
10254 if (f->flags & O_APPEND) {
10255 auto r = _lseek(f, 0, SEEK_END);
10256 if (r < 0) {
10257 unlock_fh_pos(f);
10258 return r;
10259 }
10260 }
10261 offset = f->pos;
10262 fpos = offset+size;
10263 unlock_fh_pos(f);
10264 }
10265
10266 // check quota
10267 uint64_t endoff = offset + size;
10268 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10269 f->actor_perms)) {
10270 return -CEPHFS_EDQUOT;
10271 }
10272
10273 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10274
10275 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10276
10277 // time it.
10278 utime_t start = ceph_clock_now();
10279
10280 if (in->inline_version == 0) {
10281 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10282 if (r < 0)
10283 return r;
10284 ceph_assert(in->inline_version > 0);
10285 }
10286
10287 // copy into a fresh buffer (since our write may be resubmitted or completed asynchronously)
10288 bufferlist bl;
10289 if (buf) {
10290 if (size > 0)
10291 bl.append(buf, size);
10292 } else if (iov){
10293 for (int i = 0; i < iovcnt; i++) {
10294 if (iov[i].iov_len > 0) {
10295 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10296 }
10297 }
10298 }
10299
10300 utime_t lat;
10301 uint64_t totalwritten;
10302 int want, have;
10303 if (f->mode & CEPH_FILE_MODE_LAZY)
10304 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10305 else
10306 want = CEPH_CAP_FILE_BUFFER;
10307 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10308 if (r < 0)
10309 return r;
10310
10311 /* clear the setuid/setgid bits, if any */
10312 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10313 struct ceph_statx stx = { 0 };
10314
10315 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10316 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10317 if (r < 0)
10318 return r;
10319 } else {
10320 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10321 }
10322
10323 if (f->flags & O_DIRECT)
10324 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10325
10326 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10327
10328 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10329
10330 if (in->inline_version < CEPH_INLINE_NONE) {
10331 if (endoff > cct->_conf->client_max_inline_size ||
10332 endoff > CEPH_INLINE_MAX_SIZE ||
10333 !(have & CEPH_CAP_FILE_BUFFER)) {
10334 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10335 uninline_data(in, onuninline.get());
10336 } else {
10337 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10338
10339 uint32_t len = in->inline_data.length();
10340
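// Rebuild the inline blob: stash the old tail beyond endoff into bl,
// truncate at offset (or zero-fill a gap past the old EOF), then append
// the new data followed by the stashed tail.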
10341 if (endoff < len)
10342 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10343
10344 if (offset < len)
10345 in->inline_data.splice(offset, len - offset);
10346 else if (offset > len)
10347 in->inline_data.append_zero(offset - len);
10348
10349 in->inline_data.append(bl);
10350 in->inline_version++;
10351
10352 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10353
10354 goto success;
10355 }
10356 }
10357
10358 if (cct->_conf->client_oc &&
10359 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10360 // do buffered write
10361 if (!in->oset.dirty_or_tx)
10362 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10363
10364 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10365
10366 // async, caching, non-blocking.
10367 r = objectcacher->file_write(&in->oset, &in->layout,
10368 in->snaprealm->get_snap_context(),
10369 offset, size, bl, ceph::real_clock::now(),
10370 0);
10371 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10372
10373 if (r < 0)
10374 goto done;
10375
10376 // flush cached write if O_SYNC is set on file fh
10377 // O_DSYNC == O_SYNC on linux < 2.6.33
10378 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10379 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10380 _flush_range(in, offset, size);
10381 }
10382 } else {
10383 if (f->flags & O_DIRECT)
10384 _flush_range(in, offset, size);
10385
10386 // simple, non-atomic sync write
10387 C_SaferCond onfinish("Client::_write flock");
10388 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10389
10390 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10391 offset, size, bl, ceph::real_clock::now(), 0,
10392 in->truncate_size, in->truncate_seq,
10393 &onfinish);
10394 client_lock.unlock();
10395 r = onfinish.wait();
10396 client_lock.lock();
10397 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10398 if (r < 0)
10399 goto done;
10400 }
10401
10402 // if we get here, write was successful, update client metadata
10403 success:
10404 // time
10405 lat = ceph_clock_now();
10406 lat -= start;
10407 logger->tinc(l_c_wrlat, lat);
10408
10409 if (fpos) {
10410 lock_fh_pos(f);
10411 f->pos = fpos;
10412 unlock_fh_pos(f);
10413 }
10414 totalwritten = size;
10415 r = (int64_t)totalwritten;
10416
10417 // extend file?
10418 if (totalwritten + offset > in->size) {
10419 in->size = totalwritten + offset;
10420 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10421
10422 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10423 check_caps(in, CHECK_CAPS_NODELAY);
10424 } else if (is_max_size_approaching(in)) {
10425 check_caps(in, 0);
10426 }
10427
10428 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10429 } else {
10430 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10431 }
10432
10433 // mtime
10434 in->mtime = in->ctime = ceph_clock_now();
10435 in->change_attr++;
10436 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10437
10438 done:
10439
10440 if (nullptr != onuninline) {
10441 client_lock.unlock();
10442 int uninline_ret = onuninline->wait();
10443 client_lock.lock();
10444
10445 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10446 in->inline_data.clear();
10447 in->inline_version = CEPH_INLINE_NONE;
10448 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10449 check_caps(in, 0);
10450 } else
10451 r = uninline_ret;
10452 }
10453
10454 put_cap_ref(in, CEPH_CAP_FILE_WR);
10455 return r;
10456 }
10457
10458 int Client::_flush(Fh *f)
10459 {
10460 Inode *in = f->inode.get();
10461 int err = f->take_async_err();
10462 if (err != 0) {
10463 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10464 << cpp_strerror(err) << dendl;
10465 } else {
10466 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10467 }
10468
10469 return err;
10470 }
10471
10472 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10473 {
10474 struct ceph_statx stx;
10475 stx.stx_size = length;
10476 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10477 }
10478
10479 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10480 {
10481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10482 if (!mref_reader.is_state_satisfied())
10483 return -CEPHFS_ENOTCONN;
10484
10485 tout(cct) << __func__ << std::endl;
10486 tout(cct) << fd << std::endl;
10487 tout(cct) << length << std::endl;
10488
10489 std::scoped_lock lock(client_lock);
10490 Fh *f = get_filehandle(fd);
10491 if (!f)
10492 return -CEPHFS_EBADF;
10493 #if defined(__linux__) && defined(O_PATH)
10494 if (f->flags & O_PATH)
10495 return -CEPHFS_EBADF;
10496 #endif
10497 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10498 return -CEPHFS_EBADF;
10499 struct stat attr;
10500 attr.st_size = length;
10501 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10502 }
10503
10504 int Client::fsync(int fd, bool syncdataonly)
10505 {
10506 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10507 if (!mref_reader.is_state_satisfied())
10508 return -CEPHFS_ENOTCONN;
10509
10510 tout(cct) << "fsync" << std::endl;
10511 tout(cct) << fd << std::endl;
10512 tout(cct) << syncdataonly << std::endl;
10513
10514 std::scoped_lock lock(client_lock);
10515 Fh *f = get_filehandle(fd);
10516 if (!f)
10517 return -CEPHFS_EBADF;
10518 #if defined(__linux__) && defined(O_PATH)
10519 if (f->flags & O_PATH)
10520 return -CEPHFS_EBADF;
10521 #endif
10522 int r = _fsync(f, syncdataonly);
10523 if (r == 0) {
10524 // The IOs in this fsync were okay, but maybe something happened
10525 // in the background that we should be reporting?
10526 r = f->take_async_err();
10527 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10528 << ") = 0, async_err = " << r << dendl;
10529 } else {
10530 // Assume that an error we encountered during fsync, even reported
10531 // synchronously, would also have applied the error to the Fh, and we
10532 // should clear it here to avoid returning the same error again on next
10533 // call.
10534 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10535 << r << dendl;
10536 f->take_async_err();
10537 }
10538 return r;
10539 }
10540
10541 int Client::_fsync(Inode *in, bool syncdataonly)
10542 {
10543 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10544
10545 int r = 0;
10546 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10547 ceph_tid_t flush_tid = 0;
10548 InodeRef tmp_ref;
10549 utime_t lat;
10550 utime_t start = ceph_clock_now();
10551
10552 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10553
10554 if (cct->_conf->client_oc) {
10555 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10556 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10557 _flush(in, object_cacher_completion.get());
10558 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10559 }
10560
10561 if (!syncdataonly && in->dirty_caps) {
10562 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10563 if (in->flushing_caps)
10564 flush_tid = last_flush_tid;
10565 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10566
10567 if (!syncdataonly && !in->unsafe_ops.empty()) {
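// Unsafe requests have been applied but not yet journaled as safe;
// flushing the MDS log hurries their commit along.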
10568 flush_mdlog_sync();
10569
10570 MetaRequest *req = in->unsafe_ops.back();
10571 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10572
10573 req->get();
10574 wait_on_list(req->waitfor_safe);
10575 put_request(req);
10576 }
10577
10578 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10579 client_lock.unlock();
10580 ldout(cct, 15) << "waiting on data to flush" << dendl;
10581 r = object_cacher_completion->wait();
10582 client_lock.lock();
10583 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10584 } else {
10585 // FIXME: this can starve
10586 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10587 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10588 << " uncommitted, waiting" << dendl;
10589 wait_on_list(in->waitfor_commit);
10590 }
10591 }
10592
10593 if (!r) {
10594 if (flush_tid > 0)
10595 wait_sync_caps(in, flush_tid);
10596
10597 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10598 } else {
10599 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10600 << cpp_strerror(-r) << dendl;
10601 }
10602
10603 lat = ceph_clock_now();
10604 lat -= start;
10605 logger->tinc(l_c_fsync, lat);
10606
10607 return r;
10608 }
10609
10610 int Client::_fsync(Fh *f, bool syncdataonly)
10611 {
10612 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10613 return _fsync(f->inode.get(), syncdataonly);
10614 }
10615
10616 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10617 {
10618 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10619 if (!mref_reader.is_state_satisfied())
10620 return -CEPHFS_ENOTCONN;
10621
10622 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10623 tout(cct) << fd << std::endl;
10624
10625 std::scoped_lock lock(client_lock);
10626 Fh *f = get_filehandle(fd);
10627 if (!f)
10628 return -CEPHFS_EBADF;
10629 int r = _getattr(f->inode, mask, perms);
10630 if (r < 0)
10631 return r;
10632 fill_stat(f->inode, stbuf, NULL);
10633 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10634 return r;
10635 }
10636
10637 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10638 unsigned int want, unsigned int flags)
10639 {
10640 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10641 if (!mref_reader.is_state_satisfied())
10642 return -CEPHFS_ENOTCONN;
10643
10644 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10645 tout(cct) << fd << std::endl;
10646
10647 std::scoped_lock lock(client_lock);
10648 Fh *f = get_filehandle(fd);
10649 if (!f)
10650 return -CEPHFS_EBADF;
10651
10652 unsigned mask = statx_to_mask(flags, want);
10653
10654 int r = 0;
10655 if (mask) {
10656 r = _getattr(f->inode, mask, perms);
10657 if (r < 0) {
10658 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10659 return r;
10660 }
10661 }
10662
10663 fill_statx(f->inode, mask, stx);
10664 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10665 return r;
10666 }
10667
10668 int Client::statxat(int dirfd, const char *relpath,
10669 struct ceph_statx *stx, const UserPerm& perms,
10670 unsigned int want, unsigned int flags) {
10671 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10672 if (!mref_reader.is_state_satisfied()) {
10673 return -CEPHFS_ENOTCONN;
10674 }
10675
10676 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10677 tout(cct) << dirfd << std::endl;
10678 tout(cct) << relpath << std::endl;
10679
10680 unsigned mask = statx_to_mask(flags, want);
10681
10682 InodeRef dirinode;
10683 std::scoped_lock lock(client_lock);
10684 int r = get_fd_inode(dirfd, &dirinode);
10685 if (r < 0) {
10686 return r;
10687 }
10688
10689 InodeRef in;
10690 filepath path(relpath);
10691 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10692 if (r < 0) {
10693 return r;
10694 }
10695 r = _getattr(in, mask, perms);
10696 if (r < 0) {
10697 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10698 return r;
10699 }
10700
10701 fill_statx(in, mask, stx);
10702 ldout(cct, 3) << __func__ << " dirfd " << dirfd << ", r = " << r << dendl;
10703 return r;
10704 }
10705
10706 // not written yet, but I want to link!
10707
10708 int Client::chdir(const char *relpath, std::string &new_cwd,
10709 const UserPerm& perms)
10710 {
10711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10712 if (!mref_reader.is_state_satisfied())
10713 return -CEPHFS_ENOTCONN;
10714
10715 tout(cct) << "chdir" << std::endl;
10716 tout(cct) << relpath << std::endl;
10717
10718 filepath path(relpath);
10719 InodeRef in;
10720
10721 std::scoped_lock lock(client_lock);
10722 int r = path_walk(path, &in, perms);
10723 if (r < 0)
10724 return r;
10725
10726 if (!(in.get()->is_dir()))
10727 return -CEPHFS_ENOTDIR;
10728
10729 if (cwd != in)
10730 cwd.swap(in);
10731 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10732
10733 _getcwd(new_cwd, perms);
10734 return 0;
10735 }
10736
10737 void Client::_getcwd(string& dir, const UserPerm& perms)
10738 {
10739 filepath path;
10740 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10741
10742 Inode *in = cwd.get();
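// Walk up from cwd toward the root, prepending dentry names; if a
// parent link is missing from the cache, ask the MDS (LOOKUPNAME) and
// start over.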
10743 while (in != root.get()) {
10744 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10745
10746 // The cwd or an ancestor has been unlinked
10747 if (in->dentries.empty()) {
10748 return;
10749 }
10750
10751 Dentry *dn = in->get_first_parent();
10752
10753
10754 if (!dn) {
10755 // look it up
10756 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10757 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10758 filepath path(in->ino);
10759 req->set_filepath(path);
10760 req->set_inode(in);
10761 int res = make_request(req, perms);
10762 if (res < 0)
10763 break;
10764
10765 // start over
10766 path = filepath();
10767 in = cwd.get();
10768 continue;
10769 }
10770 path.push_front_dentry(dn->name);
10771 in = dn->dir->parent_inode;
10772 }
10773 dir = "/";
10774 dir += path.get_path();
10775 }
10776
10777 void Client::getcwd(string& dir, const UserPerm& perms)
10778 {
10779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10780 if (!mref_reader.is_state_satisfied())
10781 return;
10782
10783 std::scoped_lock l(client_lock);
10784
10785 _getcwd(dir, perms);
10786 }
10787
10788 int Client::statfs(const char *path, struct statvfs *stbuf,
10789 const UserPerm& perms)
10790 {
10791 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10792 if (!mref_reader.is_state_satisfied())
10793 return -CEPHFS_ENOTCONN;
10794
10795 tout(cct) << __func__ << std::endl;
10796 unsigned long int total_files_on_fs;
10797
10798 ceph_statfs stats;
10799 C_SaferCond cond;
10800
10801 std::unique_lock lock(client_lock);
10802 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10803 if (data_pools.size() == 1) {
10804 objecter->get_fs_stats(stats, data_pools[0], &cond);
10805 } else {
10806 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10807 }
10808
10809 lock.unlock();
10810 int rval = cond.wait();
10811 lock.lock();
10812
10813 ceph_assert(root);
10814 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10815
10816 if (rval < 0) {
10817 ldout(cct, 1) << "underlying call to statfs returned error: "
10818 << cpp_strerror(rval)
10819 << dendl;
10820 return rval;
10821 }
10822
10823 memset(stbuf, 0, sizeof(*stbuf));
10824
10825 /*
10826 * we're going to set a block size of 4MB so we can represent larger
10827 * FSes without overflowing. Additionally convert the space
10828 * measurements from KB to bytes while making them in terms of
10829 * blocks. We use 4MB only because it is big enough, and because it
10830 * actually *is* the (ceph) default block size.
10831 */
10832 const int CEPH_BLOCK_SHIFT = 22;
10833 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10834 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10835 stbuf->f_files = total_files_on_fs;
10836 stbuf->f_ffree = -1;
10837 stbuf->f_favail = -1;
10838 stbuf->f_fsid = -1; // ??
10839 stbuf->f_flag = 0; // ??
10840 stbuf->f_namemax = NAME_MAX;
10841
10842 // Usually quota_root will == root_ancestor, but if the mount root has no
10843 // quota but we can see a parent of it that does have a quota, we'll
10844 // respect that one instead.
10845 ceph_assert(root != nullptr);
10846 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
10847
10848 // get_quota_root should always give us something
10849 // because client quotas are always enabled
10850 ceph_assert(quota_root != nullptr);
10851
10852 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10853
10854 // Skip the getattr if any sessions are stale, as we don't want to
10855 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10856 // is unhealthy.
10857 if (!_any_stale_sessions()) {
10858 int r = _getattr(quota_root, 0, perms, true);
10859 if (r != 0) {
10860 // Ignore return value: error getting latest inode metadata is not a good
10861 // reason to break "df".
10862 lderr(cct) << "Error in getattr on quota root 0x"
10863 << std::hex << quota_root->ino << std::dec
10864 << " statfs result may be outdated" << dendl;
10865 }
10866 }
10867
10868 // Special case: if there is a size quota set on the Inode acting
10869 // as the root for this client mount, then report the quota status
10870 // as the filesystem statistics.
10871 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10872 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10873 // It is possible for a quota to be exceeded: arithmetic here must
10874 // handle case where used > total.
10875 const fsblkcnt_t free = total > used ? total - used : 0;
10876
10877 stbuf->f_blocks = total;
10878 stbuf->f_bfree = free;
10879 stbuf->f_bavail = free;
10880 } else {
10881 // General case: report the cluster statistics returned from RADOS. Because
10882 // multiple pools may be used within one filesystem namespace via
10883 // layouts, this is the most correct thing we can do.
10884 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10885 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10886 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10887 }
10888
10889 return rval;
10890 }
10891
10892 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10893 struct flock *fl, uint64_t owner, bool removing)
10894 {
10895 ldout(cct, 10) << __func__ << " ino " << in->ino
10896 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10897 << " type " << fl->l_type << " owner " << owner
10898 << " " << fl->l_start << "~" << fl->l_len << dendl;
10899
10900 if (in->flags & I_ERROR_FILELOCK)
10901 return -CEPHFS_EIO;
10902
10903 int lock_cmd;
10904 if (F_RDLCK == fl->l_type)
10905 lock_cmd = CEPH_LOCK_SHARED;
10906 else if (F_WRLCK == fl->l_type)
10907 lock_cmd = CEPH_LOCK_EXCL;
10908 else if (F_UNLCK == fl->l_type)
10909 lock_cmd = CEPH_LOCK_UNLOCK;
10910 else
10911 return -CEPHFS_EIO;
10912
10913 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10914 sleep = 0;
10915
10916 /*
10917 * Set the most significant bit so that the MDS knows 'owner' alone
10918 * is sufficient to identify the owner of the lock. (Old code used
10919 * both 'owner' and 'pid'.)
10920 */
10921 owner |= (1ULL << 63);
10922
10923 MetaRequest *req = new MetaRequest(op);
10924 filepath path;
10925 in->make_nosnap_relative_path(path);
10926 req->set_filepath(path);
10927 req->set_inode(in);
10928
10929 req->head.args.filelock_change.rule = lock_type;
10930 req->head.args.filelock_change.type = lock_cmd;
10931 req->head.args.filelock_change.owner = owner;
10932 req->head.args.filelock_change.pid = fl->l_pid;
10933 req->head.args.filelock_change.start = fl->l_start;
10934 req->head.args.filelock_change.length = fl->l_len;
10935 req->head.args.filelock_change.wait = sleep;
10936
10937 int ret;
10938 bufferlist bl;
10939
10940 if (sleep && switch_interrupt_cb) {
10941 // enable interrupt
10942 switch_interrupt_cb(callback_handle, req->get());
10943 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10944 // disable interrupt
10945 switch_interrupt_cb(callback_handle, NULL);
10946 if (ret == 0 && req->aborted()) {
10947 // effect of this lock request has been revoked by the 'lock intr' request
10948 ret = req->get_abort_code();
10949 }
10950 put_request(req);
10951 } else {
10952 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10953 }
10954
10955 if (ret == 0) {
10956 if (op == CEPH_MDS_OP_GETFILELOCK) {
10957 ceph_filelock filelock;
10958 auto p = bl.cbegin();
10959 decode(filelock, p);
10960
10961 if (CEPH_LOCK_SHARED == filelock.type)
10962 fl->l_type = F_RDLCK;
10963 else if (CEPH_LOCK_EXCL == filelock.type)
10964 fl->l_type = F_WRLCK;
10965 else
10966 fl->l_type = F_UNLCK;
10967
10968 fl->l_whence = SEEK_SET;
10969 fl->l_start = filelock.start;
10970 fl->l_len = filelock.length;
10971 fl->l_pid = filelock.pid;
10972 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10973 ceph_lock_state_t *lock_state;
10974 if (lock_type == CEPH_LOCK_FCNTL) {
10975 if (!in->fcntl_locks)
10976 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10977 lock_state = in->fcntl_locks.get();
10978 } else if (lock_type == CEPH_LOCK_FLOCK) {
10979 if (!in->flock_locks)
10980 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10981 lock_state = in->flock_locks.get();
10982 } else {
10983 ceph_abort();
10984 return -CEPHFS_EINVAL;
10985 }
10986 _update_lock_state(fl, owner, lock_state);
10987
10988 if (!removing) {
10989 if (lock_type == CEPH_LOCK_FCNTL) {
10990 if (!fh->fcntl_locks)
10991 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10992 lock_state = fh->fcntl_locks.get();
10993 } else {
10994 if (!fh->flock_locks)
10995 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10996 lock_state = fh->flock_locks.get();
10997 }
10998 _update_lock_state(fl, owner, lock_state);
10999 }
11000 } else
11001 ceph_abort();
11002 }
11003 return ret;
11004 }
11005
11006 int Client::_interrupt_filelock(MetaRequest *req)
11007 {
11008 // Set abort code, but do not kick. The abort code prevents the request
11009 // from being re-sent.
11010 req->abort(-CEPHFS_EINTR);
11011 if (req->mds < 0)
11012 return 0; // haven't sent the request
11013
11014 Inode *in = req->inode();
11015
11016 int lock_type;
11017 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11018 lock_type = CEPH_LOCK_FLOCK_INTR;
11019 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11020 lock_type = CEPH_LOCK_FCNTL_INTR;
11021 else {
11022 ceph_abort();
11023 return -CEPHFS_EINVAL;
11024 }
11025
11026 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11027 filepath path;
11028 in->make_nosnap_relative_path(path);
11029 intr_req->set_filepath(path);
11030 intr_req->set_inode(in);
11031 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11032 intr_req->head.args.filelock_change.rule = lock_type;
11033 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11034
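// The interrupt is itself a SETFILELOCK carrying the matching *_INTR
// rule and UNLOCK type, telling the MDS to abort the blocked request.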
11035 UserPerm perms(req->get_uid(), req->get_gid());
11036 return make_request(intr_req, perms, NULL, NULL, -1);
11037 }
11038
11039 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11040 {
11041 if (!in->fcntl_locks && !in->flock_locks)
11042 return;
11043
11044 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11045 encode(nr_fcntl_locks, bl);
11046 if (nr_fcntl_locks) {
11047 auto &lock_state = in->fcntl_locks;
11048 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
11049 p != lock_state->held_locks.end();
11050 ++p)
11051 encode(p->second, bl);
11052 }
11053
11054 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11055 encode(nr_flock_locks, bl);
11056 if (nr_flock_locks) {
11057 auto &lock_state = in->flock_locks;
11058 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
11059 p != lock_state->held_locks.end();
11060 ++p)
11061 encode(p->second, bl);
11062 }
11063
11064 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11065 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11066 }
11067
11068 void Client::_release_filelocks(Fh *fh)
11069 {
11070 if (!fh->fcntl_locks && !fh->flock_locks)
11071 return;
11072
11073 Inode *in = fh->inode.get();
11074 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
11075
11076 list<ceph_filelock> activated_locks;
11077
11078 list<pair<int, ceph_filelock> > to_release;
11079
11080 if (fh->fcntl_locks) {
11081 auto &lock_state = fh->fcntl_locks;
11082 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11083 auto q = p++;
11084 if (in->flags & I_ERROR_FILELOCK) {
11085 lock_state->remove_lock(q->second, activated_locks);
11086 } else {
11087 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11088 }
11089 }
11090 lock_state.reset();
11091 }
11092 if (fh->flock_locks) {
11093 auto &lock_state = fh->flock_locks;
11094 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11095 auto q = p++;
11096 if (in->flags & I_ERROR_FILELOCK) {
11097 lock_state->remove_lock(q->second, activated_locks);
11098 } else {
11099 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11100 }
11101 }
11102 lock_state.reset();
11103 }
11104
11105 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11106 in->flags &= ~I_ERROR_FILELOCK;
11107
11108 if (to_release.empty())
11109 return;
11110
11111 struct flock fl;
11112 memset(&fl, 0, sizeof(fl));
11113 fl.l_whence = SEEK_SET;
11114 fl.l_type = F_UNLCK;
11115
11116 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11117 p != to_release.end();
11118 ++p) {
11119 fl.l_start = p->second.start;
11120 fl.l_len = p->second.length;
11121 fl.l_pid = p->second.pid;
11122 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11123 p->second.owner, true);
11124 }
11125 }
11126
11127 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11128 ceph_lock_state_t *lock_state)
11129 {
11130 int lock_cmd;
11131 if (F_RDLCK == fl->l_type)
11132 lock_cmd = CEPH_LOCK_SHARED;
11133 else if (F_WRLCK == fl->l_type)
11134 lock_cmd = CEPH_LOCK_EXCL;
11135 else
11136 lock_cmd = CEPH_LOCK_UNLOCK;
11137
11138 ceph_filelock filelock;
11139 filelock.start = fl->l_start;
11140 filelock.length = fl->l_len;
11141 filelock.client = 0;
11142 // see comment in _do_filelock()
11143 filelock.owner = owner | (1ULL << 63);
11144 filelock.pid = fl->l_pid;
11145 filelock.type = lock_cmd;
11146
11147 if (filelock.type == CEPH_LOCK_UNLOCK) {
11148 list<ceph_filelock> activated_locks;
11149 lock_state->remove_lock(filelock, activated_locks);
11150 } else {
11151 bool r = lock_state->add_lock(filelock, false, false, NULL);
11152 ceph_assert(r);
11153 }
11154 }
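// Illustrative sketch (not in the upstream source; all values hypothetical):
// how a fcntl-style exclusive lock maps into the ceph_filelock built above.
//
//   struct flock fl;
//   memset(&fl, 0, sizeof(fl));
//   fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET;
//   fl.l_start = 0; fl.l_len = 100; fl.l_pid = 1234;
//   // _update_lock_state(&fl, owner, lock_state) records:
//   //   filelock = { start=0, length=100, client=0,
//   //                owner=owner | (1ULL << 63), pid=1234,
//   //                type=CEPH_LOCK_EXCL }
//   // An F_UNLCK request instead removes the overlapping held locks.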
11155
11156 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11157 {
11158 Inode *in = fh->inode.get();
11159 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11160 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11161 return ret;
11162 }
11163
11164 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11165 {
11166 Inode *in = fh->inode.get();
11167 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11168 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11169 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11170 return ret;
11171 }
11172
11173 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11174 {
11175 Inode *in = fh->inode.get();
11176 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11177
11178 int sleep = !(cmd & LOCK_NB);
11179 cmd &= ~LOCK_NB;
11180
11181 int type;
11182 switch (cmd) {
11183 case LOCK_SH:
11184 type = F_RDLCK;
11185 break;
11186 case LOCK_EX:
11187 type = F_WRLCK;
11188 break;
11189 case LOCK_UN:
11190 type = F_UNLCK;
11191 break;
11192 default:
11193 return -CEPHFS_EINVAL;
11194 }
11195
11196 struct flock fl;
11197 memset(&fl, 0, sizeof(fl));
11198 fl.l_type = type;
11199 fl.l_whence = SEEK_SET;
11200
11201 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11202 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11203 return ret;
11204 }
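// Illustrative sketch (hypothetical caller; _flock() is internal): how the
// BSD flock() command bits reach this function. LOCK_NB only controls
// whether we are willing to wait for the MDS to grant the lock.
//
//   client->_flock(fh, LOCK_SH, owner);           // blocking shared lock
//   client->_flock(fh, LOCK_EX | LOCK_NB, owner); // non-blocking exclusive:
//                                                 // fails rather than waits
//   client->_flock(fh, LOCK_UN, owner);           // unlock (F_UNLCK)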
11205
11206 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11207 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11208 if (!mref_reader.is_state_satisfied()) {
11209 return -CEPHFS_ENOTCONN;
11210 }
11211
11212 std::unique_lock locker(client_lock);
11213 InodeRef in;
11214 int r = Client::path_walk(path, &in, perms, true);
11215 if (r < 0) {
11216 return r;
11217 }
11218
11219 if (in->snapid == CEPH_NOSNAP) {
11220 return -CEPHFS_EINVAL;
11221 }
11222
11223 snap_info->id = in->snapid;
11224 snap_info->metadata = in->snap_metadata;
11225 return 0;
11226 }
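// Illustrative sketch (hypothetical path): get_snap_info() only succeeds
// for snapshot inodes, i.e. paths resolved through a .snap directory.
//
//   SnapInfo info;
//   int r = client->get_snap_info("/dir/.snap/snap1", perms, &info);
//   // r == 0: info.id and info.metadata are filled in
//   // a live (non-snap) path returns -CEPHFS_EINVAL instead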
11227
11228 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11229 {
11230 /* Since the only thing this does is wrap a call to statfs, and
11231 statfs takes a lock, it doesn't seem we have a need to split it
11232 out. */
11233 return statfs(0, stbuf, perms);
11234 }
11235
11236 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11237 {
11238 if (!args)
11239 return;
11240 std::scoped_lock l(client_lock);
11241 ldout(cct, 10) << __func__ << " cb " << args->handle
11242 << " invalidate_ino_cb " << args->ino_cb
11243 << " invalidate_dentry_cb " << args->dentry_cb
11244 << " switch_interrupt_cb " << args->switch_intr_cb
11245 << " remount_cb " << args->remount_cb
11246 << dendl;
11247 callback_handle = args->handle;
11248 if (args->ino_cb) {
11249 ino_invalidate_cb = args->ino_cb;
11250 async_ino_invalidator.start();
11251 }
11252 if (args->dentry_cb) {
11253 dentry_invalidate_cb = args->dentry_cb;
11254 async_dentry_invalidator.start();
11255 }
11256 if (args->switch_intr_cb) {
11257 switch_interrupt_cb = args->switch_intr_cb;
11258 interrupt_finisher.start();
11259 }
11260 if (args->remount_cb) {
11261 remount_cb = args->remount_cb;
11262 remount_finisher.start();
11263 }
11264 if (args->ino_release_cb) {
11265 ino_release_cb = args->ino_release_cb;
11266 async_ino_releasor.start();
11267 }
11268 if (args->umask_cb)
11269 umask_cb = args->umask_cb;
11270 }
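// Illustrative sketch (hypothetical callback names): a FUSE-style caller
// typically zero-initializes the args struct and fills in only the
// callbacks it supports; each non-null callback starts the matching
// finisher thread above.
//
//   struct ceph_client_callback_args args = {};
//   args.handle = my_handle;            // opaque pointer passed back to cbs
//   args.ino_cb = my_invalidate_inode;  // starts async_ino_invalidator
//   args.dentry_cb = my_invalidate_dentry;
//   client->ll_register_callbacks(&args);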
11271
11272 int Client::test_dentry_handling(bool can_invalidate)
11273 {
11274 int r = 0;
11275
11276 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11277 if (!iref_reader.is_state_satisfied())
11278 return -CEPHFS_ENOTCONN;
11279
11280 can_invalidate_dentries = can_invalidate;
11281
11282 if (can_invalidate_dentries) {
11283 ceph_assert(dentry_invalidate_cb);
11284 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11285 r = 0;
11286 } else {
11287 ceph_assert(remount_cb);
11288 ldout(cct, 1) << "using remount_cb" << dendl;
11289 r = _do_remount(false);
11290 }
11291
11292 return r;
11293 }
11294
11295 int Client::_sync_fs()
11296 {
11297 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11298
11299 ldout(cct, 10) << __func__ << dendl;
11300
11301 // flush file data
11302 std::unique_ptr<C_SaferCond> cond = nullptr;
11303 if (cct->_conf->client_oc) {
11304 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11305 objectcacher->flush_all(cond.get());
11306 }
11307
11308 // flush caps
11309 flush_caps_sync();
11310 ceph_tid_t flush_tid = last_flush_tid;
11311
11312 // wait for unsafe mds requests
11313 wait_unsafe_requests();
11314
11315 wait_sync_caps(flush_tid);
11316
11317 if (nullptr != cond) {
11318 client_lock.unlock();
11319 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11320 cond->wait();
11321 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11322 client_lock.lock();
11323 }
11324
11325 return 0;
11326 }
11327
11328 int Client::sync_fs()
11329 {
11330 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11331 if (!mref_reader.is_state_satisfied())
11332 return -CEPHFS_ENOTCONN;
11333
11334 std::scoped_lock l(client_lock);
11335
11336 return _sync_fs();
11337 }
11338
11339 int64_t Client::drop_caches()
11340 {
11341 std::scoped_lock l(client_lock);
11342 return objectcacher->release_all();
11343 }
11344
11345 int Client::_lazyio(Fh *fh, int enable)
11346 {
11347 Inode *in = fh->inode.get();
11348 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11349
11350 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11351 return 0;
11352
11353 int orig_mode = fh->mode;
11354 if (enable) {
11355 fh->mode |= CEPH_FILE_MODE_LAZY;
11356 in->get_open_ref(fh->mode);
11357 in->put_open_ref(orig_mode);
11358 check_caps(in, CHECK_CAPS_NODELAY);
11359 } else {
11360 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11361 in->get_open_ref(fh->mode);
11362 in->put_open_ref(orig_mode);
11363 check_caps(in, 0);
11364 }
11365
11366 return 0;
11367 }
11368
11369 int Client::lazyio(int fd, int enable)
11370 {
11371 std::scoped_lock l(client_lock);
11372 Fh *f = get_filehandle(fd);
11373 if (!f)
11374 return -CEPHFS_EBADF;
11375
11376 return _lazyio(f, enable);
11377 }
11378
11379 int Client::ll_lazyio(Fh *fh, int enable)
11380 {
11381 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11382 tout(cct) << __func__ << std::endl;
11383
11384 std::scoped_lock lock(client_lock);
11385 return _lazyio(fh, enable);
11386 }
11387
11388 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11389 {
11390 std::scoped_lock l(client_lock);
11391 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11392 << ", " << offset << ", " << count << ")" << dendl;
11393
11394 Fh *f = get_filehandle(fd);
11395 if (!f)
11396 return -CEPHFS_EBADF;
11397
11398 // for now, just fsync the whole file rather than the given range
11399 _fsync(f, true);
11400
11401 return 0;
11402 }
11403
11404 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11405 {
11406 std::scoped_lock l(client_lock);
11407 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11408 << ", " << offset << ", " << count << ")" << dendl;
11409
11410 Fh *f = get_filehandle(fd);
11411 if (!f)
11412 return -CEPHFS_EBADF;
11413 Inode *in = f->inode.get();
11414
11415 _fsync(f, true);
11416 if (_release(in)) {
11417 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11418 if (r < 0)
11419 return r;
11420 }
11421 return 0;
11422 }
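// Illustrative sketch of the LAZY_IO coherency model (hypothetical fds and
// buffers; read/write shown with the usual fd-based Client signatures):
// with lazy io enabled, writers must explicitly publish and readers must
// explicitly revalidate, since cap-based coherency is relaxed.
//
//   client->lazyio(fd_w, 1);                    // writer enables lazy io
//   client->write(fd_w, buf, len, off);
//   client->lazyio_propagate(fd_w, off, len);   // flush dirty data out
//
//   client->lazyio(fd_r, 1);                    // reader enables lazy io
//   client->lazyio_synchronize(fd_r, off, len); // drop stale cache, restat
//   client->read(fd_r, buf, len, off);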
11423
11424
11425 // =============================
11426 // snaps
11427
11428 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11429 mode_t mode, const std::map<std::string, std::string> &metadata)
11430 {
11431 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11432 if (!mref_reader.is_state_satisfied())
11433 return -CEPHFS_ENOTCONN;
11434
11435 std::scoped_lock l(client_lock);
11436
11437 filepath path(relpath);
11438 InodeRef in;
11439 int r = path_walk(path, &in, perm);
11440 if (r < 0)
11441 return r;
11442 if (cct->_conf->client_permissions) {
11443 r = may_create(in.get(), perm);
11444 if (r < 0)
11445 return r;
11446 }
11447 Inode *snapdir = open_snapdir(in.get());
11448 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11449 }
11450
11451 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11452 {
11453 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11454 if (!mref_reader.is_state_satisfied())
11455 return -CEPHFS_ENOTCONN;
11456
11457 std::scoped_lock l(client_lock);
11458
11459 filepath path(relpath);
11460 InodeRef in;
11461 int r = path_walk(path, &in, perms);
11462 if (r < 0)
11463 return r;
11464 Inode *snapdir = open_snapdir(in.get());
11465 if (cct->_conf->client_permissions) {
11466 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11467 if (r < 0)
11468 return r;
11469 }
11470 return _rmdir(snapdir, name, perms);
11471 }
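// Illustrative sketch (hypothetical names): snapshots are created by making
// a directory in the target's snapdir, which is exactly what mksnap() does
// via open_snapdir() + _mkdir().
//
//   std::map<std::string, std::string> md = {{"owner", "qa"}};
//   client->mksnap("/vol/dir", "snap1", perms, 0755, md);
//   ...
//   client->rmsnap("/vol/dir", "snap1", perms, true);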
11472
11473 // =============================
11474 // expose caps
11475
11476 int Client::get_caps_issued(int fd)
11477 {
11478 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11479 if (!mref_reader.is_state_satisfied())
11480 return -CEPHFS_ENOTCONN;
11481
11482 std::scoped_lock lock(client_lock);
11483
11484 Fh *f = get_filehandle(fd);
11485 if (!f)
11486 return -CEPHFS_EBADF;
11487
11488 return f->inode->caps_issued();
11489 }
11490
11491 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11492 {
11493 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11494 if (!mref_reader.is_state_satisfied())
11495 return -CEPHFS_ENOTCONN;
11496
11497 std::scoped_lock lock(client_lock);
11498
11499 filepath p(path);
11500 InodeRef in;
11501 int r = path_walk(p, &in, perms, true);
11502 if (r < 0)
11503 return r;
11504 return in->caps_issued();
11505 }
11506
11507 // =========================================
11508 // low level
11509
11510 Inode *Client::open_snapdir(Inode *diri)
11511 {
11512 Inode *in;
11513 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11514 if (!inode_map.count(vino)) {
11515 in = new Inode(this, vino, &diri->layout);
11516
11517 in->ino = diri->ino;
11518 in->snapid = CEPH_SNAPDIR;
11519 in->mode = diri->mode;
11520 in->uid = diri->uid;
11521 in->gid = diri->gid;
11522 in->nlink = 1;
11523 in->mtime = diri->mtime;
11524 in->ctime = diri->ctime;
11525 in->btime = diri->btime;
11526 in->atime = diri->atime;
11527 in->size = diri->size;
11528 in->change_attr = diri->change_attr;
11529
11530 in->dirfragtree.clear();
11531 in->snapdir_parent = diri;
11532 diri->flags |= I_SNAPDIR_OPEN;
11533 inode_map[vino] = in;
11534 if (use_faked_inos())
11535 _assign_faked_ino(in);
11536 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11537 } else {
11538 in = inode_map[vino];
11539 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11540 }
11541 return in;
11542 }
11543
11544 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11545 Inode **out, const UserPerm& perms)
11546 {
11547 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11548 if (!mref_reader.is_state_satisfied())
11549 return -CEPHFS_ENOTCONN;
11550
11551 vinodeno_t vparent = _get_vino(parent);
11552 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11553 tout(cct) << __func__ << std::endl;
11554 tout(cct) << name << std::endl;
11555
11556 std::scoped_lock lock(client_lock);
11557
11558 int r = 0;
11559 if (!fuse_default_permissions) {
11560 if (strcmp(name, ".") && strcmp(name, "..")) {
11561 r = may_lookup(parent, perms);
11562 if (r < 0)
11563 return r;
11564 }
11565 }
11566
11567 string dname(name);
11568 InodeRef in;
11569
11570 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11571 if (r < 0) {
11572 attr->st_ino = 0;
11573 goto out;
11574 }
11575
11576 ceph_assert(in);
11577 fill_stat(in, attr);
11578 _ll_get(in.get());
11579
11580 out:
11581 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11582 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11583 tout(cct) << attr->st_ino << std::endl;
11584 *out = in.get();
11585 return r;
11586 }
11587
11588 int Client::ll_lookup_vino(
11589 vinodeno_t vino,
11590 const UserPerm& perms,
11591 Inode **inode)
11592 {
11593 ceph_assert(inode != NULL);
11594 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11595 if (!mref_reader.is_state_satisfied())
11596 return -CEPHFS_ENOTCONN;
11597
11598 if (is_reserved_vino(vino))
11599 return -CEPHFS_ESTALE;
11600
11601 std::scoped_lock lock(client_lock);
11602 ldout(cct, 3) << __func__ << " " << vino << dendl;
11603
11604 // Check the cache first
11605 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11606 if (p != inode_map.end()) {
11607 *inode = p->second;
11608 _ll_get(*inode);
11609 return 0;
11610 }
11611
11612 uint64_t snapid = vino.snapid;
11613
11614 // for snapdir, find the non-snapped dir inode
11615 if (snapid == CEPH_SNAPDIR)
11616 vino.snapid = CEPH_NOSNAP;
11617
11618 int r = _lookup_vino(vino, perms, inode);
11619 if (r)
11620 return r;
11621 ceph_assert(*inode != NULL);
11622
11623 if (snapid == CEPH_SNAPDIR) {
11624 Inode *tmp = *inode;
11625
11626 // open the snapdir and put the inode ref
11627 *inode = open_snapdir(tmp);
11628 _ll_forget(tmp, 1);
11629 _ll_get(*inode);
11630 }
11631 return 0;
11632 }
11633
11634 int Client::ll_lookup_inode(
11635 struct inodeno_t ino,
11636 const UserPerm& perms,
11637 Inode **inode)
11638 {
11639 vinodeno_t vino(ino, CEPH_NOSNAP);
11640 return ll_lookup_vino(vino, perms, inode);
11641 }
11642
11643 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11644 struct ceph_statx *stx, unsigned want, unsigned flags,
11645 const UserPerm& perms)
11646 {
11647 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11648 if (!mref_reader.is_state_satisfied())
11649 return -CEPHFS_ENOTCONN;
11650
11651 vinodeno_t vparent = _get_vino(parent);
11652 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11653 tout(cct) << "ll_lookupx" << std::endl;
11654 tout(cct) << name << std::endl;
11655
11656 std::scoped_lock lock(client_lock);
11657
11658 int r = 0;
11659 if (!fuse_default_permissions) {
11660 r = may_lookup(parent, perms);
11661 if (r < 0)
11662 return r;
11663 }
11664
11665 string dname(name);
11666 InodeRef in;
11667
11668 unsigned mask = statx_to_mask(flags, want);
11669 r = _lookup(parent, dname, mask, &in, perms);
11670 if (r < 0) {
11671 stx->stx_ino = 0;
11672 stx->stx_mask = 0;
11673 } else {
11674 ceph_assert(in);
11675 fill_statx(in, mask, stx);
11676 _ll_get(in.get());
11677 }
11678
11679 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11680 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11681 tout(cct) << stx->stx_ino << std::endl;
11682 *out = in.get();
11683 return r;
11684 }
11685
11686 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11687 unsigned int want, unsigned int flags, const UserPerm& perms)
11688 {
11689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11690 if (!mref_reader.is_state_satisfied())
11691 return -CEPHFS_ENOTCONN;
11692
11693 filepath fp(name, 0);
11694 InodeRef in;
11695 int rc;
11696 unsigned mask = statx_to_mask(flags, want);
11697
11698 ldout(cct, 3) << __func__ << " " << name << dendl;
11699 tout(cct) << __func__ << std::endl;
11700 tout(cct) << name << std::endl;
11701
11702 std::scoped_lock lock(client_lock);
11703 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11704 if (rc < 0) {
11705 /* zero out mask, just in case... */
11706 stx->stx_mask = 0;
11707 stx->stx_ino = 0;
11708 *out = NULL;
11709 return rc;
11710 } else {
11711 ceph_assert(in);
11712 fill_statx(in, mask, stx);
11713 _ll_get(in.get());
11714 *out = in.get();
11715 return 0;
11716 }
11717 }
11718
11719 void Client::_ll_get(Inode *in)
11720 {
11721 if (in->ll_ref == 0) {
11722 in->iget();
11723 if (in->is_dir() && !in->dentries.empty()) {
11724 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11725 in->get_first_parent()->get(); // pin dentry
11726 }
11727 if (in->snapid != CEPH_NOSNAP)
11728 ll_snap_ref[in->snapid]++;
11729 }
11730 in->ll_get();
11731 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
11732 }
11733
11734 int Client::_ll_put(Inode *in, uint64_t num)
11735 {
11736 in->ll_put(num);
11737 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
11738 if (in->ll_ref == 0) {
11739 if (in->is_dir() && !in->dentries.empty()) {
11740 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11741 in->get_first_parent()->put(); // unpin dentry
11742 }
11743 if (in->snapid != CEPH_NOSNAP) {
11744 auto p = ll_snap_ref.find(in->snapid);
11745 ceph_assert(p != ll_snap_ref.end());
11746 ceph_assert(p->second > 0);
11747 if (--p->second == 0)
11748 ll_snap_ref.erase(p);
11749 }
11750 put_inode(in);
11751 return 0;
11752 } else {
11753 return in->ll_ref;
11754 }
11755 }
11756
11757 void Client::_ll_drop_pins()
11758 {
11759 ldout(cct, 10) << __func__ << dendl;
11760 std::set<InodeRef> to_be_put; // torn down entry by entry on scope exit, dropping the refs
11761 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11762 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11763 it != inode_map.end();
11764 it = next) {
11765 Inode *in = it->second;
11766 next = it;
11767 ++next;
11768 if (in->ll_ref){
11769 to_be_put.insert(in);
11770 _ll_put(in, in->ll_ref);
11771 }
11772 }
11773 }
11774
11775 bool Client::_ll_forget(Inode *in, uint64_t count)
11776 {
11777 inodeno_t ino = in->ino;
11778
11779 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11780 tout(cct) << __func__ << std::endl;
11781 tout(cct) << ino.val << std::endl;
11782 tout(cct) << count << std::endl;
11783
11784 // Ignore forget if we're no longer mounted
11785 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11786 if (!mref_reader.is_state_satisfied())
11787 return true;
11788
11789 if (ino == 1) return true; // ignore forget on root.
11790
11791 bool last = false;
11792 if (in->ll_ref < count) {
11793 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11794 << ", which only has ll_ref=" << in->ll_ref << dendl;
11795 _ll_put(in, in->ll_ref);
11796 last = true;
11797 } else {
11798 if (_ll_put(in, count) == 0)
11799 last = true;
11800 }
11801
11802 return last;
11803 }
11804
11805 bool Client::ll_forget(Inode *in, uint64_t count)
11806 {
11807 std::scoped_lock lock(client_lock);
11808 return _ll_forget(in, count);
11809 }
11810
11811 bool Client::ll_put(Inode *in)
11812 {
11813 /* ll_forget already takes the lock */
11814 return ll_forget(in, 1);
11815 }
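// Illustrative sketch of ll_ref accounting (hypothetical counts): every
// successful ll_lookup()/ll_walk()/ll_get_inode() takes one ll_ref via
// _ll_get(), and the caller hands refs back with ll_forget()/ll_put().
//
//   Inode *in = client->ll_get_inode(ino);               // ll_ref: 0 -> 1
//   client->ll_lookup(parent, "f", &attr, &in2, perms);  // in2 ll_ref +1
//   ...
//   client->ll_forget(in, 1); // drops one ref; returns true on the last
//   client->ll_put(in2);      // equivalent to ll_forget(in2, 1)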
11816
11817 int Client::ll_get_snap_ref(snapid_t snap)
11818 {
11819 std::scoped_lock lock(client_lock);
11820 auto p = ll_snap_ref.find(snap);
11821 if (p != ll_snap_ref.end())
11822 return p->second;
11823 return 0;
11824 }
11825
11826 snapid_t Client::ll_get_snapid(Inode *in)
11827 {
11828 std::scoped_lock lock(client_lock);
11829 return in->snapid;
11830 }
11831
11832 Inode *Client::ll_get_inode(ino_t ino)
11833 {
11834 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11835 if (!mref_reader.is_state_satisfied())
11836 return NULL;
11837
11838 std::scoped_lock lock(client_lock);
11839
11840 vinodeno_t vino = _map_faked_ino(ino);
11841 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11842 if (p == inode_map.end())
11843 return NULL;
11844 Inode *in = p->second;
11845 _ll_get(in);
11846 return in;
11847 }
11848
11849 Inode *Client::ll_get_inode(vinodeno_t vino)
11850 {
11851 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11852 if (!mref_reader.is_state_satisfied())
11853 return NULL;
11854
11855 if (is_reserved_vino(vino))
11856 return NULL;
11857
11858 std::scoped_lock lock(client_lock);
11859
11860 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11861 if (p == inode_map.end())
11862 return NULL;
11863 Inode *in = p->second;
11864 _ll_get(in);
11865 return in;
11866 }
11867
11868 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11869 {
11870 vinodeno_t vino = _get_vino(in);
11871
11872 ldout(cct, 8) << __func__ << " " << vino << dendl;
11873 tout(cct) << __func__ << std::endl;
11874 tout(cct) << vino.ino.val << std::endl;
11875
11876 if (vino.snapid < CEPH_NOSNAP)
11877 return 0;
11878 else
11879 return _getattr(in, caps, perms);
11880 }
11881
11882 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11883 {
11884 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11885 if (!mref_reader.is_state_satisfied())
11886 return -CEPHFS_ENOTCONN;
11887
11888 std::scoped_lock lock(client_lock);
11889
11890 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11891
11892 if (res == 0)
11893 fill_stat(in, attr);
11894 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11895 return res;
11896 }
11897
11898 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11899 unsigned int flags, const UserPerm& perms)
11900 {
11901 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11902 if (!mref_reader.is_state_satisfied())
11903 return -CEPHFS_ENOTCONN;
11904
11905 std::scoped_lock lock(client_lock);
11906
11907 int res = 0;
11908 unsigned mask = statx_to_mask(flags, want);
11909
11910 if (mask && !in->caps_issued_mask(mask, true))
11911 res = _ll_getattr(in, mask, perms);
11912
11913 if (res == 0)
11914 fill_statx(in, mask, stx);
11915 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11916 return res;
11917 }
11918
11919 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11920 const UserPerm& perms, InodeRef *inp)
11921 {
11922 vinodeno_t vino = _get_vino(in);
11923
11924 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11925 << dendl;
11926 tout(cct) << __func__ << std::endl;
11927 tout(cct) << vino.ino.val << std::endl;
11928 tout(cct) << stx->stx_mode << std::endl;
11929 tout(cct) << stx->stx_uid << std::endl;
11930 tout(cct) << stx->stx_gid << std::endl;
11931 tout(cct) << stx->stx_size << std::endl;
11932 tout(cct) << stx->stx_mtime << std::endl;
11933 tout(cct) << stx->stx_atime << std::endl;
11934 tout(cct) << stx->stx_btime << std::endl;
11935 tout(cct) << mask << std::endl;
11936
11937 if (!fuse_default_permissions) {
11938 int res = may_setattr(in, stx, mask, perms);
11939 if (res < 0)
11940 return res;
11941 }
11942
11943 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11944
11945 return __setattrx(in, stx, mask, perms, inp);
11946 }
11947
11948 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11949 const UserPerm& perms)
11950 {
11951 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11952 if (!mref_reader.is_state_satisfied())
11953 return -CEPHFS_ENOTCONN;
11954
11955 std::scoped_lock lock(client_lock);
11956
11957 InodeRef target(in);
11958 int res = _ll_setattrx(in, stx, mask, perms, &target);
11959 if (res == 0) {
11960 ceph_assert(in == target.get());
11961 fill_statx(in, in->caps_issued(), stx);
11962 }
11963
11964 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11965 return res;
11966 }
11967
11968 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11969 const UserPerm& perms)
11970 {
11971 struct ceph_statx stx;
11972 stat_to_statx(attr, &stx);
11973
11974 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11975 if (!mref_reader.is_state_satisfied())
11976 return -CEPHFS_ENOTCONN;
11977
11978 std::scoped_lock lock(client_lock);
11979
11980 InodeRef target(in);
11981 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11982 if (res == 0) {
11983 ceph_assert(in == target.get());
11984 fill_stat(in, attr);
11985 }
11986
11987 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11988 return res;
11989 }
11990
11991
11992 // ----------
11993 // xattrs
11994
11995 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11996 const UserPerm& perms)
11997 {
11998 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11999 if (!mref_reader.is_state_satisfied())
12000 return -CEPHFS_ENOTCONN;
12001
12002 std::scoped_lock lock(client_lock);
12003
12004 InodeRef in;
12005 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12006 if (r < 0)
12007 return r;
12008 return _getxattr(in, name, value, size, perms);
12009 }
12010
12011 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12012 const UserPerm& perms)
12013 {
12014 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12015 if (!mref_reader.is_state_satisfied())
12016 return -CEPHFS_ENOTCONN;
12017
12018 std::scoped_lock lock(client_lock);
12019
12020 InodeRef in;
12021 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12022 if (r < 0)
12023 return r;
12024 return _getxattr(in, name, value, size, perms);
12025 }
12026
12027 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12028 const UserPerm& perms)
12029 {
12030 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12031 if (!mref_reader.is_state_satisfied())
12032 return -CEPHFS_ENOTCONN;
12033
12034 std::scoped_lock lock(client_lock);
12035
12036 Fh *f = get_filehandle(fd);
12037 if (!f)
12038 return -CEPHFS_EBADF;
12039 return _getxattr(f->inode, name, value, size, perms);
12040 }
12041
12042 int Client::listxattr(const char *path, char *list, size_t size,
12043 const UserPerm& perms)
12044 {
12045 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12046 if (!mref_reader.is_state_satisfied())
12047 return -CEPHFS_ENOTCONN;
12048
12049 std::scoped_lock lock(client_lock);
12050
12051 InodeRef in;
12052 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12053 if (r < 0)
12054 return r;
12055 return Client::_listxattr(in.get(), list, size, perms);
12056 }
12057
12058 int Client::llistxattr(const char *path, char *list, size_t size,
12059 const UserPerm& perms)
12060 {
12061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12062 if (!mref_reader.is_state_satisfied())
12063 return -CEPHFS_ENOTCONN;
12064
12065 std::scoped_lock lock(client_lock);
12066
12067 InodeRef in;
12068 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12069 if (r < 0)
12070 return r;
12071 return Client::_listxattr(in.get(), list, size, perms);
12072 }
12073
12074 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12075 {
12076 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12077 if (!mref_reader.is_state_satisfied())
12078 return -CEPHFS_ENOTCONN;
12079
12080 std::scoped_lock lock(client_lock);
12081
12082 Fh *f = get_filehandle(fd);
12083 if (!f)
12084 return -CEPHFS_EBADF;
12085 return Client::_listxattr(f->inode.get(), list, size, perms);
12086 }
12087
12088 int Client::removexattr(const char *path, const char *name,
12089 const UserPerm& perms)
12090 {
12091 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12092 if (!mref_reader.is_state_satisfied())
12093 return -CEPHFS_ENOTCONN;
12094
12095 std::scoped_lock lock(client_lock);
12096
12097 InodeRef in;
12098 int r = Client::path_walk(path, &in, perms, true);
12099 if (r < 0)
12100 return r;
12101 return _removexattr(in, name, perms);
12102 }
12103
12104 int Client::lremovexattr(const char *path, const char *name,
12105 const UserPerm& perms)
12106 {
12107 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12108 if (!mref_reader.is_state_satisfied())
12109 return -CEPHFS_ENOTCONN;
12110
12111 std::scoped_lock lock(client_lock);
12112
12113 InodeRef in;
12114 int r = Client::path_walk(path, &in, perms, false);
12115 if (r < 0)
12116 return r;
12117 return _removexattr(in, name, perms);
12118 }
12119
12120 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12121 {
12122 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12123 if (!mref_reader.is_state_satisfied())
12124 return -CEPHFS_ENOTCONN;
12125
12126 std::scoped_lock lock(client_lock);
12127
12128 Fh *f = get_filehandle(fd);
12129 if (!f)
12130 return -CEPHFS_EBADF;
12131 return _removexattr(f->inode, name, perms);
12132 }
12133
12134 int Client::setxattr(const char *path, const char *name, const void *value,
12135 size_t size, int flags, const UserPerm& perms)
12136 {
12137 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12138 if (!mref_reader.is_state_satisfied())
12139 return -CEPHFS_ENOTCONN;
12140
12141 _setxattr_maybe_wait_for_osdmap(name, value, size);
12142
12143 std::scoped_lock lock(client_lock);
12144
12145 InodeRef in;
12146 int r = Client::path_walk(path, &in, perms, true);
12147 if (r < 0)
12148 return r;
12149 return _setxattr(in, name, value, size, flags, perms);
12150 }
12151
12152 int Client::lsetxattr(const char *path, const char *name, const void *value,
12153 size_t size, int flags, const UserPerm& perms)
12154 {
12155 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12156 if (!mref_reader.is_state_satisfied())
12157 return -CEPHFS_ENOTCONN;
12158
12159 _setxattr_maybe_wait_for_osdmap(name, value, size);
12160
12161 std::scoped_lock lock(client_lock);
12162
12163 InodeRef in;
12164 int r = Client::path_walk(path, &in, perms, false);
12165 if (r < 0)
12166 return r;
12167 return _setxattr(in, name, value, size, flags, perms);
12168 }
12169
12170 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12171 int flags, const UserPerm& perms)
12172 {
12173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12174 if (!mref_reader.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN;
12176
12177 _setxattr_maybe_wait_for_osdmap(name, value, size);
12178
12179 std::scoped_lock lock(client_lock);
12180
12181 Fh *f = get_filehandle(fd);
12182 if (!f)
12183 return -CEPHFS_EBADF;
12184 return _setxattr(f->inode, name, value, size, flags, perms);
12185 }
12186
12187 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12188 const UserPerm& perms)
12189 {
12190 int r;
12191
12192 const VXattr *vxattr = _match_vxattr(in, name);
12193 if (vxattr) {
12194 r = -CEPHFS_ENODATA;
12195
12196 // Do a forced getattr to fetch the latest backing values (e.g. quota,
12197 // rstat) before returning a value to userspace.
12198 int flags = 0;
12199 if (vxattr->flags & VXATTR_RSTAT) {
12200 flags |= CEPH_STAT_RSTAT;
12201 }
12202 if (vxattr->flags & VXATTR_DIRSTAT) {
12203 flags |= CEPH_CAP_FILE_SHARED;
12204 }
12205 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12206 if (r != 0) {
12207 // Error from getattr!
12208 return r;
12209 }
12210
12211 // call pointer-to-member function; skip it only when an exists_cb is provided and reports the vxattr absent
12212 char buf[256];
12213 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12214 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12215 } else {
12216 r = -CEPHFS_ENODATA;
12217 }
12218
12219 if (size != 0) {
12220 if (r > (int)size) {
12221 r = -CEPHFS_ERANGE;
12222 } else if (r > 0) {
12223 memcpy(value, buf, r);
12224 }
12225 }
12226 goto out;
12227 }
12228
12229 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12230 r = -CEPHFS_EOPNOTSUPP;
12231 goto out;
12232 }
12233
12234 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12235 if (r == 0) {
12236 string n(name);
12237 r = -CEPHFS_ENODATA;
12238 if (in->xattrs.count(n)) {
12239 r = in->xattrs[n].length();
12240 if (r > 0 && size != 0) {
12241 if (size >= (unsigned)r)
12242 memcpy(value, in->xattrs[n].c_str(), r);
12243 else
12244 r = -CEPHFS_ERANGE;
12245 }
12246 }
12247 }
12248 out:
12249 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12250 return r;
12251 }
12252
12253 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12254 const UserPerm& perms)
12255 {
12256 if (cct->_conf->client_permissions) {
12257 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12258 if (r < 0)
12259 return r;
12260 }
12261 return _getxattr(in.get(), name, value, size, perms);
12262 }
12263
12264 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12265 size_t size, const UserPerm& perms)
12266 {
12267 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12268 if (!mref_reader.is_state_satisfied())
12269 return -CEPHFS_ENOTCONN;
12270
12271 vinodeno_t vino = _get_vino(in);
12272
12273 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12274 tout(cct) << __func__ << std::endl;
12275 tout(cct) << vino.ino.val << std::endl;
12276 tout(cct) << name << std::endl;
12277
12278 std::scoped_lock lock(client_lock);
12279 if (!fuse_default_permissions) {
12280 int r = xattr_permission(in, name, MAY_READ, perms);
12281 if (r < 0)
12282 return r;
12283 }
12284
12285 return _getxattr(in, name, value, size, perms);
12286 }
12287
12288 int Client::_listxattr(Inode *in, char *name, size_t size,
12289 const UserPerm& perms)
12290 {
12291 bool len_only = (size == 0);
12292 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12293 if (r != 0) {
12294 goto out;
12295 }
12296
12297 r = 0;
12298 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12299 if (xattr_name.rfind("ceph.", 0) == 0) {
12300 continue;
12301 }
12302
12303 size_t this_len = xattr_name.length() + 1;
12304 r += this_len;
12305 if (len_only)
12306 continue;
12307
12308 if (this_len > size) {
12309 r = -CEPHFS_ERANGE;
12310 goto out;
12311 }
12312
12313 memcpy(name, xattr_name.c_str(), this_len);
12314 name += this_len;
12315 size -= this_len;
12316 }
12317 out:
12318 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12319 return r;
12320 }
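// Illustrative sketch of the usual two-pass listxattr pattern (hypothetical
// buffer handling): a size==0 probe returns the required length, then a
// second call fills the buffer with NUL-separated names. Note that "ceph.*"
// vxattrs are filtered out above.
//
//   int len = client->ll_listxattr(in, nullptr, 0, perms); // length probe
//   std::vector<char> buf(len);
//   len = client->ll_listxattr(in, buf.data(), buf.size(), perms);
//   // a too-small buffer fails with -CEPHFS_ERANGE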
12321
12322 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12323 const UserPerm& perms)
12324 {
12325 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12326 if (!mref_reader.is_state_satisfied())
12327 return -CEPHFS_ENOTCONN;
12328
12329 vinodeno_t vino = _get_vino(in);
12330
12331 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12332 tout(cct) << __func__ << std::endl;
12333 tout(cct) << vino.ino.val << std::endl;
12334 tout(cct) << size << std::endl;
12335
12336 std::scoped_lock lock(client_lock);
12337 return _listxattr(in, names, size, perms);
12338 }
12339
12340 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12341 size_t size, int flags, const UserPerm& perms)
12342 {
12343
12344 int xattr_flags = 0;
12345 if (!value)
12346 xattr_flags |= CEPH_XATTR_REMOVE;
12347 if (flags & XATTR_CREATE)
12348 xattr_flags |= CEPH_XATTR_CREATE;
12349 if (flags & XATTR_REPLACE)
12350 xattr_flags |= CEPH_XATTR_REPLACE;
12351
12352 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12353 filepath path;
12354 in->make_nosnap_relative_path(path);
12355 req->set_filepath(path);
12356 req->set_string2(name);
12357 req->set_inode(in);
12358 req->head.args.setxattr.flags = xattr_flags;
12359
12360 bufferlist bl;
12361 ceph_assert(value || size == 0);
12362 bl.append((const char*)value, size);
12363 req->set_data(bl);
12364
12365 int res = make_request(req, perms);
12366
12367 trim_cache();
12368 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12369 res << dendl;
12370 return res;
12371 }
12372
12373 int Client::_setxattr(Inode *in, const char *name, const void *value,
12374 size_t size, int flags, const UserPerm& perms)
12375 {
12376 if (in->snapid != CEPH_NOSNAP) {
12377 return -CEPHFS_EROFS;
12378 }
12379
12380 if (size == 0) {
12381 value = "";
12382 } else if (value == NULL) {
12383 return -CEPHFS_EINVAL;
12384 }
12385
12386 bool posix_acl_xattr = false;
12387 if (acl_type == POSIX_ACL)
12388 posix_acl_xattr = !strncmp(name, "system.", 7);
12389
12390 if (strncmp(name, "user.", 5) &&
12391 strncmp(name, "security.", 9) &&
12392 strncmp(name, "trusted.", 8) &&
12393 strncmp(name, "ceph.", 5) &&
12394 !posix_acl_xattr)
12395 return -CEPHFS_EOPNOTSUPP;
12396
12397 bool check_realm = false;
12398
12399 if (posix_acl_xattr) {
12400 if (!strcmp(name, ACL_EA_ACCESS)) {
12401 mode_t new_mode = in->mode;
12402 if (value) {
12403 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12404 if (ret < 0)
12405 return ret;
12406 if (ret == 0) {
12407 value = NULL;
12408 size = 0;
12409 }
12410 if (new_mode != in->mode) {
12411 struct ceph_statx stx;
12412 stx.stx_mode = new_mode;
12413 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12414 if (ret < 0)
12415 return ret;
12416 }
12417 }
12418 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12419 if (value) {
12420 if (!S_ISDIR(in->mode))
12421 return -CEPHFS_EACCES;
12422 int ret = posix_acl_check(value, size);
12423 if (ret < 0)
12424 return -CEPHFS_EINVAL;
12425 if (ret == 0) {
12426 value = NULL;
12427 size = 0;
12428 }
12429 }
12430 } else {
12431 return -CEPHFS_EOPNOTSUPP;
12432 }
12433 } else {
12434 const VXattr *vxattr = _match_vxattr(in, name);
12435 if (vxattr) {
12436 if (vxattr->readonly)
12437 return -CEPHFS_EOPNOTSUPP;
12438 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12439 check_realm = true;
12440 }
12441 }
12442
12443 int ret = _do_setxattr(in, name, value, size, flags, perms);
12444 if (ret >= 0 && check_realm) {
12445 // check if snaprealm was created for quota inode
12446 if (in->quota.is_enable() &&
12447 !(in->snaprealm && in->snaprealm->ino == in->ino))
12448 ret = -CEPHFS_EOPNOTSUPP;
12449 }
12450
12451 return ret;
12452 }
12453
12454 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12455 size_t size, int flags, const UserPerm& perms)
12456 {
12457 if (cct->_conf->client_permissions) {
12458 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12459 if (r < 0)
12460 return r;
12461 }
12462 return _setxattr(in.get(), name, value, size, flags, perms);
12463 }
12464
12465 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12466 {
12467 string tmp;
12468 if (name == "layout") {
12469 string::iterator begin = value.begin();
12470 string::iterator end = value.end();
12471 keys_and_values<string::iterator> p; // create instance of parser
12472 std::map<string, string> m; // map to receive results
12473 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12474 return -CEPHFS_EINVAL;
12475 }
12476 if (begin != end)
12477 return -CEPHFS_EINVAL;
12478 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12479 if (q->first == "pool") {
12480 tmp = q->second;
12481 break;
12482 }
12483 }
12484 } else if (name == "layout.pool") {
12485 tmp = value;
12486 }
12487
12488 if (tmp.length()) {
12489 int64_t pool;
12490 try {
12491 pool = boost::lexical_cast<unsigned>(tmp);
12492 if (!osdmap->have_pg_pool(pool))
12493 return -CEPHFS_ENOENT;
12494 } catch (boost::bad_lexical_cast const&) {
12495 pool = osdmap->lookup_pg_pool_name(tmp);
12496 if (pool < 0) {
12497 return -CEPHFS_ENOENT;
12498 }
12499 }
12500 }
12501
12502 return 0;
12503 }
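// Illustrative examples of values this parser accepts (pool names are
// hypothetical):
//
//   name == "layout":      "stripe_unit=4194304 stripe_count=1 "
//                          "object_size=4194304 pool=cephfs_data"
//   name == "layout.pool": "cephfs_data"  or a numeric id such as "3"
//
// Either form resolves the pool against the osdmap; an unknown pool yields
// -CEPHFS_ENOENT, which the caller below uses to trigger an osdmap fetch.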
12504
12505 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12506 {
12507 // Setting the pool of a layout requires an osdmap epoch in the MetaRequest.
12508 // There is a race where a new data pool has been created but neither the
12509 // client nor the MDS has seen it yet; fetch the latest osdmap so the MDS can quickly judge whether it needs a newer one.
12510 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12511 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12512 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12513 string rest(strstr(name, "layout"));
12514 string v((const char*)value, size);
12515 int r = objecter->with_osdmap([&](const OSDMap& o) {
12516 return _setxattr_check_data_pool(rest, v, &o);
12517 });
12518
12519 if (r == -CEPHFS_ENOENT) {
12520 bs::error_code ec;
12521 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12522 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12523 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12524 }
12525 }
12526 }
12527
12528 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12529 size_t size, int flags, const UserPerm& perms)
12530 {
12531 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12532 if (!mref_reader.is_state_satisfied())
12533 return -CEPHFS_ENOTCONN;
12534
12535 _setxattr_maybe_wait_for_osdmap(name, value, size);
12536
12537 vinodeno_t vino = _get_vino(in);
12538
12539 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12540 tout(cct) << __func__ << std::endl;
12541 tout(cct) << vino.ino.val << std::endl;
12542 tout(cct) << name << std::endl;
12543
12544 std::scoped_lock lock(client_lock);
12545 if (!fuse_default_permissions) {
12546 int r = xattr_permission(in, name, MAY_WRITE, perms);
12547 if (r < 0)
12548 return r;
12549 }
12550 return _setxattr(in, name, value, size, flags, perms);
12551 }
12552
12553 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12554 {
12555 if (in->snapid != CEPH_NOSNAP) {
12556 return -CEPHFS_EROFS;
12557 }
12558
12559 // same xattr namespaces as the kernel client supports
12560 if (strncmp(name, "user.", 5) &&
12561 strncmp(name, "system.", 7) &&
12562 strncmp(name, "security.", 9) &&
12563 strncmp(name, "trusted.", 8) &&
12564 strncmp(name, "ceph.", 5))
12565 return -CEPHFS_EOPNOTSUPP;
12566
12567 const VXattr *vxattr = _match_vxattr(in, name);
12568 if (vxattr && vxattr->readonly)
12569 return -CEPHFS_EOPNOTSUPP;
12570
12571 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12572 filepath path;
12573 in->make_nosnap_relative_path(path);
12574 req->set_filepath(path);
12575 req->set_filepath2(name);
12576 req->set_inode(in);
12577
12578 int res = make_request(req, perms);
12579
12580 trim_cache();
12581 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12582 return res;
12583 }
12584
12585 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12586 {
12587 if (cct->_conf->client_permissions) {
12588 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12589 if (r < 0)
12590 return r;
12591 }
12592 return _removexattr(in.get(), name, perms);
12593 }
12594
12595 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12596 {
12597 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12598 if (!mref_reader.is_state_satisfied())
12599 return -CEPHFS_ENOTCONN;
12600
12601 vinodeno_t vino = _get_vino(in);
12602
12603 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12604 tout(cct) << "ll_removexattr" << std::endl;
12605 tout(cct) << vino.ino.val << std::endl;
12606 tout(cct) << name << std::endl;
12607
12608 std::scoped_lock lock(client_lock);
12609 if (!fuse_default_permissions) {
12610 int r = xattr_permission(in, name, MAY_WRITE, perms);
12611 if (r < 0)
12612 return r;
12613 }
12614
12615 return _removexattr(in, name, perms);
12616 }
12617
12618 bool Client::_vxattrcb_quota_exists(Inode *in)
12619 {
12620 return in->quota.is_enable() &&
12621 (in->snapid != CEPH_NOSNAP ||
12622 (in->snaprealm && in->snaprealm->ino == in->ino));
12623 }
12624 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12625 {
12626 return snprintf(val, size,
12627 "max_bytes=%lld max_files=%lld",
12628 (long long int)in->quota.max_bytes,
12629 (long long int)in->quota.max_files);
12630 }
12631 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12632 {
12633 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12634 }
12635 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12636 {
12637 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12638 }
12639
12640 bool Client::_vxattrcb_layout_exists(Inode *in)
12641 {
12642 return in->layout != file_layout_t();
12643 }
12644 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12645 {
12646 int r = snprintf(val, size,
12647 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12648 (unsigned long long)in->layout.stripe_unit,
12649 (unsigned long long)in->layout.stripe_count,
12650 (unsigned long long)in->layout.object_size);
12651 objecter->with_osdmap([&](const OSDMap& o) {
12652 if (o.have_pg_pool(in->layout.pool_id))
12653 r += snprintf(val + r, size - r, "%s",
12654 o.get_pool_name(in->layout.pool_id).c_str());
12655 else
12656 r += snprintf(val + r, size - r, "%" PRIu64,
12657 (uint64_t)in->layout.pool_id);
12658 });
12659 if (in->layout.pool_ns.length())
12660 r += snprintf(val + r, size - r, " pool_namespace=%s",
12661 in->layout.pool_ns.c_str());
12662 return r;
12663 }
12664 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12665 {
12666 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12667 }
12668 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12669 {
12670 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12671 }
12672 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12673 {
12674 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12675 }
12676 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12677 {
12678 size_t r;
12679 objecter->with_osdmap([&](const OSDMap& o) {
12680 if (o.have_pg_pool(in->layout.pool_id))
12681 r = snprintf(val, size, "%s", o.get_pool_name(
12682 in->layout.pool_id).c_str());
12683 else
12684 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12685 });
12686 return r;
12687 }
12688 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12689 {
12690 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12691 }
12692 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12693 {
12694 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12695 }
12696 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12697 {
12698 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12699 }
12700 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12701 {
12702 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12703 }
12704 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
12705 {
12706 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
12707 }
12708 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
12709 {
12710 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
12711 }
12712 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
12713 {
12714 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
12715 }
12716 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
12717 {
12718 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
12719 }
12720 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
12721 {
12722 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
12723 }
12724 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
12725 {
12726 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
12727 (long)in->rstat.rctime.nsec());
12728 }
12729 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
12730 {
12731 return in->dir_pin != -CEPHFS_ENODATA;
12732 }
12733 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
12734 {
12735 return snprintf(val, size, "%ld", (long)in->dir_pin);
12736 }
12737
12738 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
12739 {
12740 return !in->snap_btime.is_zero();
12741 }
12742
12743 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
12744 {
12745 return snprintf(val, size, "%llu.%09lu",
12746 (long long unsigned)in->snap_btime.sec(),
12747 (long unsigned)in->snap_btime.nsec());
12748 }
12749
12750 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
12751 {
12752 // checking one of the xattrs would suffice
12753 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
12754 }
12755
12756 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
12757 {
12758 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
12759 in->xattrs["ceph.mirror.info.cluster_id"].length(),
12760 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
12761 in->xattrs["ceph.mirror.info.fs_id"].length(),
12762 in->xattrs["ceph.mirror.info.fs_id"].c_str());
12763 }
12764
12765 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
12766 {
12767 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
12768 }
12769
12770 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
12771 {
12772 auto name = messenger->get_myname();
12773 return snprintf(val, size, "%s%ld", name.type_str(), name.num());
12774 }
12775
12776 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12777 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12778
12779 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12780 { \
12781 name: CEPH_XATTR_NAME(_type, _name), \
12782 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12783 readonly: true, \
12784 exists_cb: NULL, \
12785 flags: _flags, \
12786 }
12787 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12788 { \
12789 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12790 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12791 readonly: false, \
12792 exists_cb: &Client::_vxattrcb_layout_exists, \
12793 flags: 0, \
12794 }
12795 #define XATTR_QUOTA_FIELD(_type, _name) \
12796 { \
12797 name: CEPH_XATTR_NAME(_type, _name), \
12798 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12799 readonly: false, \
12800 exists_cb: &Client::_vxattrcb_quota_exists, \
12801 flags: 0, \
12802 }
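// For reference (expansion sketch): XATTR_NAME_CEPH(dir, entries,
// VXATTR_DIRSTAT) produces a table entry equivalent to
//
//   { name: "ceph.dir.entries",
//     getxattr_cb: &Client::_vxattrcb_dir_entries,
//     readonly: true,
//     exists_cb: NULL,
//     flags: VXATTR_DIRSTAT, }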
12803
12804 const Client::VXattr Client::_dir_vxattrs[] = {
12805 {
12806 name: "ceph.dir.layout",
12807 getxattr_cb: &Client::_vxattrcb_layout,
12808 readonly: false,
12809 exists_cb: &Client::_vxattrcb_layout_exists,
12810 flags: 0,
12811 },
12812 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
12813 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
12814 XATTR_LAYOUT_FIELD(dir, layout, object_size),
12815 XATTR_LAYOUT_FIELD(dir, layout, pool),
12816 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
12817 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
12818 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
12819 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
12820 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
12821 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
12822 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
12823 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
12824 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
12825 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
12826 {
12827 name: "ceph.quota",
12828 getxattr_cb: &Client::_vxattrcb_quota,
12829 readonly: false,
12830 exists_cb: &Client::_vxattrcb_quota_exists,
12831 flags: 0,
12832 },
12833 XATTR_QUOTA_FIELD(quota, max_bytes),
12834 XATTR_QUOTA_FIELD(quota, max_files),
12835 {
12836 name: "ceph.dir.pin",
12837 getxattr_cb: &Client::_vxattrcb_dir_pin,
12838 readonly: false,
12839 exists_cb: &Client::_vxattrcb_dir_pin_exists,
12840 flags: 0,
12841 },
12842 {
12843 name: "ceph.snap.btime",
12844 getxattr_cb: &Client::_vxattrcb_snap_btime,
12845 readonly: true,
12846 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12847 flags: 0,
12848 },
12849 {
12850 name: "ceph.mirror.info",
12851 getxattr_cb: &Client::_vxattrcb_mirror_info,
12852 readonly: false,
12853 exists_cb: &Client::_vxattrcb_mirror_info_exists,
12854 flags: 0,
12855 },
12856 { name: "" } /* Required table terminator */
12857 };
12858
12859 const Client::VXattr Client::_file_vxattrs[] = {
12860 {
12861 name: "ceph.file.layout",
12862 getxattr_cb: &Client::_vxattrcb_layout,
12863 readonly: false,
12864 exists_cb: &Client::_vxattrcb_layout_exists,
12865 flags: 0,
12866 },
12867 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
12868 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
12869 XATTR_LAYOUT_FIELD(file, layout, object_size),
12870 XATTR_LAYOUT_FIELD(file, layout, pool),
12871 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
12872 {
12873 name: "ceph.snap.btime",
12874 getxattr_cb: &Client::_vxattrcb_snap_btime,
12875 readonly: true,
12876 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12877 flags: 0,
12878 },
12879 { name: "" } /* Required table terminator */
12880 };
12881
12882 const Client::VXattr Client::_common_vxattrs[] = {
12883 {
12884 name: "ceph.cluster_fsid",
12885 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
12886 readonly: true,
12887 exists_cb: nullptr,
12888 flags: 0,
12889 },
12890 {
12891 name: "ceph.client_id",
12892 getxattr_cb: &Client::_vxattrcb_client_id,
12893 readonly: true,
12894 exists_cb: nullptr,
12895 flags: 0,
12896 },
12897 { name: "" } /* Required table terminator */
12898 };
12899
12900 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12901 {
12902 if (in->is_dir())
12903 return _dir_vxattrs;
12904 else if (in->is_file())
12905 return _file_vxattrs;
12906 return NULL;
12907 }
12908
12909 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12910 {
12911 if (strncmp(name, "ceph.", 5) == 0) {
12912 const VXattr *vxattr = _get_vxattrs(in);
12913 if (vxattr) {
12914 while (!vxattr->name.empty()) {
12915 if (vxattr->name == name)
12916 return vxattr;
12917 vxattr++;
12918 }
12919 }
12920
12921 // for common vxattrs
12922 vxattr = _common_vxattrs;
12923 while (!vxattr->name.empty()) {
12924 if (vxattr->name == name)
12925 return vxattr;
12926 vxattr++;
12927 }
12928 }
12929
12930 return NULL;
12931 }
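// e.g. "ceph.dir.rbytes" matches in _dir_vxattrs for a directory, while
// "ceph.cluster_fsid" falls through to _common_vxattrs for any inode type;
// any other "ceph.*" name returns NULL and is handled as a regular xattr.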
12932
12933 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12934 {
12935 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12936 if (!mref_reader.is_state_satisfied())
12937 return -CEPHFS_ENOTCONN;
12938
12939 vinodeno_t vino = _get_vino(in);
12940
12941 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12942 tout(cct) << "ll_readlink" << std::endl;
12943 tout(cct) << vino.ino.val << std::endl;
12944
12945 std::scoped_lock lock(client_lock);
12946 for (auto dn : in->dentries) {
12947 touch_dn(dn);
12948 }
12949
12950 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12951 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12952 return r;
12953 }
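/*
 * Caller-side sketch (assumes `cl`, `in` and `perms` are valid). As with
 * readlink(2), the buffer is not NUL-terminated; use the returned length:
 *
 *   char buf[PATH_MAX];
 *   int n = cl->ll_readlink(in, buf, sizeof(buf), perms);
 *   if (n >= 0)
 *     std::string target(buf, n);
 */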
12954
12955 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12956 const UserPerm& perms, InodeRef *inp)
12957 {
12958 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
12959 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12960 << ", gid " << perms.gid() << ")" << dendl;
12961
12962 if (strlen(name) > NAME_MAX)
12963 return -CEPHFS_ENAMETOOLONG;
12964
12965 if (dir->snapid != CEPH_NOSNAP) {
12966 return -CEPHFS_EROFS;
12967 }
12968 if (is_quota_files_exceeded(dir, perms)) {
12969 return -CEPHFS_EDQUOT;
12970 }
12971
12972 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12973
12974 filepath path;
12975 dir->make_nosnap_relative_path(path);
12976 path.push_dentry(name);
12977 req->set_filepath(path);
12978 req->set_inode(dir);
12979 req->head.args.mknod.rdev = rdev;
12980 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12981 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12982
12983 bufferlist xattrs_bl;
12984 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12985 if (res < 0)
12986 goto fail;
12987 req->head.args.mknod.mode = mode;
12988 if (xattrs_bl.length() > 0)
12989 req->set_data(xattrs_bl);
12990
12991 Dentry *de;
12992 res = get_or_create(dir, name, &de);
12993 if (res < 0)
12994 goto fail;
12995 req->set_dentry(de);
12996
12997 res = make_request(req, perms, inp);
12998
12999 trim_cache();
13000
13001 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13002 return res;
13003
13004 fail:
13005 put_request(req);
13006 return res;
13007 }
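/*
 * _mknod() is the template for the other namespace-creating helpers below
 * (_create, _mkdir, _symlink, _link): build a filepath relative to the
 * parent, attach a Dentry via get_or_create(), ask the MDS to drop the
 * parent's CEPH_CAP_FILE_SHARED cap (unless it holds FILE_EXCL), and
 * issue the request synchronously through make_request().
 */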
13008
13009 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13010 dev_t rdev, struct stat *attr, Inode **out,
13011 const UserPerm& perms)
13012 {
13013 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13014 if (!mref_reader.is_state_satisfied())
13015 return -CEPHFS_ENOTCONN;
13016
13017 vinodeno_t vparent = _get_vino(parent);
13018
13019 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13020 tout(cct) << "ll_mknod" << std::endl;
13021 tout(cct) << vparent.ino.val << std::endl;
13022 tout(cct) << name << std::endl;
13023 tout(cct) << mode << std::endl;
13024 tout(cct) << rdev << std::endl;
13025
13026 std::scoped_lock lock(client_lock);
13027 if (!fuse_default_permissions) {
13028 int r = may_create(parent, perms);
13029 if (r < 0)
13030 return r;
13031 }
13032
13033 InodeRef in;
13034 int r = _mknod(parent, name, mode, rdev, perms, &in);
13035 if (r == 0) {
13036 fill_stat(in, attr);
13037 _ll_get(in.get());
13038 } else { attr->st_ino = 0; } // don't log an uninitialized st_ino below
13039 tout(cct) << attr->st_ino << std::endl;
13040 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13041 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13042 *out = in.get();
13043 return r;
13044 }
13045
13046 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13047 dev_t rdev, Inode **out,
13048 struct ceph_statx *stx, unsigned want, unsigned flags,
13049 const UserPerm& perms)
13050 {
13051 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13052 if (!mref_reader.is_state_satisfied())
13053 return -CEPHFS_ENOTCONN;
13054
13055 unsigned caps = statx_to_mask(flags, want);
13056
13057 vinodeno_t vparent = _get_vino(parent);
13058
13059 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13060 tout(cct) << "ll_mknodx" << std::endl;
13061 tout(cct) << vparent.ino.val << std::endl;
13062 tout(cct) << name << std::endl;
13063 tout(cct) << mode << std::endl;
13064 tout(cct) << rdev << std::endl;
13065
13066 std::scoped_lock lock(client_lock);
13067
13068 if (!fuse_default_permissions) {
13069 int r = may_create(parent, perms);
13070 if (r < 0)
13071 return r;
13072 }
13073
13074 InodeRef in;
13075 int r = _mknod(parent, name, mode, rdev, perms, &in);
13076 if (r == 0) {
13077 fill_statx(in, caps, stx);
13078 _ll_get(in.get());
13079 } else { stx->stx_ino = 0; stx->stx_mask = 0; } // as in ll_mkdirx below
13080 tout(cct) << stx->stx_ino << std::endl;
13081 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13082 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13083 *out = in.get();
13084 return r;
13085 }
13086
13087 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
13088 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
13089 int object_size, const char *data_pool, bool *created,
13090 const UserPerm& perms, std::string alternate_name)
13091 {
13092 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
13093 mode << dec << ")" << dendl;
13094
13095 if (strlen(name) > NAME_MAX)
13096 return -CEPHFS_ENAMETOOLONG;
13097 if (dir->snapid != CEPH_NOSNAP) {
13098 return -CEPHFS_EROFS;
13099 }
13100 if (is_quota_files_exceeded(dir, perms)) {
13101 return -CEPHFS_EDQUOT;
13102 }
13103
13104 // use normalized flags to generate cmode
13105 int cflags = ceph_flags_sys2wire(flags);
13106 if (cct->_conf.get_val<bool>("client_force_lazyio"))
13107 cflags |= CEPH_O_LAZY;
13108
13109 int cmode = ceph_flags_to_mode(cflags);
13110
13111 int64_t pool_id = -1;
13112 if (data_pool && *data_pool) {
13113 pool_id = objecter->with_osdmap(
13114 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
13115 if (pool_id < 0)
13116 return -CEPHFS_EINVAL;
13117 if (pool_id > 0xffffffffll)
13118 return -CEPHFS_ERANGE; // bummer!
13119 }
13120
13121 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
13122
13123 filepath path;
13124 dir->make_nosnap_relative_path(path);
13125 path.push_dentry(name);
13126 req->set_filepath(path);
13127 req->set_alternate_name(std::move(alternate_name));
13128 req->set_inode(dir);
13129 req->head.args.open.flags = cflags | CEPH_O_CREAT;
13130
13131 req->head.args.open.stripe_unit = stripe_unit;
13132 req->head.args.open.stripe_count = stripe_count;
13133 req->head.args.open.object_size = object_size;
13134 if (cct->_conf->client_debug_getattr_caps)
13135 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
13136 else
13137 req->head.args.open.mask = 0;
13138 req->head.args.open.pool = pool_id;
13139 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13140 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13141
13142 mode |= S_IFREG;
13143 bufferlist xattrs_bl;
13144 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13145 if (res < 0)
13146 goto fail;
13147 req->head.args.open.mode = mode;
13148 if (xattrs_bl.length() > 0)
13149 req->set_data(xattrs_bl);
13150
13151 Dentry *de;
13152 res = get_or_create(dir, name, &de);
13153 if (res < 0)
13154 goto fail;
13155 req->set_dentry(de);
13156
13157 res = make_request(req, perms, inp, created);
13158 if (res < 0) {
13159 goto reply_error;
13160 }
13161
13162 /* If the caller passed a value in fhp, do the open */
13163 if (fhp) {
13164 (*inp)->get_open_ref(cmode);
13165 *fhp = _create_fh(inp->get(), flags, cmode, perms);
13166 }
13167
13168 reply_error:
13169 trim_cache();
13170
13171 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
13172 << " layout " << stripe_unit
13173 << ' ' << stripe_count
13174 << ' ' << object_size
13175 <<") = " << res << dendl;
13176 return res;
13177
13178 fail:
13179 put_request(req);
13180 return res;
13181 }
13182
13183 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
13184 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13185 std::string alternate_name)
13186 {
13187 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
13188 << mode << dec << ", uid " << perm.uid()
13189 << ", gid " << perm.gid() << ")" << dendl;
13190
13191 if (strlen(name) > NAME_MAX)
13192 return -CEPHFS_ENAMETOOLONG;
13193
13194 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13195 return -CEPHFS_EROFS;
13196 }
13197 if (is_quota_files_exceeded(dir, perm)) {
13198 return -CEPHFS_EDQUOT;
13199 }
13200
13201 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13202 MetaRequest *req = new MetaRequest(is_snap_op ?
13203 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13204
13205 filepath path;
13206 dir->make_nosnap_relative_path(path);
13207 path.push_dentry(name);
13208 req->set_filepath(path);
13209 req->set_inode(dir);
13210 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13211 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13212 req->set_alternate_name(std::move(alternate_name));
13213
13214 mode |= S_IFDIR;
13215 bufferlist bl;
13216 int res = _posix_acl_create(dir, &mode, bl, perm);
13217 if (res < 0)
13218 goto fail;
13219 req->head.args.mkdir.mode = mode;
13220 if (is_snap_op) {
13221 SnapPayload payload;
13222 // clear the bufferlist that may have been populated by the call
13223 // to _posix_acl_create(). MDS mksnap does not make use of it.
13224 // So, reuse it to pass metadata payload.
13225 bl.clear();
13226 payload.metadata = metadata;
13227 encode(payload, bl);
13228 }
13229 if (bl.length() > 0) {
13230 req->set_data(bl);
13231 }
13232
13233 Dentry *de;
13234 res = get_or_create(dir, name, &de);
13235 if (res < 0)
13236 goto fail;
13237 req->set_dentry(de);
13238
13239 ldout(cct, 10) << "_mkdir: making request" << dendl;
13240 res = make_request(req, perm, inp);
13241 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13242
13243 trim_cache();
13244
13245 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13246 return res;
13247
13248 fail:
13249 put_request(req);
13250 return res;
13251 }
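/*
 * Note that _mkdir() doubles as snapshot creation: when `dir` is the
 * virtual snapdir (snapid == CEPH_SNAPDIR) the op becomes
 * CEPH_MDS_OP_MKSNAP and the data payload carries the SnapPayload
 * metadata instead of ACL xattrs. Hypothetical caller-side sketch,
 * assuming "<path>/.snap" has already been resolved into `snapdir`:
 *
 *   InodeRef out;
 *   int r = cl->_mkdir(snapdir.get(), "mysnap", 0755, perms, &out,
 *                      {{"owner", "me"}});   // lands in SnapPayload::metadata
 */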
13252
13253 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13254 struct stat *attr, Inode **out, const UserPerm& perm)
13255 {
13256 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13257 if (!mref_reader.is_state_satisfied())
13258 return -CEPHFS_ENOTCONN;
13259
13260 vinodeno_t vparent = _get_vino(parent);
13261
13262 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13263 tout(cct) << "ll_mkdir" << std::endl;
13264 tout(cct) << vparent.ino.val << std::endl;
13265 tout(cct) << name << std::endl;
13266 tout(cct) << mode << std::endl;
13267
13268 std::scoped_lock lock(client_lock);
13269
13270 if (!fuse_default_permissions) {
13271 int r = may_create(parent, perm);
13272 if (r < 0)
13273 return r;
13274 }
13275
13276 InodeRef in;
13277 int r = _mkdir(parent, name, mode, perm, &in);
13278 if (r == 0) {
13279 fill_stat(in, attr);
13280 _ll_get(in.get());
13281 } else { attr->st_ino = 0; } // don't log an uninitialized st_ino below
13282 tout(cct) << attr->st_ino << std::endl;
13283 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13284 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13285 *out = in.get();
13286 return r;
13287 }
13288
13289 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13290 struct ceph_statx *stx, unsigned want, unsigned flags,
13291 const UserPerm& perms)
13292 {
13293 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13294 if (!mref_reader.is_state_satisfied())
13295 return -CEPHFS_ENOTCONN;
13296
13297 vinodeno_t vparent = _get_vino(parent);
13298
13299 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13300 tout(cct) << "ll_mkdirx" << std::endl;
13301 tout(cct) << vparent.ino.val << std::endl;
13302 tout(cct) << name << std::endl;
13303 tout(cct) << mode << std::endl;
13304
13305 std::scoped_lock lock(client_lock);
13306
13307 if (!fuse_default_permissions) {
13308 int r = may_create(parent, perms);
13309 if (r < 0)
13310 return r;
13311 }
13312
13313 InodeRef in;
13314 int r = _mkdir(parent, name, mode, perms, &in);
13315 if (r == 0) {
13316 fill_statx(in, statx_to_mask(flags, want), stx);
13317 _ll_get(in.get());
13318 } else {
13319 stx->stx_ino = 0;
13320 stx->stx_mask = 0;
13321 }
13322 tout(cct) << stx->stx_ino << std::endl;
13323 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13324 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13325 *out = in.get();
13326 return r;
13327 }
13328
13329 int Client::_symlink(Inode *dir, const char *name, const char *target,
13330 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13331 {
13332 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13333 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13334 << dendl;
13335
13336 if (strlen(name) > NAME_MAX)
13337 return -CEPHFS_ENAMETOOLONG;
13338
13339 if (dir->snapid != CEPH_NOSNAP) {
13340 return -CEPHFS_EROFS;
13341 }
13342 if (is_quota_files_exceeded(dir, perms)) {
13343 return -CEPHFS_EDQUOT;
13344 }
13345
13346 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13347
13348 filepath path;
13349 dir->make_nosnap_relative_path(path);
13350 path.push_dentry(name);
13351 req->set_filepath(path);
13352 req->set_alternate_name(std::move(alternate_name));
13353 req->set_inode(dir);
13354 req->set_string2(target);
13355 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13356 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13357
13358 Dentry *de;
13359 int res = get_or_create(dir, name, &de);
13360 if (res < 0)
13361 goto fail;
13362 req->set_dentry(de);
13363
13364 res = make_request(req, perms, inp);
13365
13366 trim_cache();
13367 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13368 res << dendl;
13369 return res;
13370
13371 fail:
13372 put_request(req);
13373 return res;
13374 }
13375
13376 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13377 struct stat *attr, Inode **out, const UserPerm& perms)
13378 {
13379 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13380 if (!mref_reader.is_state_satisfied())
13381 return -CEPHFS_ENOTCONN;
13382
13383 vinodeno_t vparent = _get_vino(parent);
13384
13385 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13386 << dendl;
13387 tout(cct) << "ll_symlink" << std::endl;
13388 tout(cct) << vparent.ino.val << std::endl;
13389 tout(cct) << name << std::endl;
13390 tout(cct) << value << std::endl;
13391
13392 std::scoped_lock lock(client_lock);
13393
13394 if (!fuse_default_permissions) {
13395 int r = may_create(parent, perms);
13396 if (r < 0)
13397 return r;
13398 }
13399
13400 InodeRef in;
13401 int r = _symlink(parent, name, value, perms, "", &in);
13402 if (r == 0) {
13403 fill_stat(in, attr);
13404 _ll_get(in.get());
13405 } else { attr->st_ino = 0; } // don't log an uninitialized st_ino below
13406 tout(cct) << attr->st_ino << std::endl;
13407 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13408 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13409 *out = in.get();
13410 return r;
13411 }
13412
13413 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13414 Inode **out, struct ceph_statx *stx, unsigned want,
13415 unsigned flags, const UserPerm& perms)
13416 {
13417 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13418 if (!mref_reader.is_state_satisfied())
13419 return -CEPHFS_ENOTCONN;
13420
13421 vinodeno_t vparent = _get_vino(parent);
13422
13423 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13424 << dendl;
13425 tout(cct) << "ll_symlinkx" << std::endl;
13426 tout(cct) << vparent.ino.val << std::endl;
13427 tout(cct) << name << std::endl;
13428 tout(cct) << value << std::endl;
13429
13430 std::scoped_lock lock(client_lock);
13431
13432 if (!fuse_default_permissions) {
13433 int r = may_create(parent, perms);
13434 if (r < 0)
13435 return r;
13436 }
13437
13438 InodeRef in;
13439 int r = _symlink(parent, name, value, perms, "", &in);
13440 if (r == 0) {
13441 fill_statx(in, statx_to_mask(flags, want), stx);
13442 _ll_get(in.get());
13443 } else { stx->stx_ino = 0; stx->stx_mask = 0; } // as in ll_mkdirx above
13444 tout(cct) << stx->stx_ino << std::endl;
13445 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13446 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13447 *out = in.get();
13448 return r;
13449 }
13450
13451 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13452 {
13453 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13454 << " uid " << perm.uid() << " gid " << perm.gid()
13455 << ")" << dendl;
13456
13457 if (dir->snapid != CEPH_NOSNAP) {
13458 return -CEPHFS_EROFS;
13459 }
13460
13461 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13462
13463 filepath path;
13464 dir->make_nosnap_relative_path(path);
13465 path.push_dentry(name);
13466 req->set_filepath(path);
13467
13468 InodeRef otherin;
13469 Inode *in;
13470 Dentry *de;
13471
13472 int res = get_or_create(dir, name, &de);
13473 if (res < 0)
13474 goto fail;
13475 req->set_dentry(de);
13476 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13477 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13478
13479 res = _lookup(dir, name, 0, &otherin, perm);
13480 if (res < 0)
13481 goto fail;
13482
13483 in = otherin.get();
13484 req->set_other_inode(in);
13485 in->break_all_delegs();
13486 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13487
13488 req->set_inode(dir);
13489
13490 res = make_request(req, perm);
13491
13492 trim_cache();
13493 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13494 return res;
13495
13496 fail:
13497 put_request(req);
13498 return res;
13499 }
13500
13501 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13502 {
13503 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13504 if (!mref_reader.is_state_satisfied())
13505 return -CEPHFS_ENOTCONN;
13506
13507 vinodeno_t vino = _get_vino(in);
13508
13509 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13510 tout(cct) << "ll_unlink" << std::endl;
13511 tout(cct) << vino.ino.val << std::endl;
13512 tout(cct) << name << std::endl;
13513
13514 std::scoped_lock lock(client_lock);
13515
13516 if (!fuse_default_permissions) {
13517 int r = may_delete(in, name, perm);
13518 if (r < 0)
13519 return r;
13520 }
13521 return _unlink(in, name, perm);
13522 }
13523
13524 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13525 {
13526 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13527 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13528
13529 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13530 return -CEPHFS_EROFS;
13531 }
13532
13533 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13534 MetaRequest *req = new MetaRequest(op);
13535 filepath path;
13536 dir->make_nosnap_relative_path(path);
13537 path.push_dentry(name);
13538 req->set_filepath(path);
13539 req->set_inode(dir);
13540
13541 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13542 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13543 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13544
13545 InodeRef in;
13546
13547 Dentry *de;
13548 int res = get_or_create(dir, name, &de);
13549 if (res < 0)
13550 goto fail;
13551 if (op == CEPH_MDS_OP_RMDIR)
13552 req->set_dentry(de);
13553 else
13554 de->get();
13555
13556 res = _lookup(dir, name, 0, &in, perms);
13557 if (res < 0)
13558 goto fail;
13559
13560 if (op == CEPH_MDS_OP_RMSNAP) {
13561 unlink(de, true, true);
13562 de->put();
13563 }
13564 req->set_other_inode(in.get());
13565
13566 res = make_request(req, perms);
13567
13568 trim_cache();
13569 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13570 return res;
13571
13572 fail:
13573 put_request(req);
13574 return res;
13575 }
13576
13577 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13578 {
13579 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13580 if (!mref_reader.is_state_satisfied())
13581 return -CEPHFS_ENOTCONN;
13582
13583 vinodeno_t vino = _get_vino(in);
13584
13585 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13586 tout(cct) << "ll_rmdir" << std::endl;
13587 tout(cct) << vino.ino.val << std::endl;
13588 tout(cct) << name << std::endl;
13589
13590 std::scoped_lock lock(client_lock);
13591
13592 if (!fuse_default_permissions) {
13593 int r = may_delete(in, name, perms);
13594 if (r < 0)
13595 return r;
13596 }
13597
13598 return _rmdir(in, name, perms);
13599 }
13600
13601 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13602 {
13603 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13604 << todir->ino << " " << toname
13605 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13606 << dendl;
13607
13608 if (fromdir->snapid != todir->snapid)
13609 return -CEPHFS_EXDEV;
13610
13611 int op = CEPH_MDS_OP_RENAME;
13612 if (fromdir->snapid != CEPH_NOSNAP) {
13613 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13614 op = CEPH_MDS_OP_RENAMESNAP;
13615 else
13616 return -CEPHFS_EROFS;
13617 }
13618 if (fromdir != todir) {
13619 Inode *fromdir_root =
13620 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13621 Inode *todir_root =
13622 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13623 if (fromdir_root != todir_root) {
13624 return -CEPHFS_EXDEV;
13625 }
13626 }
13627
13628 InodeRef target;
13629 MetaRequest *req = new MetaRequest(op);
13630
13631 filepath from;
13632 fromdir->make_nosnap_relative_path(from);
13633 from.push_dentry(fromname);
13634 filepath to;
13635 todir->make_nosnap_relative_path(to);
13636 to.push_dentry(toname);
13637 req->set_filepath(to);
13638 req->set_filepath2(from);
13639 req->set_alternate_name(std::move(alternate_name));
13640
13641 Dentry *oldde;
13642 int res = get_or_create(fromdir, fromname, &oldde);
13643 if (res < 0)
13644 goto fail;
13645 Dentry *de;
13646 res = get_or_create(todir, toname, &de);
13647 if (res < 0)
13648 goto fail;
13649
13650 if (op == CEPH_MDS_OP_RENAME) {
13651 req->set_old_dentry(oldde);
13652 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13653 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13654
13655 req->set_dentry(de);
13656 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13657 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13658
13659 InodeRef oldin, otherin;
13660 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13661 if (res < 0)
13662 goto fail;
13663
13664 Inode *oldinode = oldin.get();
13665 oldinode->break_all_delegs();
13666 req->set_old_inode(oldinode);
13667 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13668
13669 res = _lookup(todir, toname, 0, &otherin, perm);
13670 switch (res) {
13671 case 0:
13672 {
13673 Inode *in = otherin.get();
13674 req->set_other_inode(in);
13675 in->break_all_delegs();
13676 }
13677 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13678 break;
13679 case -CEPHFS_ENOENT:
13680 break;
13681 default:
13682 goto fail;
13683 }
13684
13685 req->set_inode(todir);
13686 } else {
13687 // renamesnap reply contains no tracedn, so we need to invalidate
13688 // the dentries manually
13689 unlink(oldde, true, true);
13690 unlink(de, true, true);
13691
13692 req->set_inode(todir);
13693 }
13694
13695 res = make_request(req, perm, &target);
13696 ldout(cct, 10) << "rename result is " << res << dendl;
13697
13698 // prune any stale dentries left behind by the rename
13699
13700 trim_cache();
13701 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
13702 return res;
13703
13704 fail:
13705 put_request(req);
13706 return res;
13707 }
13708
13709 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13710 const char *newname, const UserPerm& perm)
13711 {
13712 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13713 if (!mref_reader.is_state_satisfied())
13714 return -CEPHFS_ENOTCONN;
13715
13716 vinodeno_t vparent = _get_vino(parent);
13717 vinodeno_t vnewparent = _get_vino(newparent);
13718
13719 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13720 << vnewparent << " " << newname << dendl;
13721 tout(cct) << "ll_rename" << std::endl;
13722 tout(cct) << vparent.ino.val << std::endl;
13723 tout(cct) << name << std::endl;
13724 tout(cct) << vnewparent.ino.val << std::endl;
13725 tout(cct) << newname << std::endl;
13726
13727 std::scoped_lock lock(client_lock);
13728
13729 if (!fuse_default_permissions) {
13730 int r = may_delete(parent, name, perm);
13731 if (r < 0)
13732 return r;
13733 r = may_delete(newparent, newname, perm);
13734 if (r < 0 && r != -CEPHFS_ENOENT)
13735 return r;
13736 }
13737
13738 return _rename(parent, name, newparent, newname, perm, "");
13739 }
13740
13741 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
13742 {
13743 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
13744 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13745
13746 if (strlen(newname) > NAME_MAX)
13747 return -CEPHFS_ENAMETOOLONG;
13748
13749 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
13750 return -CEPHFS_EROFS;
13751 }
13752 if (is_quota_files_exceeded(dir, perm)) {
13753 return -CEPHFS_EDQUOT;
13754 }
13755
13756 in->break_all_delegs();
13757 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13758
13759 filepath path(newname, dir->ino);
13760 req->set_filepath(path);
13761 req->set_alternate_name(std::move(alternate_name));
13762 filepath existing(in->ino);
13763 req->set_filepath2(existing);
13764
13765 req->set_inode(dir);
13766 req->inode_drop = CEPH_CAP_FILE_SHARED;
13767 req->inode_unless = CEPH_CAP_FILE_EXCL;
13768
13769 Dentry *de;
13770 int res = get_or_create(dir, newname, &de);
13771 if (res < 0)
13772 goto fail;
13773 req->set_dentry(de);
13774
13775 res = make_request(req, perm, inp);
13776 ldout(cct, 10) << "link result is " << res << dendl;
13777
13778 trim_cache();
13779 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
13780 return res;
13781
13782 fail:
13783 put_request(req);
13784 return res;
13785 }
13786
13787 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13788 const UserPerm& perm)
13789 {
13790 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13791 if (!mref_reader.is_state_satisfied())
13792 return -CEPHFS_ENOTCONN;
13793
13794 vinodeno_t vino = _get_vino(in);
13795 vinodeno_t vnewparent = _get_vino(newparent);
13796
13797 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13798 newname << dendl;
13799 tout(cct) << "ll_link" << std::endl;
13800 tout(cct) << vino.ino.val << std::endl;
13801 tout(cct) << vnewparent << std::endl;
13802 tout(cct) << newname << std::endl;
13803
13804 InodeRef target;
13805
13806 std::scoped_lock lock(client_lock);
13807
13808 if (!fuse_default_permissions) {
13809 if (S_ISDIR(in->mode))
13810 return -CEPHFS_EPERM;
13811
13812 int r = may_hardlink(in, perm);
13813 if (r < 0)
13814 return r;
13815
13816 r = may_create(newparent, perm);
13817 if (r < 0)
13818 return r;
13819 }
13820
13821 return _link(in, newparent, newname, perm, "", &target);
13822 }
13823
13824 int Client::ll_num_osds(void)
13825 {
13826 std::scoped_lock lock(client_lock);
13827 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13828 }
13829
13830 int Client::ll_osdaddr(int osd, uint32_t *addr)
13831 {
13832 std::scoped_lock lock(client_lock);
13833
13834 entity_addr_t g;
13835 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13836 if (!o.exists(osd))
13837 return false;
13838 g = o.get_addrs(osd).front();
13839 return true;
13840 });
13841 if (!exists)
13842 return -1;
13843 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13844 *addr = ntohl(nb_addr);
13845 return 0;
13846 }
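/*
 * Note: ll_osdaddr() reports only the first entry of the OSD's address
 * vector and squeezes it through in4_addr(), so it is meaningful for
 * legacy IPv4 deployments only; get_osd_addr() further below returns the
 * full entity_addr_t instead.
 */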
13847
13848 uint32_t Client::ll_stripe_unit(Inode *in)
13849 {
13850 std::scoped_lock lock(client_lock);
13851 return in->layout.stripe_unit;
13852 }
13853
13854 uint64_t Client::ll_snap_seq(Inode *in)
13855 {
13856 std::scoped_lock lock(client_lock);
13857 return in->snaprealm->seq;
13858 }
13859
13860 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13861 {
13862 std::scoped_lock lock(client_lock);
13863 *layout = in->layout;
13864 return 0;
13865 }
13866
13867 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13868 {
13869 return ll_file_layout(fh->inode.get(), layout);
13870 }
13871
13872 /* Currently we cannot take advantage of redundancy in reads, since we
13873 would have to go through all possible placement groups (a
13874 potentially quite large number determined by a hash), and use CRUSH
13875 to calculate the appropriate set of OSDs for each placement group,
13876 then index into that. An array with one entry per OSD is much more
13877 tractable and works for demonstration purposes. */
13878
13879 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13880 file_layout_t* layout)
13881 {
13882 std::scoped_lock lock(client_lock);
13883
13884 inodeno_t ino = in->ino;
13885 uint32_t object_size = layout->object_size;
13886 uint32_t su = layout->stripe_unit;
13887 uint32_t stripe_count = layout->stripe_count;
13888 uint64_t stripes_per_object = object_size / su;
13889 uint64_t stripeno = 0, stripepos = 0;
13890
13891 if (stripe_count) {
13892 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13893 stripepos = blockno % stripe_count; // which object in the object set (X)
13894 }
13895 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13896 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13897
13898 object_t oid = file_object_t(ino, objectno);
13899 return objecter->with_osdmap([&](const OSDMap& o) {
13900 ceph_object_layout olayout =
13901 o.file_to_object_layout(oid, *layout);
13902 pg_t pg = (pg_t)olayout.ol_pgid;
13903 vector<int> osds;
13904 int primary;
13905 o.pg_to_acting_osds(pg, &osds, &primary);
13906 return primary;
13907 });
13908 }
13909
13910 /* Return the offset of the block, internal to the object */
13911
13912 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13913 {
13914 std::scoped_lock lock(client_lock);
13915 file_layout_t *layout = &(in->layout);
13916 uint32_t object_size = layout->object_size;
13917 uint32_t su = layout->stripe_unit;
13918 uint64_t stripes_per_object = object_size / su;
13919
13920 return (blockno % stripes_per_object) * su;
13921 }
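/*
 * Worked example of the striping arithmetic above, with illustrative
 * values: object_size = 4 MiB, stripe_unit = 1 MiB, stripe_count = 4,
 * blockno = 10.
 *
 *   stripes_per_object = 4 MiB / 1 MiB = 4
 *   stripeno    = 10 / 4 = 2      // third horizontal stripe
 *   stripepos   = 10 % 4 = 2      // third object in the object set
 *   objectsetno = 2 / 4  = 0      // first object set
 *   objectno    = 0 * 4 + 2 = 2   // -> file_object_t(ino, 2)
 *   ll_get_internal_offset: (10 % 4) * 1 MiB = 2 MiB into that object
 */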
13922
13923 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13924 const UserPerm& perms)
13925 {
13926 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13927 if (!mref_reader.is_state_satisfied())
13928 return -CEPHFS_ENOTCONN;
13929
13930 vinodeno_t vino = _get_vino(in);
13931
13932 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13933 tout(cct) << "ll_opendir" << std::endl;
13934 tout(cct) << vino.ino.val << std::endl;
13935
13936 std::scoped_lock lock(client_lock);
13937
13938 if (!fuse_default_permissions) {
13939 int r = may_open(in, flags, perms);
13940 if (r < 0)
13941 return r;
13942 }
13943
13944 int r = _opendir(in, dirpp, perms);
13945 tout(cct) << (uintptr_t)*dirpp << std::endl;
13946
13947 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13948 << dendl;
13949 return r;
13950 }
13951
13952 int Client::ll_releasedir(dir_result_t *dirp)
13953 {
13954 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13955 if (!mref_reader.is_state_satisfied())
13956 return -CEPHFS_ENOTCONN;
13957
13958 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13959 tout(cct) << "ll_releasedir" << std::endl;
13960 tout(cct) << (uintptr_t)dirp << std::endl;
13961
13962 std::scoped_lock lock(client_lock);
13963
13964 _closedir(dirp);
13965 return 0;
13966 }
13967
13968 int Client::ll_fsyncdir(dir_result_t *dirp)
13969 {
13970 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13971 if (!mref_reader.is_state_satisfied())
13972 return -CEPHFS_ENOTCONN;
13973
13974 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13975 tout(cct) << "ll_fsyncdir" << std::endl;
13976 tout(cct) << (uintptr_t)dirp << std::endl;
13977
13978 std::scoped_lock lock(client_lock);
13979 return _fsync(dirp->inode.get(), false);
13980 }
13981
13982 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13983 {
13984 ceph_assert(!(flags & O_CREAT));
13985
13986 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13987 if (!mref_reader.is_state_satisfied())
13988 return -CEPHFS_ENOTCONN;
13989
13990 vinodeno_t vino = _get_vino(in);
13991
13992 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13993 tout(cct) << "ll_open" << std::endl;
13994 tout(cct) << vino.ino.val << std::endl;
13995 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13996
13997 std::scoped_lock lock(client_lock);
13998
13999 int r;
14000 if (!fuse_default_permissions) {
14001 r = may_open(in, flags, perms);
14002 if (r < 0)
14003 goto out;
14004 }
14005
14006 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14007
14008 out:
14009 Fh *fhptr = fhp ? *fhp : NULL;
14010 if (fhptr) {
14011 ll_unclosed_fh_set.insert(fhptr);
14012 }
14013 tout(cct) << (uintptr_t)fhptr << std::endl;
14014 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14015 " = " << r << " (" << fhptr << ")" << dendl;
14016 return r;
14017 }
14018
14019 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14020 int flags, InodeRef *in, int caps, Fh **fhp,
14021 const UserPerm& perms)
14022 {
14023 *fhp = NULL;
14024
14025 vinodeno_t vparent = _get_vino(parent);
14026
14027 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14028 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14029 << ", gid " << perms.gid() << dendl;
14030 tout(cct) << "ll_create" << std::endl;
14031 tout(cct) << vparent.ino.val << std::endl;
14032 tout(cct) << name << std::endl;
14033 tout(cct) << mode << std::endl;
14034 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14035
14036 bool created = false;
14037 int r = _lookup(parent, name, caps, in, perms);
14038
14039 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
14040 return -CEPHFS_EEXIST;
14041
14042 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
14043 if (!fuse_default_permissions) {
14044 r = may_create(parent, perms);
14045 if (r < 0)
14046 goto out;
14047 }
14048 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
14049 perms, "");
14050 if (r < 0)
14051 goto out;
14052 }
14053
14054 if (r < 0)
14055 goto out;
14056
14057 ceph_assert(*in);
14058
14059 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14060 if (!created) {
14061 if (!fuse_default_permissions) {
14062 r = may_open(in->get(), flags, perms);
14063 if (r < 0) {
14064 if (*fhp) {
14065 int release_r = _release_fh(*fhp);
14066 ceph_assert(release_r == 0); // during create, no async data ops should have happened
14067 }
14068 goto out;
14069 }
14070 }
14071 if (*fhp == NULL) {
14072 r = _open(in->get(), flags, mode, fhp, perms);
14073 if (r < 0)
14074 goto out;
14075 }
14076 }
14077
14078 out:
14079 if (*fhp) {
14080 ll_unclosed_fh_set.insert(*fhp);
14081 }
14082
14083 ino_t ino = 0;
14084 if (r >= 0) {
14085 Inode *inode = in->get();
14086 if (use_faked_inos())
14087 ino = inode->faked_ino;
14088 else
14089 ino = inode->ino;
14090 }
14091
14092 tout(cct) << (uintptr_t)*fhp << std::endl;
14093 tout(cct) << ino << std::endl;
14094 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14095 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14096 *fhp << " " << hex << ino << dec << ")" << dendl;
14097
14098 return r;
14099 }
14100
14101 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14102 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14103 const UserPerm& perms)
14104 {
14105 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14106 if (!mref_reader.is_state_satisfied())
14107 return -CEPHFS_ENOTCONN;
14108
14109 std::scoped_lock lock(client_lock);
14110 InodeRef in;
14111
14112 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14113 fhp, perms);
14114 if (r >= 0) {
14115 ceph_assert(in);
14116
14117 // passing an Inode in outp requires an additional ref
14118 if (outp) {
14119 _ll_get(in.get());
14120 *outp = in.get();
14121 }
14122 fill_stat(in, attr);
14123 } else {
14124 attr->st_ino = 0;
14125 }
14126
14127 return r;
14128 }
14129
14130 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14131 int oflags, Inode **outp, Fh **fhp,
14132 struct ceph_statx *stx, unsigned want, unsigned lflags,
14133 const UserPerm& perms)
14134 {
14135 unsigned caps = statx_to_mask(lflags, want);
14136 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14137 if (!mref_reader.is_state_satisfied())
14138 return -CEPHFS_ENOTCONN;
14139
14140 std::scoped_lock lock(client_lock);
14141 InodeRef in;
14142
14143 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14144 if (r >= 0) {
14145 ceph_assert(in);
14146
14147 // passing an Inode in outp requires an additional ref
14148 if (outp) {
14149 _ll_get(in.get());
14150 *outp = in.get();
14151 }
14152 fill_statx(in, caps, stx);
14153 } else {
14154 stx->stx_ino = 0;
14155 stx->stx_mask = 0;
14156 }
14157
14158 return r;
14159 }
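/*
 * Caller-side sketch of the create-and-open path (assumes `cl`, `parent`
 * and `perms`; want bits are the CEPH_STATX_* masks from ceph_ll_client.h):
 *
 *   Inode *out = nullptr;
 *   Fh *fh = nullptr;
 *   struct ceph_statx stx;
 *   int r = cl->ll_createx(parent, "newfile", 0644, O_CREAT | O_RDWR,
 *                          &out, &fh, &stx, CEPH_STATX_INO, 0, perms);
 *   // on success the caller owns an ll ref on `out` plus the open Fh,
 *   // and must eventually ll_release(fh) and drop the inode ref
 */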
14160
14161 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14162 {
14163 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14164 if (!mref_reader.is_state_satisfied())
14165 return -CEPHFS_ENOTCONN;
14166
14167 tout(cct) << "ll_lseek" << std::endl;
14168 tout(cct) << offset << std::endl;
14169 tout(cct) << whence << std::endl;
14170
14171 std::scoped_lock lock(client_lock);
14172 return _lseek(fh, offset, whence);
14173 }
14174
14175 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14176 {
14177 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14178 if (!mref_reader.is_state_satisfied())
14179 return -CEPHFS_ENOTCONN;
14180
14181 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14182 tout(cct) << "ll_read" << std::endl;
14183 tout(cct) << (uintptr_t)fh << std::endl;
14184 tout(cct) << off << std::endl;
14185 tout(cct) << len << std::endl;
14186
14187 /* We can't return a read length larger than INT_MAX, so clamp len to that */
14188 len = std::min(len, (loff_t)INT_MAX);
14189 std::scoped_lock lock(client_lock);
14190
14191 int r = _read(fh, off, len, bl);
14192 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14193 << dendl;
14194 return r;
14195 }
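/*
 * Minimal open/read/release flow through the ll_* interface (sketch;
 * assumes a mounted Client `cl`, an Inode `in` and UserPerm `perms`):
 *
 *   Fh *fh = nullptr;
 *   if (cl->ll_open(in, O_RDONLY, &fh, perms) == 0) {
 *     bufferlist bl;
 *     int n = cl->ll_read(fh, 0, 4096, &bl);   // n = bytes read or -errno
 *     cl->ll_release(fh);
 *   }
 */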
14196
14197 int Client::ll_read_block(Inode *in, uint64_t blockid,
14198 char *buf,
14199 uint64_t offset,
14200 uint64_t length,
14201 file_layout_t* layout)
14202 {
14203 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14204 if (!mref_reader.is_state_satisfied())
14205 return -CEPHFS_ENOTCONN;
14206
14207 vinodeno_t vino = _get_vino(in);
14208 object_t oid = file_object_t(vino.ino, blockid);
14209 C_SaferCond onfinish;
14210 bufferlist bl;
14211
14212 objecter->read(oid,
14213 object_locator_t(layout->pool_id),
14214 offset,
14215 length,
14216 vino.snapid,
14217 &bl,
14218 CEPH_OSD_FLAG_READ,
14219 &onfinish);
14220
14221 int r = onfinish.wait();
14222 if (r >= 0) {
14223 bl.begin().copy(bl.length(), buf);
14224 r = bl.length();
14225 }
14226
14227 return r;
14228 }
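/*
 * Sketch: reading one block straight from RADOS via ll_read_block(),
 * with the blockid as computed by the striping helpers above (assumes
 * `cl` and `in`):
 *
 *   file_layout_t layout;
 *   cl->ll_file_layout(in, &layout);
 *   std::vector<char> buf(layout.stripe_unit);
 *   int n = cl->ll_read_block(in, 0, buf.data(), 0, buf.size(), &layout);
 *   // blockid 0, offset 0; n = bytes copied into buf on success
 */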
14229
14230 /* It appears that the OSD doesn't return success unless the entire
14231 buffer was written, so we return the write length on success. */
14232
14233 int Client::ll_write_block(Inode *in, uint64_t blockid,
14234 char* buf, uint64_t offset,
14235 uint64_t length, file_layout_t* layout,
14236 uint64_t snapseq, uint32_t sync)
14237 {
14238 vinodeno_t vino = ll_get_vino(in);
14239 int r = 0;
14240 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14241
14242 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14243 if (!mref_reader.is_state_satisfied())
14244 return -CEPHFS_ENOTCONN;
14245
14246 if (length == 0) {
14247 return -CEPHFS_EINVAL;
14248 }
14249 if (true || sync) {
14250 /* Note: the "true ||" above forces every write to be treated as
14251 * stable; the epilogue below waits on this C_SaferCond. */
14252 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
14253 }
14254 object_t oid = file_object_t(vino.ino, blockid);
14255 SnapContext fakesnap;
14256 ceph::bufferlist bl;
14257 if (length > 0) {
14258 bl.push_back(buffer::copy(buf, length));
14259 }
14260
14261 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14262 << dendl;
14263
14264 fakesnap.seq = snapseq;
14265
14266 /* lock just in time */
14267 objecter->write(oid,
14268 object_locator_t(layout->pool_id),
14269 offset,
14270 length,
14271 fakesnap,
14272 bl,
14273 ceph::real_clock::now(),
14274 0,
14275 onsafe.get());
14276
14277 if (nullptr != onsafe) {
14278 r = onsafe->wait();
14279 }
14280
14281 if (r < 0) {
14282 return r;
14283 } else {
14284 return length;
14285 }
14286 }
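/*
 * Companion sketch for ll_write_block(); the snap sequence should come
 * from the inode's realm (see ll_snap_seq() above). `data` and `len` are
 * assumed caller-supplied:
 *
 *   file_layout_t layout;
 *   cl->ll_file_layout(in, &layout);
 *   uint64_t seq = cl->ll_snap_seq(in);
 *   int n = cl->ll_write_block(in, 0, data, 0, len, &layout, seq, 1);
 *   // n == len on success; the call always waits for the OSD commit
 */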
14287
14288 int Client::ll_commit_blocks(Inode *in,
14289 uint64_t offset,
14290 uint64_t length)
14291 {
14292 /*
14293 BarrierContext *bctx;
14294 vinodeno_t vino = _get_vino(in);
14295 uint64_t ino = vino.ino;
14296
14297 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14298 << offset << " to " << length << dendl;
14299
14300 if (length == 0) {
14301 return -CEPHFS_EINVAL;
14302 }
14303
14304 std::scoped_lock lock(client_lock);
14305 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14306 if (p != barriers.end()) {
14307 barrier_interval civ(offset, offset + length);
14308 p->second->commit_barrier(civ);
14309 }
14310 */
14311 return 0;
14312 }
14313
14314 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14315 {
14316 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14317 "~" << len << dendl;
14318 tout(cct) << "ll_write" << std::endl;
14319 tout(cct) << (uintptr_t)fh << std::endl;
14320 tout(cct) << off << std::endl;
14321 tout(cct) << len << std::endl;
14322
14323 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14324 if (!mref_reader.is_state_satisfied())
14325 return -CEPHFS_ENOTCONN;
14326
14327 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14328 len = std::min(len, (loff_t)INT_MAX);
14329 std::scoped_lock lock(client_lock);
14330
14331 int r = _write(fh, off, len, data, NULL, 0);
14332 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14333 << dendl;
14334 return r;
14335 }
14336
14337 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14338 {
14339 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14340 if (!mref_reader.is_state_satisfied())
14341 return -CEPHFS_ENOTCONN;
14342
14343 std::unique_lock cl(client_lock);
14344 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false, cl);
14345 }
14346
14347 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14348 {
14349 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14350 if (!mref_reader.is_state_satisfied())
14351 return -CEPHFS_ENOTCONN;
14352
14353 std::unique_lock cl(client_lock);
14354 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false, cl);
14355 }
14356
14357 int Client::ll_flush(Fh *fh)
14358 {
14359 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14360 if (!mref_reader.is_state_satisfied())
14361 return -CEPHFS_ENOTCONN;
14362
14363 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14364 tout(cct) << "ll_flush" << std::endl;
14365 tout(cct) << (uintptr_t)fh << std::endl;
14366
14367 std::scoped_lock lock(client_lock);
14368 return _flush(fh);
14369 }
14370
14371 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14372 {
14373 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14374 if (!mref_reader.is_state_satisfied())
14375 return -CEPHFS_ENOTCONN;
14376
14377 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14378 tout(cct) << "ll_fsync" << std::endl;
14379 tout(cct) << (uintptr_t)fh << std::endl;
14380
14381 std::scoped_lock lock(client_lock);
14382 int r = _fsync(fh, syncdataonly);
14383 if (r) {
14384 // If we're returning an error, clear it from the FH
14385 fh->take_async_err();
14386 }
14387 return r;
14388 }
14389
14390 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14391 {
14392 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14393 if (!mref_reader.is_state_satisfied())
14394 return -CEPHFS_ENOTCONN;
14395
14396 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14397 tout(cct) << "ll_sync_inode" << std::endl;
14398 tout(cct) << (uintptr_t)in << std::endl;
14399
14400 std::scoped_lock lock(client_lock);
14401 return _fsync(in, syncdataonly);
14402 }
14403
14404 #ifdef FALLOC_FL_PUNCH_HOLE
14405
14406 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14407 {
14408 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14409
14410 if (offset < 0 || length <= 0)
14411 return -CEPHFS_EINVAL;
14412
14413 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14414 return -CEPHFS_EOPNOTSUPP;
14415
14416 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14417 return -CEPHFS_EOPNOTSUPP;
14418
14419 Inode *in = fh->inode.get();
14420
14421 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14422 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14423 return -CEPHFS_ENOSPC;
14424 }
14425
14426 if (in->snapid != CEPH_NOSNAP)
14427 return -CEPHFS_EROFS;
14428
14429 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14430 return -CEPHFS_EBADF;
14431
14432 uint64_t size = offset + length;
14433 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14434 size > in->size &&
14435 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14436 return -CEPHFS_EDQUOT;
14437 }
14438
14439 int have;
14440 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14441 if (r < 0)
14442 return r;
14443
14444 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14445 if (mode & FALLOC_FL_PUNCH_HOLE) {
14446 if (in->inline_version < CEPH_INLINE_NONE &&
14447 (have & CEPH_CAP_FILE_BUFFER)) {
14448 bufferlist bl;
14449 auto inline_iter = in->inline_data.cbegin();
14450 int len = in->inline_data.length();
14451 if (offset < len) {
14452 if (offset > 0)
14453 inline_iter.copy(offset, bl);
14454 int size = length;
14455 if (offset + size > len)
14456 size = len - offset;
14457 if (size > 0)
14458 bl.append_zero(size);
14459 if (offset + size < len) {
14460 inline_iter += size;
14461 inline_iter.copy(len - offset - size, bl);
14462 }
14463 in->inline_data = bl;
14464 in->inline_version++;
14465 }
14466 in->mtime = in->ctime = ceph_clock_now();
14467 in->change_attr++;
14468 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14469 } else {
14470 if (in->inline_version < CEPH_INLINE_NONE) {
14471 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14472 uninline_data(in, onuninline.get());
14473 }
14474
14475 C_SaferCond onfinish("Client::_punch_hole flock");
14476
14477 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14478
14479 _invalidate_inode_cache(in, offset, length);
14480 filer->zero(in->ino, &in->layout,
14481 in->snaprealm->get_snap_context(),
14482 offset, length,
14483 ceph::real_clock::now(),
14484 0, true, &onfinish);
14485 in->mtime = in->ctime = ceph_clock_now();
14486 in->change_attr++;
14487 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14488
14489 client_lock.unlock();
14490 onfinish.wait();
14491 client_lock.lock();
14492 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14493 }
14494 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14495 uint64_t size = offset + length;
14496 if (size > in->size) {
14497 in->size = size;
14498 in->mtime = in->ctime = ceph_clock_now();
14499 in->change_attr++;
14500 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14501
14502 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14503 check_caps(in, CHECK_CAPS_NODELAY);
14504 } else if (is_max_size_approaching(in)) {
14505 check_caps(in, 0);
14506 }
14507 }
14508 }
14509
14510 if (nullptr != onuninline) {
14511 client_lock.unlock();
14512 int ret = onuninline->wait();
14513 client_lock.lock();
14514
14515 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14516 in->inline_data.clear();
14517 in->inline_version = CEPH_INLINE_NONE;
14518 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14519 check_caps(in, 0);
14520 } else
14521 r = ret;
14522 }
14523
14524 put_cap_ref(in, CEPH_CAP_FILE_WR);
14525 return r;
14526 }
14527 #else
14528
14529 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14530 {
14531 return -CEPHFS_EOPNOTSUPP;
14532 }
14533
14534 #endif
14535
14536
14537 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14538 {
14539 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14540 if (!mref_reader.is_state_satisfied())
14541 return -CEPHFS_ENOTCONN;
14542
14543 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << dendl;
14544 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14545 tout(cct) << (uintptr_t)fh << std::endl;
14546
14547 std::scoped_lock lock(client_lock);
14548 return _fallocate(fh, mode, offset, length);
14549 }
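/*
 * Sketch: punching a hole (assumes `cl` and an open, writable `fh`).
 * Per _fallocate() above, FALLOC_FL_PUNCH_HOLE is accepted only together
 * with FALLOC_FL_KEEP_SIZE:
 *
 *   int r = cl->ll_fallocate(fh, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                            4096, 8192);   // zero bytes 4096..12287
 */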
14550
14551 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14552 {
14553 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14554 if (!mref_reader.is_state_satisfied())
14555 return -CEPHFS_ENOTCONN;
14556
14557 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14558
14559 std::scoped_lock lock(client_lock);
14560 Fh *fh = get_filehandle(fd);
14561 if (!fh)
14562 return -CEPHFS_EBADF;
14563 #if defined(__linux__) && defined(O_PATH)
14564 if (fh->flags & O_PATH)
14565 return -CEPHFS_EBADF;
14566 #endif
14567 return _fallocate(fh, mode, offset, length);
14568 }
14569
14570 int Client::ll_release(Fh *fh)
14571 {
14572 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14573 if (!mref_reader.is_state_satisfied())
14574 return -CEPHFS_ENOTCONN;
14575
14576 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14578 tout(cct) << __func__ << " (fh)" << std::endl;
14579 tout(cct) << (uintptr_t)fh << std::endl;
14580
14581 std::scoped_lock lock(client_lock);
14582
14583 if (ll_unclosed_fh_set.count(fh))
14584 ll_unclosed_fh_set.erase(fh);
14585 return _release_fh(fh);
14586 }
14587
14588 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14589 {
14590 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14591 if (!mref_reader.is_state_satisfied())
14592 return -CEPHFS_ENOTCONN;
14593
14594 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
14595 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
14596
14597 std::scoped_lock lock(client_lock);
14598 return _getlk(fh, fl, owner);
14599 }
14600
14601 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14602 {
14603 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14604 if (!mref_reader.is_state_satisfied())
14605 return -CEPHFS_ENOTCONN;
14606
14607 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14608 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14609
14610 std::scoped_lock lock(client_lock);
14611 return _setlk(fh, fl, owner, sleep);
14612 }
14613
14614 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14615 {
14616 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14617 if (!mref_reader.is_state_satisfied())
14618 return -CEPHFS_ENOTCONN;
14619
14620 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14621 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14622
14623 std::scoped_lock lock(client_lock);
14624 return _flock(fh, cmd, owner);
14625 }
14626
14627 int Client::set_deleg_timeout(uint32_t timeout)
14628 {
14629 std::scoped_lock lock(client_lock);
14630
14631 /*
14632 * The whole point is to prevent blocklisting so we must time out the
14633 * delegation before the session autoclose timeout kicks in.
14634 */
14635 if (timeout >= mdsmap->get_session_autoclose())
14636 return -CEPHFS_EINVAL;
14637
14638 deleg_timeout = timeout;
14639 return 0;
14640 }
14641
14642 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14643 {
14644 int ret = -CEPHFS_EINVAL;
14645
14646 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14647 if (!mref_reader.is_state_satisfied())
14648 return -CEPHFS_ENOTCONN;
14649
14650 std::scoped_lock lock(client_lock);
14651
14652 Inode *inode = fh->inode.get();
14653
14654 switch(cmd) {
14655 case CEPH_DELEGATION_NONE:
14656 inode->unset_deleg(fh);
14657 ret = 0;
14658 break;
14659 default:
14660 try {
14661 ret = inode->set_deleg(fh, cmd, cb, priv);
14662 } catch (std::bad_alloc&) {
14663 ret = -CEPHFS_ENOMEM;
14664 }
14665 break;
14666 }
14667 return ret;
14668 }
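/*
 * Delegation usage sketch (assumes `cl` and `fh`; the recall callback and
 * its priv pointer are caller-supplied). The timeout must be configured
 * first and has to stay below the MDS session autoclose interval:
 *
 *   static void my_recall_cb(Fh *fh, void *priv) { ... }  // hypothetical
 *
 *   cl->set_deleg_timeout(20);
 *   int r = cl->ll_delegation(fh, CEPH_DELEGATION_RD, my_recall_cb, priv);
 *   ...
 *   cl->ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr);  // release
 */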
14669
14670 class C_Client_RequestInterrupt : public Context {
14671 private:
14672 Client *client;
14673 MetaRequest *req;
14674 public:
14675 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14676 req->get();
14677 }
14678 void finish(int r) override {
14679 std::scoped_lock l(client->client_lock);
14680 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14681 client->_interrupt_filelock(req);
14682 client->put_request(req);
14683 }
14684 };
14685
14686 void Client::ll_interrupt(void *d)
14687 {
14688 MetaRequest *req = static_cast<MetaRequest*>(d);
14689 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14690 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
14691 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14692 }
14693
14694 // =========================================
14695 // layout
14696
14697 // expose file layouts
14698
14699 int Client::describe_layout(const char *relpath, file_layout_t *lp,
14700 const UserPerm& perms)
14701 {
14702 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14703 if (!mref_reader.is_state_satisfied())
14704 return -CEPHFS_ENOTCONN;
14705
14706 std::scoped_lock lock(client_lock);
14707
14708 filepath path(relpath);
14709 InodeRef in;
14710 int r = path_walk(path, &in, perms);
14711 if (r < 0)
14712 return r;
14713
14714 *lp = in->layout;
14715
14716 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
14717 return 0;
14718 }
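/*
 * Sketch: inspecting a file's layout by path (assumes `cl` and `perms`):
 *
 *   file_layout_t layout;
 *   if (cl->describe_layout("dir/file", &layout, perms) == 0)
 *     ldout(cct, 10) << "su=" << layout.stripe_unit
 *                    << " sc=" << layout.stripe_count
 *                    << " os=" << layout.object_size << dendl;
 */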
14719
14720 int Client::fdescribe_layout(int fd, file_layout_t *lp)
14721 {
14722 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14723 if (!mref_reader.is_state_satisfied())
14724 return -CEPHFS_ENOTCONN;
14725
14726 std::scoped_lock lock(client_lock);
14727
14728 Fh *f = get_filehandle(fd);
14729 if (!f)
14730 return -CEPHFS_EBADF;
14731 Inode *in = f->inode.get();
14732
14733 *lp = in->layout;
14734
14735 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
14736 return 0;
14737 }
14738
14739 int64_t Client::get_default_pool_id()
14740 {
14741 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14742 if (!mref_reader.is_state_satisfied())
14743 return -CEPHFS_ENOTCONN;
14744
14745 std::scoped_lock lock(client_lock);
14746
14747 /* first data pool is the default */
14748 return mdsmap->get_first_data_pool();
14749 }
14750
14751 // expose osdmap
14752
14753 int64_t Client::get_pool_id(const char *pool_name)
14754 {
14755 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14756 if (!mref_reader.is_state_satisfied())
14757 return -CEPHFS_ENOTCONN;
14758
14759 std::scoped_lock lock(client_lock);
14760
14761 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14762 pool_name);
14763 }
14764
14765 string Client::get_pool_name(int64_t pool)
14766 {
14767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14768 if (!mref_reader.is_state_satisfied())
14769 return string();
14770
14771 std::scoped_lock lock(client_lock);
14772
14773 return objecter->with_osdmap([pool](const OSDMap& o) {
14774 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14775 });
14776 }
14777
14778 int Client::get_pool_replication(int64_t pool)
14779 {
14780 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14781 if (!mref_reader.is_state_satisfied())
14782 return -CEPHFS_ENOTCONN;
14783
14784 std::scoped_lock lock(client_lock);
14785
14786 return objecter->with_osdmap([pool](const OSDMap& o) {
14787 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
14788 });
14789 }
14790
14791 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14792 {
14793 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14794 if (!mref_reader.is_state_satisfied())
14795 return -CEPHFS_ENOTCONN;
14796
14797 std::scoped_lock lock(client_lock);
14798
14799 Fh *f = get_filehandle(fd);
14800 if (!f)
14801 return -CEPHFS_EBADF;
14802 Inode *in = f->inode.get();
14803
14804 vector<ObjectExtent> extents;
14805 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
14806 ceph_assert(extents.size() == 1);
14807
14808 objecter->with_osdmap([&](const OSDMap& o) {
14809 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14810 o.pg_to_acting_osds(pg, osds);
14811 });
14812
14813 if (osds.empty())
14814 return -CEPHFS_EINVAL;
14815
14816 /*
14817 * Return the remainder of the extent (stripe unit)
14818 *
14819 * If length = 1 is passed to Striper::file_to_extents we get a single
14820 * extent back, but its length is one byte, so we still need to compute
14821 * the distance to the end of the stripe unit ourselves.
14822 *
14823 * If length = su then we may get 1 or 2 objects back in the extents vector
14824 * which would have to be examined. Even then, the offsets are local to the
14825 * object, so matching up to the file offset is extra work.
14826 *
14827 * It seems simpler to stick with length = 1 and manually compute the
14828 * remainder.
14829 */
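/*
 * Worked example (illustrative numbers): with stripe_unit = 4 MiB and
 * off = 5 MiB, off % su = 1 MiB, so *len = 4 MiB - 1 MiB = 3 MiB,
 * the distance from off to the end of its stripe unit.
 */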
14830 if (len) {
14831 uint64_t su = in->layout.stripe_unit;
14832 *len = su - (off % su);
14833 }
14834
14835 return 0;
14836 }
14837
14838 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14839 {
14840 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14841 if (!mref_reader.is_state_satisfied())
14842 return -CEPHFS_ENOTCONN;
14843
14844 std::scoped_lock lock(client_lock);
14845
14846 if (id < 0)
14847 return -CEPHFS_EINVAL;
14848 return objecter->with_osdmap([&](const OSDMap& o) {
14849 return o.crush->get_full_location_ordered(id, path);
14850 });
14851 }
14852
14853 int Client::get_file_stripe_address(int fd, loff_t offset,
14854 vector<entity_addr_t>& address)
14855 {
14856 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14857 if (!mref_reader.is_state_satisfied())
14858 return -CEPHFS_ENOTCONN;
14859
14860 std::scoped_lock lock(client_lock);
14861
14862 Fh *f = get_filehandle(fd);
14863 if (!f)
14864 return -CEPHFS_EBADF;
14865 Inode *in = f->inode.get();
14866
14867 // which object?
14868 vector<ObjectExtent> extents;
14869 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
14870 in->truncate_size, extents);
14871 ceph_assert(extents.size() == 1);
14872
14873 // now we have the object and its 'layout'
14874 return objecter->with_osdmap([&](const OSDMap& o) {
14875 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14876 vector<int> osds;
14877 o.pg_to_acting_osds(pg, osds);
14878 if (osds.empty())
14879 return -CEPHFS_EINVAL;
14880 for (unsigned i = 0; i < osds.size(); i++) {
14881 entity_addr_t addr = o.get_addrs(osds[i]).front();
14882 address.push_back(addr);
14883 }
14884 return 0;
14885 });
14886 }
14887
14888 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14889 {
14890 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14891 if (!mref_reader.is_state_satisfied())
14892 return -CEPHFS_ENOTCONN;
14893
14894 std::scoped_lock lock(client_lock);
14895
14896 return objecter->with_osdmap([&](const OSDMap& o) {
14897 if (!o.exists(osd))
14898 return -CEPHFS_ENOENT;
14899
14900 addr = o.get_addrs(osd).front();
14901 return 0;
14902 });
14903 }
14904
14905 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14906 loff_t length, loff_t offset)
14907 {
14908 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14909 if (!mref_reader.is_state_satisfied())
14910 return -CEPHFS_ENOTCONN;
14911
14912 std::scoped_lock lock(client_lock);
14913
14914 Fh *f = get_filehandle(fd);
14915 if (!f)
14916 return -CEPHFS_EBADF;
14917 Inode *in = f->inode.get();
14918
14919 // map to a list of extents
14920 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14921
14922 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14923 return 0;
14924 }
14925
14926
14927 /* find an OSD with the same IP; returns -CEPHFS_ENXIO if none. */
14928 int Client::get_local_osd()
14929 {
14930 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14931 if (!mref_reader.is_state_satisfied())
14932 return -CEPHFS_ENOTCONN;
14933
14934 std::scoped_lock lock(client_lock);
14935
14936 objecter->with_osdmap([this](const OSDMap& o) {
14937 if (o.get_epoch() != local_osd_epoch) {
14938 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
14939 local_osd_epoch = o.get_epoch();
14940 }
14941 });
14942 return local_osd;
14943 }
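// Note: the lookup above is cached per OSDMap epoch; find_osd_on_ip() is only
// re-run once the map epoch changes, so repeated calls are cheap.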
14944
14945
14946
14947
14948
14949
14950 // ===============================
14951
14952 void Client::ms_handle_connect(Connection *con)
14953 {
14954 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14955 }
14956
14957 bool Client::ms_handle_reset(Connection *con)
14958 {
14959 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14960 return false;
14961 }
14962
14963 void Client::ms_handle_remote_reset(Connection *con)
14964 {
14965 std::scoped_lock lock(client_lock);
14966 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14967 switch (con->get_peer_type()) {
14968 case CEPH_ENTITY_TYPE_MDS:
14969 {
14970 // kludge to figure out which mds this is; fixme with a Connection* state
14971 mds_rank_t mds = MDS_RANK_NONE;
14972 MetaSession *s = NULL;
14973 for (auto &p : mds_sessions) {
14974 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14975 mds = p.first;
14976 s = &p.second;
14977 }
14978 }
14979 if (mds >= 0) {
14980 ceph_assert(s != NULL);
14981 switch (s->state) {
14982 case MetaSession::STATE_CLOSING:
14983 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14984 _closed_mds_session(s);
14985 break;
14986
14987 case MetaSession::STATE_OPENING:
14988 {
14989 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14990 list<Context*> waiters;
14991 waiters.swap(s->waiting_for_open);
14992 _closed_mds_session(s);
14993 MetaSession *news = _get_or_open_mds_session(mds);
14994 news->waiting_for_open.swap(waiters);
14995 }
14996 break;
14997
14998 case MetaSession::STATE_OPEN:
14999 {
15000 objecter->maybe_request_map(); /* to check if we are blocklisted */
15001 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
15002 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
15003 _closed_mds_session(s);
15004 } else {
15005 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15006 s->state = MetaSession::STATE_STALE;
15007 }
15008 }
15009 break;
15010
15011 case MetaSession::STATE_NEW:
15012 case MetaSession::STATE_CLOSED:
15013 default:
15014 break;
15015 }
15016 }
15017 }
15018 break;
15019 }
15020 }
15021
15022 bool Client::ms_handle_refused(Connection *con)
15023 {
15024 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
15025 return false;
15026 }
15027
15028 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15029 {
15030 Inode *quota_in = root_ancestor;
15031 SnapRealm *realm = in->snaprealm;
15032 while (realm) {
15033 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15034 if (realm->ino != in->ino) {
15035 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15036 if (p == inode_map.end())
15037 break;
15038
15039 if (p->second->quota.is_enable()) {
15040 quota_in = p->second;
15041 break;
15042 }
15043 }
15044 realm = realm->pparent;
15045 }
15046 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15047 return quota_in;
15048 }
15049
15050 /**
15051 * Traverse the quota ancestors of the Inode; return true
15052 * if any of them satisfies the given predicate
15053 */
15054 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15055 std::function<bool (const Inode &in)> test)
15056 {
15057 while (true) {
15058 ceph_assert(in != NULL);
15059 if (test(*in)) {
15060 return true;
15061 }
15062
15063 if (in == root_ancestor) {
15064 // We're done traversing, drop out
15065 return false;
15066 } else {
15067 // Continue up the tree
15068 in = get_quota_root(in, perms);
15069 }
15070 }
15071
15072 return false;
15073 }
15074
15075 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15076 {
15077 return check_quota_condition(in, perms,
15078 [](const Inode &in) {
15079 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15080 });
15081 }
15082
15083 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15084 const UserPerm& perms)
15085 {
15086 return check_quota_condition(in, perms,
15087 [&new_bytes](const Inode &in) {
15088 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15089 > in.quota.max_bytes;
15090 });
15091 }
15092
15093 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15094 {
15095 ceph_assert(in->size >= in->reported_size);
15096 const uint64_t size = in->size - in->reported_size;
15097 return check_quota_condition(in, perms,
15098 [&size](const Inode &in) {
15099 if (in.quota.max_bytes) {
15100 if (in.rstat.rbytes >= in.quota.max_bytes) {
15101 return true;
15102 }
15103
15104 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15105 return (space >> 4) < size;
15106 } else {
15107 return false;
15108 }
15109 });
15110 }
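// Illustrative numbers for the heuristic above: with quota.max_bytes = 1 GiB
// and rbytes = 992 MiB, the remaining space is 32 MiB and (space >> 4) =
// 2 MiB, so the quota counts as "approaching" once more than 2 MiB of
// written-but-unreported data (size - reported_size) has accumulated.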
15111
15112 enum {
15113 POOL_CHECKED = 1,
15114 POOL_CHECKING = 2,
15115 POOL_READ = 4,
15116 POOL_WRITE = 8,
15117 };
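// Illustrative flag arithmetic: a pool probed successfully for both access
// modes is cached as POOL_CHECKED | POOL_READ | POOL_WRITE (0x1 | 0x4 | 0x8
// = 0xd); POOL_CHECKING marks an in-flight probe that other threads wait on.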
15118
15119 int Client::check_pool_perm(Inode *in, int need)
15120 {
15121 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15122
15123 if (!cct->_conf->client_check_pool_perm)
15124 return 0;
15125
15126 /* Only need to do this for regular files */
15127 if (!in->is_file())
15128 return 0;
15129
15130 int64_t pool_id = in->layout.pool_id;
15131 std::string pool_ns = in->layout.pool_ns;
15132 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
15133 int have = 0;
15134 while (true) {
15135 auto it = pool_perms.find(perm_key);
15136 if (it == pool_perms.end())
15137 break;
15138 if (it->second == POOL_CHECKING) {
15139 // avoid concurrent checks of the same pool
15140 wait_on_list(waiting_for_pool_perm);
15141 } else {
15142 have = it->second;
15143 ceph_assert(have & POOL_CHECKED);
15144 break;
15145 }
15146 }
15147
15148 if (!have) {
15149 if (in->snapid != CEPH_NOSNAP) {
15150 // The pool permission check needs to write to the first object, but for a
15151 // snapshot the head of the first object may have already been deleted. To
15152 // avoid creating an orphan object, skip the check for now.
15153 return 0;
15154 }
15155
15156 pool_perms[perm_key] = POOL_CHECKING;
15157
15158 char oid_buf[32];
15159 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
15160 object_t oid = oid_buf;
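// e.g. (illustrative) ino 0x10000000aa yields the oid "10000000aa.00000000",
// the first RADOS object backing this file.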
15161
15162 SnapContext nullsnapc;
15163
15164 C_SaferCond rd_cond;
15165 ObjectOperation rd_op;
15166 rd_op.stat(nullptr, nullptr, nullptr);
15167
15168 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
15169 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
15170
15171 C_SaferCond wr_cond;
15172 ObjectOperation wr_op;
15173 wr_op.create(true);
15174
15175 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
15176 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
15177
15178 client_lock.unlock();
15179 int rd_ret = rd_cond.wait();
15180 int wr_ret = wr_cond.wait();
15181 client_lock.lock();
15182
15183 bool errored = false;
15184
15185 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
15186 have |= POOL_READ;
15187 else if (rd_ret != -CEPHFS_EPERM) {
15188 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15189 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15190 errored = true;
15191 }
15192
15193 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
15194 have |= POOL_WRITE;
15195 else if (wr_ret != -CEPHFS_EPERM) {
15196 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15197 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15198 errored = true;
15199 }
15200
15201 if (errored) {
15202 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15203 // Raise EIO because the actual error code might be misleading to
15204 // the userspace filesystem user.
15205 pool_perms.erase(perm_key);
15206 signal_cond_list(waiting_for_pool_perm);
15207 return -CEPHFS_EIO;
15208 }
15209
15210 pool_perms[perm_key] = have | POOL_CHECKED;
15211 signal_cond_list(waiting_for_pool_perm);
15212 }
15213
15214 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15215 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15216 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15217 return -CEPHFS_EPERM;
15218 }
15219 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15220 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15221 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15222 return -CEPHFS_EPERM;
15223 }
15224
15225 return 0;
15226 }
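// Design note on the probe above: the read probe is a stat and the write
// probe an exclusive create against the file's first object, so
// -CEPHFS_ENOENT and -CEPHFS_EEXIST still count as success (the OSD accepted
// the op), while -CEPHFS_EPERM means the client's OSD caps forbid that
// access mode for the pool/namespace.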
15227
15228 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15229 {
15230 if (acl_type == POSIX_ACL) {
15231 if (in->xattrs.count(ACL_EA_ACCESS)) {
15232 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15233
15234 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15235 }
15236 }
15237 return -CEPHFS_EAGAIN;
15238 }
15239
15240 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15241 {
15242 if (acl_type == NO_ACL)
15243 return 0;
15244
15245 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15246 if (r < 0)
15247 goto out;
15248
15249 if (acl_type == POSIX_ACL) {
15250 if (in->xattrs.count(ACL_EA_ACCESS)) {
15251 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15252 bufferptr acl(access_acl.c_str(), access_acl.length());
15253 r = posix_acl_access_chmod(acl, mode);
15254 if (r < 0)
15255 goto out;
15256 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15257 } else {
15258 r = 0;
15259 }
15260 }
15261 out:
15262 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15263 return r;
15264 }
15265
15266 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15267 const UserPerm& perms)
15268 {
15269 if (acl_type == NO_ACL)
15270 return 0;
15271
15272 if (S_ISLNK(*mode))
15273 return 0;
15274
15275 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15276 if (r < 0)
15277 goto out;
15278
15279 if (acl_type == POSIX_ACL) {
15280 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15281 map<string, bufferptr> xattrs;
15282
15283 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15284 bufferptr acl(default_acl.c_str(), default_acl.length());
15285 r = posix_acl_inherit_mode(acl, mode);
15286 if (r < 0)
15287 goto out;
15288
15289 if (r > 0) {
15290 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15291 if (r < 0)
15292 goto out;
15293 if (r > 0)
15294 xattrs[ACL_EA_ACCESS] = acl;
15295 }
15296
15297 if (S_ISDIR(*mode))
15298 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15299
15300 r = xattrs.size();
15301 if (r > 0)
15302 encode(xattrs, xattrs_bl);
15303 } else {
15304 if (umask_cb)
15305 *mode &= ~umask_cb(callback_handle);
15306 r = 0;
15307 }
15308 }
15309 out:
15310 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15311 return r;
15312 }
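// Summary of the inheritance above: posix_acl_inherit_mode() folds the
// directory's default ACL into *mode; when the resulting access ACL cannot
// be represented by mode bits alone it is returned via xattrs_bl as
// ACL_EA_ACCESS, and a new directory additionally inherits ACL_EA_DEFAULT,
// so the caller can set both atomically with the create.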
15313
15314 void Client::set_filer_flags(int flags)
15315 {
15316 std::scoped_lock l(client_lock);
15317 ceph_assert(flags == 0 ||
15318 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15319 objecter->add_global_op_flags(flags);
15320 }
15321
15322 void Client::clear_filer_flags(int flags)
15323 {
15324 std::scoped_lock l(client_lock);
15325 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15326 objecter->clear_global_op_flag(flags);
15327 }
15328
15329 // called before mount
15330 void Client::set_uuid(const std::string& uuid)
15331 {
15332 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15333 ceph_assert(iref_reader.is_state_satisfied());
15334
15335 std::scoped_lock l(client_lock);
15336 ceph_assert(!uuid.empty());
15337
15338 metadata["uuid"] = uuid;
15339 _close_sessions();
15340 }
15341
15342 // called before mount; a timeout of 0 means infinite
15343 void Client::set_session_timeout(unsigned timeout)
15344 {
15345 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15346 ceph_assert(iref_reader.is_state_satisfied());
15347
15348 std::scoped_lock l(client_lock);
15349
15350 metadata["timeout"] = stringify(timeout);
15351 }
15352
15353 // called before mount
15354 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15355 const std::string& fs_name)
15356 {
15357 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15358 if (!iref_reader.is_state_satisfied())
15359 return -CEPHFS_ENOTCONN;
15360
15361 if (uuid.empty())
15362 return -CEPHFS_EINVAL;
15363
15364 std::unique_lock l(client_lock);
15365 {
15366 auto it = metadata.find("uuid");
15367 if (it != metadata.end() && it->second == uuid)
15368 return -CEPHFS_EINVAL;
15369 }
15370
15371 int r = subscribe_mdsmap(fs_name);
15372 if (r < 0) {
15373 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15374 return r;
15375 }
15376
15377 if (metadata.empty())
15378 populate_metadata("");
15379
15380 while (mdsmap->get_epoch() == 0)
15381 wait_on_list(waiting_for_mdsmap);
15382
15383 reclaim_errno = 0;
15384 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15385 if (!mdsmap->is_up(mds)) {
15386 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15387 wait_on_list(waiting_for_mdsmap);
15388 continue;
15389 }
15390
15391 MetaSession *session;
15392 if (!have_open_session(mds)) {
15393 session = _get_or_open_mds_session(mds);
15394 if (session->state == MetaSession::STATE_REJECTED)
15395 return -CEPHFS_EPERM;
15396 if (session->state != MetaSession::STATE_OPENING) {
15397 // umounting?
15398 return -CEPHFS_EINVAL;
15399 }
15400 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15401 wait_on_context_list(session->waiting_for_open);
15402 continue;
15403 }
15404
15405 session = &mds_sessions.at(mds);
15406 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15407 return -CEPHFS_EOPNOTSUPP;
15408
15409 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15410 session->reclaim_state == MetaSession::RECLAIMING) {
15411 session->reclaim_state = MetaSession::RECLAIMING;
15412 auto m = make_message<MClientReclaim>(uuid, flags);
15413 session->con->send_message2(std::move(m));
15414 wait_on_list(waiting_for_reclaim);
15415 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15416 return reclaim_errno ? reclaim_errno : -CEPHFS_ENOTRECOVERABLE;
15417 } else {
15418 mds++;
15419 }
15420 }
15421
15422 // didn't find the target session on any MDS
15423 if (reclaim_target_addrs.empty()) {
15424 if (flags & CEPH_RECLAIM_RESET)
15425 return -CEPHFS_ENOENT;
15426 return -CEPHFS_ENOTRECOVERABLE;
15427 }
15428
15429 if (flags & CEPH_RECLAIM_RESET)
15430 return 0;
15431
15432 // use blocklist to check if target session was killed
15433 // (config option mds_session_blocklist_on_evict needs to be true)
15434 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15435 bs::error_code ec;
15436 l.unlock();
15437 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15438 l.lock();
15439
15440 if (ec)
15441 return ceph::from_error_code(ec);
15442
15443 bool blocklisted = objecter->with_osdmap(
15444 [this](const OSDMap &osd_map) -> bool {
15445 return osd_map.is_blocklisted(reclaim_target_addrs);
15446 });
15447 if (blocklisted)
15448 return -CEPHFS_ENOTRECOVERABLE;
15449
15450 metadata["reclaiming_uuid"] = uuid;
15451 return 0;
15452 }
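/*
 * Illustrative call sequence (a sketch; the uuids and ordering are
 * assumptions, not mandated by this file): a new client instance taking over
 * for a dead instance whose uuid was "old-uuid" would, before mounting,
 * roughly do:
 *
 *   client->set_uuid("new-uuid");                    // our own identity
 *   int r = client->start_reclaim("old-uuid", flags, fs_name);
 *   if (r == 0)
 *     client->finish_reclaim();    // adopts "old-uuid", notifies the MDSes
 *   // ... then proceed to mount
 */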
15453
15454 void Client::finish_reclaim()
15455 {
15456 auto it = metadata.find("reclaiming_uuid");
15457 if (it == metadata.end()) {
15458 for (auto &p : mds_sessions)
15459 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15460 return;
15461 }
15462
15463 for (auto &p : mds_sessions) {
15464 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15465 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15466 p.second.con->send_message2(std::move(m));
15467 }
15468
15469 metadata["uuid"] = it->second;
15470 metadata.erase(it);
15471 }
15472
15473 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15474 {
15475 mds_rank_t from = mds_rank_t(reply->get_source().num());
15476 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15477
15478 std::scoped_lock cl(client_lock);
15479 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
15480 if (!session) {
15481 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15482 return;
15483 }
15484
15485 if (reply->get_result() >= 0) {
15486 session->reclaim_state = MetaSession::RECLAIM_OK;
15487 if (reply->get_epoch() > reclaim_osd_epoch)
15488 reclaim_osd_epoch = reply->get_epoch();
15489 if (!reply->get_addrs().empty())
15490 reclaim_target_addrs = reply->get_addrs();
15491 } else {
15492 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15493 reclaim_errno = reply->get_result();
15494 }
15495
15496 signal_cond_list(waiting_for_reclaim);
15497 }
15498
15499 /**
15500 * This is included in cap release messages, to cause
15501 * the MDS to wait until this OSD map epoch. It is necessary
15502 * in corner cases where we cancel RADOS ops, so that
15503 * nobody else tries to do IO to the same objects in
15504 * the same epoch as the cancelled ops.
15505 */
15506 void Client::set_cap_epoch_barrier(epoch_t e)
15507 {
15508 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15509 cap_epoch_barrier = e;
15510 }
15511
15512 const char** Client::get_tracked_conf_keys() const
15513 {
15514 static const char* keys[] = {
15515 "client_cache_size",
15516 "client_cache_mid",
15517 "client_acl_type",
15518 "client_deleg_timeout",
15519 "client_deleg_break_on_open",
15520 "client_oc_size",
15521 "client_oc_max_objects",
15522 "client_oc_max_dirty",
15523 "client_oc_target_dirty",
15524 "client_oc_max_dirty_age",
15525 NULL
15526 };
15527 return keys;
15528 }
15529
15530 void Client::handle_conf_change(const ConfigProxy& conf,
15531 const std::set <std::string> &changed)
15532 {
15533 std::scoped_lock lock(client_lock);
15534
15535 if (changed.count("client_cache_mid")) {
15536 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15537 }
15538 if (changed.count("client_acl_type")) {
15539 acl_type = NO_ACL;
15540 if (cct->_conf->client_acl_type == "posix_acl")
15541 acl_type = POSIX_ACL;
15542 }
15543 if (changed.count("client_oc_size")) {
15544 objectcacher->set_max_size(cct->_conf->client_oc_size);
15545 }
15546 if (changed.count("client_oc_max_objects")) {
15547 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15548 }
15549 if (changed.count("client_oc_max_dirty")) {
15550 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15551 }
15552 if (changed.count("client_oc_target_dirty")) {
15553 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15554 }
15555 if (changed.count("client_oc_max_dirty_age")) {
15556 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15557 }
15558 }
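// Example (admin-side sketch, assuming the usual mon-managed config store):
// `ceph config set client client_oc_size 268435456` propagates to mounted
// clients and lands here, resizing the object cache via set_max_size()
// without a remount.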
15559
15560 void intrusive_ptr_add_ref(Inode *in)
15561 {
15562 in->iget();
15563 }
15564
15565 void intrusive_ptr_release(Inode *in)
15566 {
15567 in->client->put_inode(in);
15568 }
15569
15570 mds_rank_t Client::_get_random_up_mds() const
15571 {
15572 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15573
15574 std::set<mds_rank_t> up;
15575 mdsmap->get_up_mds_set(up);
15576
15577 if (up.empty())
15578 return MDS_RANK_NONE;
15579 std::set<mds_rank_t>::const_iterator p = up.begin();
15580 for (int n = rand() % up.size(); n; n--)
15581 ++p;
15582 return *p;
15583 }
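// Note: std::set offers no random access, so a uniformly chosen index in
// [0, up.size()) is reached by stepping the iterator forward n times.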
15584
15585
15586 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15587 boost::asio::io_context& ictx)
15588 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15589 {
15590 monclient->set_messenger(m);
15591 objecter->set_client_incarnation(0);
15592 }
15593
15594 StandaloneClient::~StandaloneClient()
15595 {
15596 delete objecter;
15597 objecter = nullptr;
15598 }
15599
15600 int StandaloneClient::init()
15601 {
15602 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15603 ceph_assert(iref_writer.is_first_writer());
15604
15605 _pre_init();
15606 objecter->init();
15607
15608 client_lock.lock();
15609
15610 messenger->add_dispatcher_tail(objecter);
15611 messenger->add_dispatcher_tail(this);
15612
15613 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15614 int r = monclient->init();
15615 if (r < 0) {
15616 // need to do cleanup because we're in an intermediate init state
15617 {
15618 std::scoped_lock l(timer_lock);
15619 timer.shutdown();
15620 }
15621
15622 client_lock.unlock();
15623 objecter->shutdown();
15624 objectcacher->stop();
15625 monclient->shutdown();
15626 return r;
15627 }
15628 objecter->start();
15629
15630 client_lock.unlock();
15631 _finish_init();
15632 iref_writer.update_state(CLIENT_INITIALIZED);
15633
15634 return 0;
15635 }
15636
15637 void StandaloneClient::shutdown()
15638 {
15639 Client::shutdown();
15640 objecter->shutdown();
15641 monclient->shutdown();
15642 }