// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
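// Note: tout is used as a statement prefix by the high-level entry points,
// e.g. tout(cct) << "mount" << std::endl; so a trace line is only written
// when client_trace is configured.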

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use those flags in
// ceph-dokan. The current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
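
// These commands are registered on the admin socket in _finish_init() and
// can be invoked against a running client, e.g. (socket path illustrative):
//   ceph daemon /var/run/ceph/ceph-client.admin.asok mds_sessions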


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

/*
 * In faked mode, if you export multiple subdirectories, the exported
 * subdirectories will all appear to have the same inode number. So we
 * distinguish the mount points by reserving the fake ids in the range
 * 1024~2048 and combining them with the last 10 bits (0x3ff) of the
 * root inodes.
 */
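/*
 * Worked example (illustrative): with a fresh free_faked_inos starting at
 * 1024, a root inode whose ino has low 10 bits 0x0fc (252) is assigned the
 * faked root id 1024 + 252 = 1276, which stays inside the reserved
 * 1024~2048 window.
 */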
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // In case the task crashed or was aborted and never got a chance to
  // run the umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
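  // (at this point the only inodes left should be the root plus the chain
  // of root_parents created while mounting a subtree)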
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blocklist_events();

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before the Client is
    // destructed, just in case _mount() failed and didn't get a
    // chance to stop the tick thread itself.
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * incoming "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false); // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

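  // Accept a size update only if it comes from a newer truncation epoch,
  // or from the same epoch and grows the file; e.g. a reply carrying
  // truncate_seq 5 when we already hold truncate_seq 6 is stale and ignored.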
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;
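  // (projected versions are odd: masking off the low bit with "& ~1" means,
  // e.g., a cached projected version 7 compares as 6 and still accepts an
  // authoritative stat carrying version 7)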

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true); // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();
  if (!dst->dist.empty())
    in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
  else
    in->frag_repmap.erase(dst->frag);
}

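// Note: bumping dir_release_count / dir_ordered_count below also invalidates
// any dir_result_t readdir caches that captured the old counter values (see
// the checks in insert_readdir_results).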
void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true); // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
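      // (make_fpos packs the fragment -- or the name hash, in hash_order
      // mode -- into the high bits of the 64-bit readdir position and the
      // per-frag entry index into the low bits, so positions stay
      // monotonically increasing across fragments)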
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst); // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true); // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true); // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }
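    // (routing sketch: a request on dentry "foo" hashes the name, maps the
    // hash to a fragment via the cached dirfragtree, then prefers a random
    // replica from frag_repmap when the auth MDS isn't required and the
    // fragment's auth MDS from fragmap otherwise)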

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
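/*
 * Illustrative caller pattern (a sketch mirroring helpers like _getattr
 * elsewhere in this file):
 *
 *   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
 *   filepath path;
 *   in->make_nosnap_relative_path(path);
 *   req->set_filepath(path);
 *   req->set_inode(in);
 *   req->head.args.getattr.mask = mask;
 *   InodeRef target;
 *   int r = make_request(req, perms, &target);
 */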
1752 int Client::make_request(MetaRequest *request,
1753 const UserPerm& perms,
1754 InodeRef *ptarget, bool *pcreated,
1755 mds_rank_t use_mds,
1756 bufferlist *pdirbl)
1757 {
1758 int r = 0;
1759
1760 // assign a unique tid
1761 ceph_tid_t tid = ++last_tid;
1762 request->set_tid(tid);
1763
1764 // and timestamp
1765 request->op_stamp = ceph_clock_now();
1766
1767 // make note
1768 mds_requests[tid] = request->get();
1769 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1770 oldest_tid = tid;
1771
1772 request->set_caller_perms(perms);
1773
1774 if (cct->_conf->client_inject_fixed_oldest_tid) {
1775 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1776 request->set_oldest_client_tid(1);
1777 } else {
1778 request->set_oldest_client_tid(oldest_tid);
1779 }
1780
1781 // hack target mds?
1782 if (use_mds >= 0)
1783 request->resend_mds = use_mds;
1784
1785 MetaSession *session = NULL;
1786 while (1) {
1787 if (request->aborted())
1788 break;
1789
1790 if (blocklisted) {
1791 request->abort(-CEPHFS_EBLOCKLISTED);
1792 break;
1793 }
1794
1795 // set up wait cond
1796 ceph::condition_variable caller_cond;
1797 request->caller_cond = &caller_cond;
1798
1799 // choose mds
1800 Inode *hash_diri = NULL;
1801 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1802 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1803 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1804 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1805 if (hash_diri) {
1806 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1807 _fragmap_remove_stopped_mds(hash_diri, mds);
1808 } else {
1809 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1810 request->resend_mds = _get_random_up_mds();
1811 }
1812 } else {
1813 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1814 wait_on_list(waiting_for_mdsmap);
1815 }
1816 continue;
1817 }
1818
1819 // open a session?
1820 if (!have_open_session(mds)) {
1821 session = _get_or_open_mds_session(mds);
1822 if (session->state == MetaSession::STATE_REJECTED) {
1823 request->abort(-CEPHFS_EPERM);
1824 break;
1825 }
1826 // wait
1827 if (session->state == MetaSession::STATE_OPENING) {
1828 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1829 wait_on_context_list(session->waiting_for_open);
1830 continue;
1831 }
1832
1833 if (!have_open_session(mds))
1834 continue;
1835 } else {
1836 session = &mds_sessions.at(mds);
1837 }
1838
1839 // send request.
1840 send_request(request, session);
1841
1842 // wait for signal
1843 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1844 request->kick = false;
1845 std::unique_lock l{client_lock, std::adopt_lock};
1846 caller_cond.wait(l, [request] {
1847 return (request->reply || // reply
1848 request->resend_mds >= 0 || // forward
1849 request->kick);
1850 });
1851 l.release();
1852 request->caller_cond = nullptr;
1853
1854 // did we get a reply?
1855 if (request->reply)
1856 break;
1857 }
1858
1859 if (!request->reply) {
1860 ceph_assert(request->aborted());
1861 ceph_assert(!request->got_unsafe);
1862 r = request->get_abort_code();
1863 request->item.remove_myself();
1864 unregister_request(request);
1865 put_request(request);
1866 return r;
1867 }
1868
1869 // got it!
1870 auto reply = std::move(request->reply);
1871 r = reply->get_result();
1872 if (r >= 0)
1873 request->success = true;
1874
1875 // kick dispatcher (we've got it!)
1876 ceph_assert(request->dispatch_cond);
1877 request->dispatch_cond->notify_all();
1878 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1879 request->dispatch_cond = 0;
1880
1881 if (r >= 0 && ptarget)
1882 r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);
1883
1884 if (pdirbl)
1885 *pdirbl = reply->get_extra_bl();
1886
1887 // -- log times --
1888 utime_t lat = ceph_clock_now();
1889 lat -= request->sent_stamp;
1890 ldout(cct, 20) << "lat " << lat << dendl;
1891 logger->tinc(l_c_lat, lat);
1892 logger->tinc(l_c_reply, lat);
1893
1894 put_request(request);
1895 return r;
1896 }
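/*
 * A note on the wait above: client_lock is already held when we get
 * here, so the unique_lock adopts it for the condition wait and then
 * release()s it, leaving ownership with the caller. A minimal,
 * self-contained sketch of the same pattern (purely illustrative;
 * these names are hypothetical and not part of Client.cc):
 *
 *   #include <condition_variable>
 *   #include <mutex>
 *
 *   std::mutex mtx;                // stands in for client_lock
 *   std::condition_variable cv;
 *   bool done = false;
 *
 *   void wait_already_locked()     // caller holds mtx, as in make_request
 *   {
 *     std::unique_lock l{mtx, std::adopt_lock};  // adopt, don't re-lock
 *     cv.wait(l, [] { return done; });           // atomically unlock+wait
 *     l.release();                 // hand ownership back; mtx stays held
 *   }
 */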
1897
1898 void Client::unregister_request(MetaRequest *req)
1899 {
1900 mds_requests.erase(req->tid);
1901 if (req->tid == oldest_tid) {
1902 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1903 while (true) {
1904 if (p == mds_requests.end()) {
1905 oldest_tid = 0;
1906 break;
1907 }
1908 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1909 oldest_tid = p->first;
1910 break;
1911 }
1912 ++p;
1913 }
1914 }
1915 put_request(req);
1916 }
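/*
 * Illustrative sketch (hypothetical, not part of Client.cc) of the
 * oldest_tid bookkeeping above: scan the tid-ordered map from
 * upper_bound(oldest) and stop at the first entry whose op is not
 * excluded, since filelock requests don't participate in the
 * oldest-tid accounting.
 *
 *   #include <cstdint>
 *   #include <map>
 *
 *   // Returns 0 when no eligible request remains.
 *   uint64_t next_oldest(const std::map<uint64_t, int>& ops, // tid -> op
 *                        uint64_t oldest, int skip_op)
 *   {
 *     for (auto p = ops.upper_bound(oldest); p != ops.end(); ++p)
 *       if (p->second != skip_op)
 *         return p->first;
 *     return 0;
 *   }
 */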
1917
1918 void Client::put_request(MetaRequest *request)
1919 {
1920 if (request->_put()) {
1921 int op = -1;
1922 if (request->success)
1923 op = request->get_op();
1924 InodeRef other_in;
1925 request->take_other_inode(&other_in);
1926 delete request;
1927
1928 if (other_in &&
1929 (op == CEPH_MDS_OP_RMDIR ||
1930 op == CEPH_MDS_OP_RENAME ||
1931 op == CEPH_MDS_OP_RMSNAP)) {
1932 _try_to_trim_inode(other_in.get(), false);
1933 }
1934 }
1935 }
1936
1937 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1938 mds_rank_t mds, int drop,
1939 int unless, int force)
1940 {
1941 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
1942 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1943 << ", force:" << force << ")" << dendl;
1944 int released = 0;
1945 auto it = in->caps.find(mds);
1946 if (it != in->caps.end()) {
1947 Cap &cap = it->second;
1948 drop &= ~(in->dirty_caps | get_caps_used(in));
1949 if ((drop & cap.issued) &&
1950 !(unless & cap.issued)) {
1951 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
1952 cap.issued &= ~drop;
1953 cap.implemented &= ~drop;
1954 released = 1;
1955 } else {
1956 released = force;
1957 }
1958 if (released) {
1959 cap.wanted = in->caps_wanted();
1960 if (&cap == in->auth_cap &&
1961 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1962 in->requested_max_size = 0;
1963 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1964 }
1965 ceph_mds_request_release rel;
1966 rel.ino = in->ino;
1967 rel.cap_id = cap.cap_id;
1968 rel.seq = cap.seq;
1969 rel.issue_seq = cap.issue_seq;
1970 rel.mseq = cap.mseq;
1971 rel.caps = cap.implemented;
1972 rel.wanted = cap.wanted;
1973 rel.dname_len = 0;
1974 rel.dname_seq = 0;
1975 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1976 }
1977 }
1978 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
1979 << released << dendl;
1980 return released;
1981 }
1982
1983 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1984 mds_rank_t mds, int drop, int unless)
1985 {
1986 ldout(cct, 20) << __func__ << " enter(dn:"
1987 << dn << ")" << dendl;
1988 int released = 0;
1989 if (dn->dir)
1990 released = encode_inode_release(dn->dir->parent_inode, req,
1991 mds, drop, unless, 1);
1992 if (released && dn->lease_mds == mds) {
1993 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1994 auto& rel = req->cap_releases.back();
1995 rel.item.dname_len = dn->name.length();
1996 rel.item.dname_seq = dn->lease_seq;
1997 rel.dname = dn->name;
1998 dn->lease_mds = -1;
1999 }
2000 ldout(cct, 25) << __func__ << " exit(dn:"
2001 << dn << ")" << dendl;
2002 }
2003
2004
2005 /*
2006 * This requires the MClientRequest *request member to be set.
2007 * It will error out horribly without one.
2008 * Additionally, if you set any *drop member, you'd better have
2009 * set the corresponding dentry!
2010 */
2011 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2012 {
2013 ldout(cct, 20) << __func__ << " enter (req: "
2014 << req << ", mds: " << mds << ")" << dendl;
2015 if (req->inode_drop && req->inode())
2016 encode_inode_release(req->inode(), req,
2017 mds, req->inode_drop,
2018 req->inode_unless);
2019
2020 if (req->old_inode_drop && req->old_inode())
2021 encode_inode_release(req->old_inode(), req,
2022 mds, req->old_inode_drop,
2023 req->old_inode_unless);
2024 if (req->other_inode_drop && req->other_inode())
2025 encode_inode_release(req->other_inode(), req,
2026 mds, req->other_inode_drop,
2027 req->other_inode_unless);
2028
2029 if (req->dentry_drop && req->dentry())
2030 encode_dentry_release(req->dentry(), req,
2031 mds, req->dentry_drop,
2032 req->dentry_unless);
2033
2034 if (req->old_dentry_drop && req->old_dentry())
2035 encode_dentry_release(req->old_dentry(), req,
2036 mds, req->old_dentry_drop,
2037 req->old_dentry_unless);
2038 ldout(cct, 25) << __func__ << " exit (req: "
2039 << req << ", mds: " << mds << ")" << dendl;
2040 }
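/*
 * The release decision in encode_inode_release() above is plain
 * cap-bitmask arithmetic: never drop caps that are dirty or in use,
 * and only drop when none of the "unless" caps are issued. A hedged
 * sketch of just that arithmetic on bare ints (illustrative only):
 *
 *   // Returns the new issued mask after a conditional release.
 *   int release_caps(int issued, int drop, int unless,
 *                    int dirty, int used)
 *   {
 *     drop &= ~(dirty | used);            // keep anything dirty/in use
 *     if ((drop & issued) && !(unless & issued))
 *       issued &= ~drop;                  // actually drop the caps
 *     return issued;
 *   }
 */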
2041
2042 bool Client::have_open_session(mds_rank_t mds)
2043 {
2044 const auto &it = mds_sessions.find(mds);
2045 return it != mds_sessions.end() &&
2046 (it->second.state == MetaSession::STATE_OPEN ||
2047 it->second.state == MetaSession::STATE_STALE);
2048 }
2049
2050 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
2051 {
2052 const auto &it = mds_sessions.find(mds);
2053 if (it == mds_sessions.end() || it->second.con != con) {
2054 return NULL;
2055 } else {
2056 return &it->second;
2057 }
2058 }
2059
2060 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
2061 {
2062 auto it = mds_sessions.find(mds);
2063 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
2064 }
2065
2066 /**
2067 * Populate a map of strings with client-identifying metadata,
2068 * such as the hostname. Call this once at initialization.
2069 */
2070 void Client::populate_metadata(const std::string &mount_root)
2071 {
2072 // Hostname
2073 #ifdef _WIN32
2074 // TODO: move this to compat.h
2075 char hostname[64];
2076 DWORD hostname_sz = 64;
2077 GetComputerNameA(hostname, &hostname_sz);
2078 metadata["hostname"] = hostname;
2079 #else
2080 struct utsname u;
2081 int r = uname(&u);
2082 if (r >= 0) {
2083 metadata["hostname"] = u.nodename;
2084 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2085 } else {
2086 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2087 }
2088 #endif
2089
2090 metadata["pid"] = stringify(getpid());
2091
2092 // Ceph entity id (the '0' in "client.0")
2093 metadata["entity_id"] = cct->_conf->name.get_id();
2094
2095 // Our mount position
2096 if (!mount_root.empty()) {
2097 metadata["root"] = mount_root;
2098 }
2099
2100 // Ceph version
2101 metadata["ceph_version"] = pretty_version_to_str();
2102 metadata["ceph_sha1"] = git_version_to_str();
2103
2104 // Apply any metadata from the user's configured overrides
2105 std::vector<std::string> tokens;
2106 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2107 for (const auto &i : tokens) {
2108 auto eqpos = i.find("=");
2109 // Throw out anything that isn't of the form "<str>=<str>"
2110 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2111 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2112 continue;
2113 }
2114 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2115 }
2116 }
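/*
 * The override parsing above accepts a comma-separated "key=value"
 * list and skips malformed items. A minimal sketch of the same
 * validation using only the standard library (names hypothetical):
 *
 *   #include <map>
 *   #include <string>
 *
 *   void apply_override(const std::string& kv,    // e.g. "rack=r1"
 *                       std::map<std::string, std::string>& md)
 *   {
 *     auto eq = kv.find('=');
 *     if (eq == 0 || eq == std::string::npos)
 *       return;                                   // not "<str>=<str>"
 *     md[kv.substr(0, eq)] = kv.substr(eq + 1);
 *   }
 */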
2117
2118 /**
2119 * Add or override a single client metadata field.
2120 */
2121 void Client::update_metadata(std::string const &k, std::string const &v)
2122 {
2123 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2124 ceph_assert(iref_reader.is_state_satisfied());
2125
2126 std::scoped_lock l(client_lock);
2127
2128 auto it = metadata.find(k);
2129 if (it != metadata.end()) {
2130 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2131 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2132 }
2133
2134 metadata[k] = v;
2135 }
2136
2137 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2138 {
2139 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2140 auto addrs = mdsmap->get_addrs(mds);
2141 auto em = mds_sessions.emplace(std::piecewise_construct,
2142 std::forward_as_tuple(mds),
2143 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2144 ceph_assert(em.second); /* not already present */
2145 MetaSession *session = &em.first->second;
2146
2147 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2148 m->metadata = metadata;
2149 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2150 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2151 session->con->send_message2(std::move(m));
2152 return session;
2153 }
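/*
 * The emplace above uses std::piecewise_construct so the MetaSession
 * value is constructed in place from (mds, connection, addrs) without
 * building a temporary. A minimal sketch of the idiom with a stand-in
 * value type (hypothetical, not part of Client.cc):
 *
 *   #include <map>
 *   #include <tuple>
 *   #include <utility>
 *
 *   struct Session {
 *     int rank; double weight;
 *     Session(int r, double w) : rank(r), weight(w) {}
 *   };
 *
 *   std::map<int, Session> sessions;
 *
 *   bool open(int rank)
 *   {
 *     auto em = sessions.emplace(std::piecewise_construct,
 *                                std::forward_as_tuple(rank),
 *                                std::forward_as_tuple(rank, 1.0));
 *     return em.second;   // false if the key already existed
 *   }
 */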
2154
2155 void Client::_close_mds_session(MetaSession *s)
2156 {
2157 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2158 s->state = MetaSession::STATE_CLOSING;
2159 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2160 }
2161
2162 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2163 {
2164 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2165 if (rejected && s->state != MetaSession::STATE_CLOSING)
2166 s->state = MetaSession::STATE_REJECTED;
2167 else
2168 s->state = MetaSession::STATE_CLOSED;
2169 s->con->mark_down();
2170 signal_context_list(s->waiting_for_open);
2171 mount_cond.notify_all();
2172 remove_session_caps(s, err);
2173 kick_requests_closed(s);
2174 mds_ranks_closing.erase(s->mds_num);
2175 if (s->state == MetaSession::STATE_CLOSED)
2176 mds_sessions.erase(s->mds_num);
2177 }
2178
2179 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2180 {
2181 mds_rank_t from = mds_rank_t(m->get_source().num());
2182 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2183
2184 std::scoped_lock cl(client_lock);
2185 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2186 if (!session) {
2187 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2188 return;
2189 }
2190
2191 switch (m->get_op()) {
2192 case CEPH_SESSION_OPEN:
2193 {
2194 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2195 missing_features -= m->supported_features;
2196 if (!missing_features.empty()) {
2197 lderr(cct) << "mds." << from << " lacks required features '"
2198 << missing_features << "', closing session " << dendl;
2199 _close_mds_session(session);
2200 _closed_mds_session(session, -CEPHFS_EPERM, true);
2201 break;
2202 }
2203 session->mds_features = std::move(m->supported_features);
2204
2205 renew_caps(session);
2206 session->state = MetaSession::STATE_OPEN;
2207 if (is_unmounting())
2208 mount_cond.notify_all();
2209 else
2210 connect_mds_targets(from);
2211 signal_context_list(session->waiting_for_open);
2212 break;
2213 }
2214
2215 case CEPH_SESSION_CLOSE:
2216 _closed_mds_session(session);
2217 break;
2218
2219 case CEPH_SESSION_RENEWCAPS:
2220 if (session->cap_renew_seq == m->get_seq()) {
2221 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2222 session->cap_ttl =
2223 session->last_cap_renew_request + mdsmap->get_session_timeout();
2224 if (was_stale)
2225 wake_up_session_caps(session, false);
2226 }
2227 break;
2228
2229 case CEPH_SESSION_STALE:
2230 // invalidate session caps/leases
2231 session->cap_gen++;
2232 session->cap_ttl = ceph_clock_now();
2233 session->cap_ttl -= 1;
2234 renew_caps(session);
2235 break;
2236
2237 case CEPH_SESSION_RECALL_STATE:
2238 /*
2239 * Renew the caps and flush the cap releases just before
2240 * trimming the caps, in case tick() doesn't get a chance
2241 * to run them; otherwise the client could end up blocklisted
2242 * while the MDS daemons keep trying to recall the caps again
2243 * and again.
2244 *
2245 * In most cases this will do nothing, and the new cap releases
2246 * added by the trim_caps() that follows will have their
2247 * flushing deferred to tick().
2248 */
2249 renew_and_flush_cap_releases();
2250 trim_caps(session, m->get_max_caps());
2251 break;
2252
2253 case CEPH_SESSION_FLUSHMSG:
2254 /* flush cap release */
2255 if (auto& m = session->release; m) {
2256 session->con->send_message2(std::move(m));
2257 }
2258 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2259 break;
2260
2261 case CEPH_SESSION_FORCE_RO:
2262 force_session_readonly(session);
2263 break;
2264
2265 case CEPH_SESSION_REJECT:
2266 {
2267 std::string_view error_str;
2268 auto it = m->metadata.find("error_string");
2269 if (it != m->metadata.end())
2270 error_str = it->second;
2271 else
2272 error_str = "unknown error";
2273 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2274
2275 _closed_mds_session(session, -CEPHFS_EPERM, true);
2276 }
2277 break;
2278
2279 default:
2280 ceph_abort();
2281 }
2282 }
2283
2284 bool Client::_any_stale_sessions() const
2285 {
2286 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2287
2288 for (const auto &p : mds_sessions) {
2289 if (p.second.state == MetaSession::STATE_STALE) {
2290 return true;
2291 }
2292 }
2293
2294 return false;
2295 }
2296
2297 void Client::_kick_stale_sessions()
2298 {
2299 ldout(cct, 1) << __func__ << dendl;
2300
2301 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2302 MetaSession &s = it->second;
2303 if (s.state == MetaSession::STATE_REJECTED) {
2304 mds_sessions.erase(it++);
2305 continue;
2306 }
2307 ++it;
2308 if (s.state == MetaSession::STATE_STALE)
2309 _closed_mds_session(&s);
2310 }
2311 }
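/*
 * The loop above erases map entries while iterating, which is only
 * safe if the iterator is advanced before (or while) erasing. Sketch
 * of the safe idioms on a plain std::map (illustrative only):
 *
 *   #include <map>
 *
 *   void prune(std::map<int, int>& m)
 *   {
 *     for (auto it = m.begin(); it != m.end(); ) {
 *       if (it->second < 0)
 *         m.erase(it++);   // post-increment: erase the old position
 *       else
 *         ++it;
 *     }
 *     // equivalently, C++11: it = m.erase(it); on the erase branch
 *   }
 */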
2312
2313 void Client::send_request(MetaRequest *request, MetaSession *session,
2314 bool drop_cap_releases)
2315 {
2316 // make the request
2317 mds_rank_t mds = session->mds_num;
2318 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2319 << " for mds." << mds << dendl;
2320 auto r = build_client_request(request);
2321 if (request->dentry()) {
2322 r->set_dentry_wanted();
2323 }
2324 if (request->got_unsafe) {
2325 r->set_replayed_op();
2326 if (request->target)
2327 r->head.ino = request->target->ino;
2328 } else {
2329 encode_cap_releases(request, mds);
2330 if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
2331 request->cap_releases.clear();
2332 else
2333 r->releases.swap(request->cap_releases);
2334 }
2335 r->set_mdsmap_epoch(mdsmap->get_epoch());
2336 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2337 objecter->with_osdmap([r](const OSDMap& o) {
2338 r->set_osdmap_epoch(o.get_epoch());
2339 });
2340 }
2341
2342 if (request->mds == -1) {
2343 request->sent_stamp = ceph_clock_now();
2344 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2345 }
2346 request->mds = mds;
2347
2348 Inode *in = request->inode();
2349 if (in) {
2350 auto it = in->caps.find(mds);
2351 if (it != in->caps.end()) {
2352 request->sent_on_mseq = it->second.mseq;
2353 }
2354 }
2355
2356 session->requests.push_back(&request->item);
2357
2358 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2359 session->con->send_message2(std::move(r));
2360 }
2361
2362 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2363 {
2364 auto req = make_message<MClientRequest>(request->get_op());
2365 req->set_tid(request->tid);
2366 req->set_stamp(request->op_stamp);
2367 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2368
2369 // if the filepaths haven't been set, set them!
2370 if (request->path.empty()) {
2371 Inode *in = request->inode();
2372 Dentry *de = request->dentry();
2373 if (in)
2374 in->make_nosnap_relative_path(request->path);
2375 else if (de) {
2376 if (de->inode)
2377 de->inode->make_nosnap_relative_path(request->path);
2378 else if (de->dir) {
2379 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2380 request->path.push_dentry(de->name);
2381 }
2382 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2383 << " No path, inode, or appropriately-endowed dentry given!"
2384 << dendl;
2385 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2386 << " No path, inode, or dentry given!"
2387 << dendl;
2388 }
2389 req->set_filepath(request->get_filepath());
2390 req->set_filepath2(request->get_filepath2());
2391 req->set_alternate_name(request->alternate_name);
2392 req->set_data(request->data);
2393 req->set_retry_attempt(request->retry_attempt++);
2394 req->head.num_fwd = request->num_fwd;
2395 const gid_t *_gids;
2396 int gid_count = request->perms.get_gids(&_gids);
2397 req->set_gid_list(gid_count, _gids);
2398 return req;
2399 }
2400
2401
2402
2403 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2404 {
2405 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2406
2407 std::scoped_lock cl(client_lock);
2408 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2409 if (!session) {
2410 return;
2411 }
2412 ceph_tid_t tid = fwd->get_tid();
2413
2414 if (mds_requests.count(tid) == 0) {
2415 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2416 return;
2417 }
2418
2419 MetaRequest *request = mds_requests[tid];
2420 ceph_assert(request);
2421
2422 // reset retry counter
2423 request->retry_attempt = 0;
2424
2425 // request not forwarded, or dest mds has no session.
2426 // resend.
2427 ldout(cct, 10) << __func__ << " tid " << tid
2428 << " fwd " << fwd->get_num_fwd()
2429 << " to mds." << fwd->get_dest_mds()
2430 << ", resending to " << fwd->get_dest_mds()
2431 << dendl;
2432
2433 request->mds = -1;
2434 request->item.remove_myself();
2435 request->num_fwd = fwd->get_num_fwd();
2436 request->resend_mds = fwd->get_dest_mds();
2437 request->caller_cond->notify_all();
2438 }
2439
2440 bool Client::is_dir_operation(MetaRequest *req)
2441 {
2442 int op = req->get_op();
2443 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2444 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2445 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2446 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2447 return true;
2448 return false;
2449 }
2450
2451 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2452 {
2453 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2454
2455 std::scoped_lock cl(client_lock);
2456 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2457 if (!session) {
2458 return;
2459 }
2460
2461 ceph_tid_t tid = reply->get_tid();
2462 bool is_safe = reply->is_safe();
2463
2464 if (mds_requests.count(tid) == 0) {
2465 lderr(cct) << __func__ << " no pending request on tid " << tid
2466 << " safe is:" << is_safe << dendl;
2467 return;
2468 }
2469 MetaRequest *request = mds_requests.at(tid);
2470
2471 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2472 << " tid " << tid << dendl;
2473
2474 if (request->got_unsafe && !is_safe) {
2475 //duplicate response
2476 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2477 << mds_num << " safe:" << is_safe << dendl;
2478 return;
2479 }
2480
2481 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
2482 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2483 << " from mds." << request->mds << dendl;
2484 request->send_to_auth = true;
2485 request->resend_mds = choose_target_mds(request);
2486 Inode *in = request->inode();
2487 std::map<mds_rank_t, Cap>::const_iterator it;
2488 if (request->resend_mds >= 0 &&
2489 request->resend_mds == request->mds &&
2490 (in == NULL ||
2491 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2492 request->sent_on_mseq == it->second.mseq)) {
2493 ldout(cct, 20) << "have to return ESTALE" << dendl;
2494 } else {
2495 request->caller_cond->notify_all();
2496 return;
2497 }
2498 }
2499
2500 ceph_assert(!request->reply);
2501 request->reply = reply;
2502 insert_trace(request, session);
2503
2504 // Handle unsafe reply
2505 if (!is_safe) {
2506 request->got_unsafe = true;
2507 session->unsafe_requests.push_back(&request->unsafe_item);
2508 if (is_dir_operation(request)) {
2509 Inode *dir = request->inode();
2510 ceph_assert(dir);
2511 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2512 }
2513 if (request->target) {
2514 InodeRef &in = request->target;
2515 in->unsafe_ops.push_back(&request->unsafe_target_item);
2516 }
2517 }
2518
2519 // Only signal the caller once (on the first reply):
2520 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2521 if (!is_safe || !request->got_unsafe) {
2522 ceph::condition_variable cond;
2523 request->dispatch_cond = &cond;
2524
2525 // wake up waiter
2526 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2527 request->caller_cond->notify_all();
2528
2529 // wake for kick back
2530 std::unique_lock l{client_lock, std::adopt_lock};
2531 cond.wait(l, [tid, request, &cond, this] {
2532 if (request->dispatch_cond) {
2533 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2534 << tid << " " << &cond << dendl;
2535 }
2536 return !request->dispatch_cond;
2537 });
2538 l.release();
2539 }
2540
2541 if (is_safe) {
2542 // the filesystem change is committed to disk
2543 // we're done, clean up
2544 if (request->got_unsafe) {
2545 request->unsafe_item.remove_myself();
2546 request->unsafe_dir_item.remove_myself();
2547 request->unsafe_target_item.remove_myself();
2548 signal_cond_list(request->waitfor_safe);
2549 }
2550 request->item.remove_myself();
2551 unregister_request(request);
2552 }
2553 if (is_unmounting())
2554 mount_cond.notify_all();
2555 }
2556
2557 void Client::_handle_full_flag(int64_t pool)
2558 {
2559 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2560 << "on " << pool << dendl;
2561 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2562 // to do this rather than blocking, because otherwise when we fill up we
2563 // potentially lock caps forever on files with dirty pages, and we need
2564 // to be able to release those caps to the MDS so that it can delete files
2565 // and free up space.
2566 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2567
2568 // For all inodes with layouts in this pool and a pending flush write op
2569 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2570 // from ObjectCacher so that it doesn't re-issue the write in response to
2571 // the ENOSPC error.
2572 // Fortunately since we're cancelling everything in a given pool, we don't
2573 // need to know which ops belong to which ObjectSet, we can just blow all
2574 // the un-flushed cached data away and mark any dirty inodes' async_err
2575 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2576 // affecting this pool, and all the objectsets we're purging were also
2577 // in this pool.
2578 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2579 i != inode_map.end(); ++i)
2580 {
2581 Inode *inode = i->second;
2582 if (inode->oset.dirty_or_tx
2583 && (pool == -1 || inode->layout.pool_id == pool)) {
2584 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2585 << " has dirty objects, purging and setting ENOSPC" << dendl;
2586 objectcacher->purge_set(&inode->oset);
2587 inode->set_async_err(-CEPHFS_ENOSPC);
2588 }
2589 }
2590
2591 if (cancelled_epoch != (epoch_t)-1) {
2592 set_cap_epoch_barrier(cancelled_epoch);
2593 }
2594 }
2595
2596 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2597 {
2598 std::set<entity_addr_t> new_blocklists;
2599
2600 std::scoped_lock cl(client_lock);
2601 objecter->consume_blocklist_events(&new_blocklists);
2602
2603 const auto myaddrs = messenger->get_myaddrs();
2604 bool new_blocklist = false;
2605 bool prenautilus = objecter->with_osdmap(
2606 [&](const OSDMap& o) {
2607 return o.require_osd_release < ceph_release_t::nautilus;
2608 });
2609 if (!blocklisted) {
2610 for (auto a : myaddrs.v) {
2611 // blocklist entries are always TYPE_ANY for nautilus+
2612 a.set_type(entity_addr_t::TYPE_ANY);
2613 if (new_blocklists.count(a)) {
2614 new_blocklist = true;
2615 break;
2616 }
2617 if (prenautilus) {
2618 // ...except pre-nautilus, they were TYPE_LEGACY
2619 a.set_type(entity_addr_t::TYPE_LEGACY);
2620 if (new_blocklists.count(a)) {
2621 new_blocklist = true;
2622 break;
2623 }
2624 }
2625 }
2626 }
2627 if (new_blocklist) {
2628 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2629 return o.get_epoch();
2630 });
2631 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2632 blocklisted = true;
2633
2634 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2635
2636 // Since we know all our OSD ops will fail, cancel them all preemptively,
2637 // so that on an unhealthy cluster we can umount promptly even if e.g.
2638 // some PGs were inaccessible.
2639 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2640
2641 }
2642
2643 if (blocklisted) {
2644 // Handle case where we were blocklisted but no longer are
2645 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2646 return o.is_blocklisted(myaddrs);});
2647 }
2648
2649 // While blocklisted, keep subscribing to the next osdmap
2650 // until this client is no longer blocklisted.
2651 if (blocklisted) {
2652 objecter->maybe_request_map();
2653 }
2654
2655 if (objecter->osdmap_full_flag()) {
2656 _handle_full_flag(-1);
2657 } else {
2658 // Accumulate local list of full pools so that I can drop
2659 // the objecter lock before re-entering objecter in
2660 // cancel_writes
2661 std::vector<int64_t> full_pools;
2662
2663 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2664 for (const auto& kv : o.get_pools()) {
2665 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2666 full_pools.push_back(kv.first);
2667 }
2668 }
2669 });
2670
2671 for (auto p : full_pools)
2672 _handle_full_flag(p);
2673
2674 // Subscribe to subsequent maps to watch for the full flag going
2675 // away. For the global full flag objecter does this for us, but
2676 // it pays no attention to the per-pool full flag so in this branch
2677 // we do it ourselves.
2678 if (!full_pools.empty()) {
2679 objecter->maybe_request_map();
2680 }
2681 }
2682 }
2683
2684
2685 // ------------------------
2686 // incoming messages
2687
2688
2689 bool Client::ms_dispatch2(const MessageRef &m)
2690 {
2691 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2692 if (!iref_reader.is_state_satisfied()) {
2693 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2694 return true;
2695 }
2696
2697 switch (m->get_type()) {
2698 // mounting and mds sessions
2699 case CEPH_MSG_MDS_MAP:
2700 handle_mds_map(ref_cast<MMDSMap>(m));
2701 break;
2702 case CEPH_MSG_FS_MAP:
2703 handle_fs_map(ref_cast<MFSMap>(m));
2704 break;
2705 case CEPH_MSG_FS_MAP_USER:
2706 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2707 break;
2708 case CEPH_MSG_CLIENT_SESSION:
2709 handle_client_session(ref_cast<MClientSession>(m));
2710 break;
2711
2712 case CEPH_MSG_OSD_MAP:
2713 handle_osd_map(ref_cast<MOSDMap>(m));
2714 break;
2715
2716 // requests
2717 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2718 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2719 break;
2720 case CEPH_MSG_CLIENT_REPLY:
2721 handle_client_reply(ref_cast<MClientReply>(m));
2722 break;
2723
2724 // reclaim reply
2725 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2726 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2727 break;
2728
2729 case CEPH_MSG_CLIENT_SNAP:
2730 handle_snap(ref_cast<MClientSnap>(m));
2731 break;
2732 case CEPH_MSG_CLIENT_CAPS:
2733 handle_caps(ref_cast<MClientCaps>(m));
2734 break;
2735 case CEPH_MSG_CLIENT_LEASE:
2736 handle_lease(ref_cast<MClientLease>(m));
2737 break;
2738 case MSG_COMMAND_REPLY:
2739 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2740 handle_command_reply(ref_cast<MCommandReply>(m));
2741 } else {
2742 return false;
2743 }
2744 break;
2745 case CEPH_MSG_CLIENT_QUOTA:
2746 handle_quota(ref_cast<MClientQuota>(m));
2747 break;
2748
2749 default:
2750 return false;
2751 }
2752
2753 // unmounting?
2754 std::scoped_lock cl(client_lock);
2755 if (is_unmounting()) {
2756 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2757 << "+" << inode_map.size() << dendl;
2758 uint64_t size = lru.lru_get_size() + inode_map.size();
2759 trim_cache();
2760 if (size > lru.lru_get_size() + inode_map.size()) {
2761 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2762 mount_cond.notify_all();
2763 } else {
2764 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2765 << "+" << inode_map.size() << dendl;
2766 }
2767 }
2768
2769 return true;
2770 }
2771
2772 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2773 {
2774 std::scoped_lock cl(client_lock);
2775 fsmap.reset(new FSMap(m->get_fsmap()));
2776
2777 signal_cond_list(waiting_for_fsmap);
2778
2779 monclient->sub_got("fsmap", fsmap->get_epoch());
2780 }
2781
2782 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2783 {
2784 std::scoped_lock cl(client_lock);
2785 fsmap_user.reset(new FSMapUser);
2786 *fsmap_user = m->get_fsmap();
2787
2788 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2789 signal_cond_list(waiting_for_fsmap);
2790 }
2791
2792 // Cancel all the commands for missing or laggy GIDs
2793 void Client::cancel_commands(const MDSMap& newmap)
2794 {
2795 std::vector<ceph_tid_t> cancel_ops;
2796
2797 std::scoped_lock cmd_lock(command_lock);
2798 auto &commands = command_table.get_commands();
2799 for (const auto &[tid, op] : commands) {
2800 const mds_gid_t op_mds_gid = op.mds_gid;
2801 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2802 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2803 cancel_ops.push_back(tid);
2804 if (op.outs) {
2805 std::ostringstream ss;
2806 ss << "MDS " << op_mds_gid << " went away";
2807 *(op.outs) = ss.str();
2808 }
2809 /*
2810 * No need to call con->mark_down() under
2811 * client_lock here, because the con
2812 * has its own lock.
2813 */
2814 op.con->mark_down();
2815 if (op.on_finish)
2816 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2817 }
2818 }
2819
2820 for (const auto &tid : cancel_ops)
2821 command_table.erase(tid);
2822 }
2823
2824 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2825 {
2826 std::unique_lock cl(client_lock);
2827 if (m->get_epoch() <= mdsmap->get_epoch()) {
2828 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2829 << " is identical to or older than our "
2830 << mdsmap->get_epoch() << dendl;
2831 return;
2832 }
2833
2834 cl.unlock();
2835 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2836 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2837 _mdsmap->decode(m->get_encoded());
2838 cancel_commands(*_mdsmap.get());
2839 cl.lock();
2840
2841 _mdsmap.swap(mdsmap);
2842
2843 // reset session
2844 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2845 mds_rank_t mds = p->first;
2846 MetaSession *session = &p->second;
2847 ++p;
2848
2849 int oldstate = _mdsmap->get_state(mds);
2850 int newstate = mdsmap->get_state(mds);
2851 if (!mdsmap->is_up(mds)) {
2852 session->con->mark_down();
2853 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2854 auto old_inc = _mdsmap->get_incarnation(mds);
2855 auto new_inc = mdsmap->get_incarnation(mds);
2856 if (old_inc != new_inc) {
2857 ldout(cct, 1) << "mds incarnation changed from "
2858 << old_inc << " to " << new_inc << dendl;
2859 oldstate = MDSMap::STATE_NULL;
2860 }
2861 session->con->mark_down();
2862 session->addrs = mdsmap->get_addrs(mds);
2863 // When new MDS starts to take over, notify kernel to trim unused entries
2864 // in its dcache/icache. Hopefully, the kernel will release some unused
2865 // inodes before the new MDS enters reconnect state.
2866 trim_cache_for_reconnect(session);
2867 } else if (oldstate == newstate)
2868 continue; // no change
2869
2870 session->mds_state = newstate;
2871 if (newstate == MDSMap::STATE_RECONNECT) {
2872 session->con = messenger->connect_to_mds(session->addrs);
2873 send_reconnect(session);
2874 } else if (newstate > MDSMap::STATE_RECONNECT) {
2875 if (oldstate < MDSMap::STATE_RECONNECT) {
2876 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2877 _closed_mds_session(session);
2878 continue;
2879 }
2880 if (newstate >= MDSMap::STATE_ACTIVE) {
2881 if (oldstate < MDSMap::STATE_ACTIVE) {
2882 // kick new requests
2883 kick_requests(session);
2884 kick_flushing_caps(session);
2885 signal_context_list(session->waiting_for_open);
2886 wake_up_session_caps(session, true);
2887 }
2888 connect_mds_targets(mds);
2889 }
2890 } else if (newstate == MDSMap::STATE_NULL &&
2891 mds >= mdsmap->get_max_mds()) {
2892 _closed_mds_session(session);
2893 }
2894 }
2895
2896 // kick any waiting threads
2897 signal_cond_list(waiting_for_mdsmap);
2898
2899 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2900 }
2901
2902 void Client::send_reconnect(MetaSession *session)
2903 {
2904 mds_rank_t mds = session->mds_num;
2905 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2906
2907 // trim unused caps to reduce MDS's cache rejoin time
2908 trim_cache_for_reconnect(session);
2909
2910 session->readonly = false;
2911
2912 session->release.reset();
2913
2914 // reset my cap seq number
2915 session->seq = 0;
2916 // connect to the mds' offload targets
2917 connect_mds_targets(mds);
2918 // make sure unsafe requests get saved
2919 resend_unsafe_requests(session);
2920
2921 early_kick_flushing_caps(session);
2922
2923 auto m = make_message<MClientReconnect>();
2924 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2925
2926 // i have an open session.
2927 ceph::unordered_set<inodeno_t> did_snaprealm;
2928 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2929 p != inode_map.end();
2930 ++p) {
2931 Inode *in = p->second;
2932 auto it = in->caps.find(mds);
2933 if (it != in->caps.end()) {
2934 if (allow_multi &&
2935 m->get_approx_size() >=
2936 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2937 m->mark_more();
2938 session->con->send_message2(std::move(m));
2939
2940 m = make_message<MClientReconnect>();
2941 }
2942
2943 Cap &cap = it->second;
2944 ldout(cct, 10) << " caps on " << p->first
2945 << " " << ccap_string(cap.issued)
2946 << " wants " << ccap_string(in->caps_wanted())
2947 << dendl;
2948 filepath path;
2949 in->make_short_path(path);
2950 ldout(cct, 10) << " path " << path << dendl;
2951
2952 bufferlist flockbl;
2953 _encode_filelocks(in, flockbl);
2954
2955 cap.seq = 0; // reset seq.
2956 cap.issue_seq = 0; // reset seq.
2957 cap.mseq = 0; // reset seq.
2958 // cap gen should catch up with session cap_gen
2959 if (cap.gen < session->cap_gen) {
2960 cap.gen = session->cap_gen;
2961 cap.issued = cap.implemented = CEPH_CAP_PIN;
2962 } else {
2963 cap.issued = cap.implemented;
2964 }
2965 snapid_t snap_follows = 0;
2966 if (!in->cap_snaps.empty())
2967 snap_follows = in->cap_snaps.begin()->first;
2968
2969 m->add_cap(p->first.ino,
2970 cap.cap_id,
2971 path.get_ino(), path.get_path(), // ino
2972 in->caps_wanted(), // wanted
2973 cap.issued, // issued
2974 in->snaprealm->ino,
2975 snap_follows,
2976 flockbl);
2977
2978 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2979 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2980 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2981 did_snaprealm.insert(in->snaprealm->ino);
2982 }
2983 }
2984 }
2985
2986 if (!allow_multi)
2987 m->set_encoding_version(0); // use connection features to choose encoding
2988 session->con->send_message2(std::move(m));
2989
2990 mount_cond.notify_all();
2991
2992 if (session->reclaim_state == MetaSession::RECLAIMING)
2993 signal_cond_list(waiting_for_reclaim);
2994 }
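/*
 * When the peer supports multi-part reconnects, the loop above
 * flushes a partially filled MClientReconnect once it approaches a
 * size cap and then starts a new one. A sketch of that batching
 * pattern with a hypothetical sender (names illustrative only):
 *
 *   #include <cstddef>
 *   #include <string>
 *   #include <vector>
 *
 *   void send_batched(const std::vector<std::string>& items,
 *                     std::size_t max_batch_bytes,
 *                     void (*send)(std::vector<std::string>&&))
 *   {
 *     std::vector<std::string> batch;
 *     std::size_t bytes = 0;
 *     for (const auto& i : items) {
 *       if (!batch.empty() && bytes + i.size() > max_batch_bytes) {
 *         send(std::move(batch));  // flush a full "more to come" part
 *         batch.clear();
 *         bytes = 0;
 *       }
 *       batch.push_back(i);
 *       bytes += i.size();
 *     }
 *     send(std::move(batch));      // final (possibly only) part
 *   }
 */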
2995
2996
2997 void Client::kick_requests(MetaSession *session)
2998 {
2999 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3000 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3001 p != mds_requests.end();
3002 ++p) {
3003 MetaRequest *req = p->second;
3004 if (req->got_unsafe)
3005 continue;
3006 if (req->aborted()) {
3007 if (req->caller_cond) {
3008 req->kick = true;
3009 req->caller_cond->notify_all();
3010 }
3011 continue;
3012 }
3013 if (req->retry_attempt > 0)
3014 continue; // new requests only
3015 if (req->mds == session->mds_num) {
3016 send_request(p->second, session);
3017 }
3018 }
3019 }
3020
3021 void Client::resend_unsafe_requests(MetaSession *session)
3022 {
3023 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3024 !iter.end();
3025 ++iter)
3026 send_request(*iter, session);
3027
3028 // Also re-send old requests when the MDS enters the reconnect stage, so
3029 // that the MDS can process completed requests in the clientreplay stage.
3030 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3031 p != mds_requests.end();
3032 ++p) {
3033 MetaRequest *req = p->second;
3034 if (req->got_unsafe)
3035 continue;
3036 if (req->aborted())
3037 continue;
3038 if (req->retry_attempt == 0)
3039 continue; // old requests only
3040 if (req->mds == session->mds_num)
3041 send_request(req, session, true);
3042 }
3043 }
3044
3045 void Client::wait_unsafe_requests()
3046 {
3047 list<MetaRequest*> last_unsafe_reqs;
3048 for (const auto &p : mds_sessions) {
3049 const MetaSession &s = p.second;
3050 if (!s.unsafe_requests.empty()) {
3051 MetaRequest *req = s.unsafe_requests.back();
3052 req->get();
3053 last_unsafe_reqs.push_back(req);
3054 }
3055 }
3056
3057 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3058 p != last_unsafe_reqs.end();
3059 ++p) {
3060 MetaRequest *req = *p;
3061 if (req->unsafe_item.is_on_list())
3062 wait_on_list(req->waitfor_safe);
3063 put_request(req);
3064 }
3065 }
3066
3067 void Client::kick_requests_closed(MetaSession *session)
3068 {
3069 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3070 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3071 p != mds_requests.end(); ) {
3072 MetaRequest *req = p->second;
3073 ++p;
3074 if (req->mds == session->mds_num) {
3075 if (req->caller_cond) {
3076 req->kick = true;
3077 req->caller_cond->notify_all();
3078 }
3079 req->item.remove_myself();
3080 if (req->got_unsafe) {
3081 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3082 req->unsafe_item.remove_myself();
3083 if (is_dir_operation(req)) {
3084 Inode *dir = req->inode();
3085 ceph_assert(dir);
3086 dir->set_async_err(-CEPHFS_EIO);
3087 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3088 << dir->ino << " " << req->get_tid() << dendl;
3089 req->unsafe_dir_item.remove_myself();
3090 }
3091 if (req->target) {
3092 InodeRef &in = req->target;
3093 in->set_async_err(-CEPHFS_EIO);
3094 lderr(cct) << "kick_requests_closed drop req of inode : "
3095 << in->ino << " " << req->get_tid() << dendl;
3096 req->unsafe_target_item.remove_myself();
3097 }
3098 signal_cond_list(req->waitfor_safe);
3099 unregister_request(req);
3100 }
3101 }
3102 }
3103 ceph_assert(session->requests.empty());
3104 ceph_assert(session->unsafe_requests.empty());
3105 }
3106
3107
3108
3109
3110 /************
3111 * leases
3112 */
3113
3114 void Client::got_mds_push(MetaSession *s)
3115 {
3116 s->seq++;
3117 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3118 if (s->state == MetaSession::STATE_CLOSING) {
3119 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3120 }
3121 }
3122
3123 void Client::handle_lease(const MConstRef<MClientLease>& m)
3124 {
3125 ldout(cct, 10) << __func__ << " " << *m << dendl;
3126
3127 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3128 mds_rank_t mds = mds_rank_t(m->get_source().num());
3129
3130 std::scoped_lock cl(client_lock);
3131 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3132 if (!session) {
3133 return;
3134 }
3135
3136 got_mds_push(session);
3137
3138 ceph_seq_t seq = m->get_seq();
3139
3140 Inode *in;
3141 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3142 if (inode_map.count(vino) == 0) {
3143 ldout(cct, 10) << " don't have vino " << vino << dendl;
3144 goto revoke;
3145 }
3146 in = inode_map[vino];
3147
3148 if (m->get_mask() & CEPH_LEASE_VALID) {
3149 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3150 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3151 goto revoke;
3152 }
3153 Dentry *dn = in->dir->dentries[m->dname];
3154 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3155 dn->lease_mds = -1;
3156 }
3157
3158 revoke:
3159 {
3160 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3161 m->get_mask(), m->get_ino(),
3162 m->get_first(), m->get_last(), m->dname);
3163 m->get_connection()->send_message2(std::move(reply));
3164 }
3165 }
3166
3167 void Client::_put_inode(Inode *in, int n)
3168 {
3169 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3170
3171 int left = in->_put(n);
3172 if (left == 0) {
3173 // release any caps
3174 remove_all_caps(in);
3175
3176 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3177 bool unclean = objectcacher->release_set(&in->oset);
3178 ceph_assert(!unclean);
3179 inode_map.erase(in->vino());
3180 if (use_faked_inos())
3181 _release_faked_ino(in);
3182
3183 if (in == root) {
3184 root = 0;
3185 root_ancestor = 0;
3186 while (!root_parents.empty())
3187 root_parents.erase(root_parents.begin());
3188 }
3189
3190 delete in;
3191 }
3192 }
3193
3194 void Client::delay_put_inodes(bool wakeup)
3195 {
3196 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3197
3198 std::map<Inode*,int> release;
3199 {
3200 std::scoped_lock dl(delay_i_lock);
3201 release.swap(delay_i_release);
3202 }
3203
3204 if (release.empty())
3205 return;
3206
3207 for (auto &[in, cnt] : release)
3208 _put_inode(in, cnt);
3209
3210 if (wakeup)
3211 mount_cond.notify_all();
3212 }
3213
3214 void Client::put_inode(Inode *in, int n)
3215 {
3216 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3217
3218 std::scoped_lock dl(delay_i_lock);
3219 delay_i_release[in] += n;
3220 }
3221
3222 void Client::close_dir(Dir *dir)
3223 {
3224 Inode *in = dir->parent_inode;
3225 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3226 ceph_assert(dir->is_empty());
3227 ceph_assert(in->dir == dir);
3228 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3229 if (!in->dentries.empty())
3230 in->get_first_parent()->put(); // unpin dentry
3231
3232 delete in->dir;
3233 in->dir = 0;
3234 put_inode(in); // unpin inode
3235 }
3236
3237 /**
3238 * Don't call this with in==NULL; use get_or_create for that.
3239 * Leave dn set to the default NULL unless you're trying to add
3240 * a new inode to a pre-created Dentry.
3241 */
3242 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3243 {
3244 if (!dn) {
3245 // create a new Dentry
3246 dn = new Dentry(dir, name);
3247
3248 lru.lru_insert_mid(dn); // mid or top?
3249
3250 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3251 << " dn " << dn << " (new dn)" << dendl;
3252 } else {
3253 ceph_assert(!dn->inode);
3254 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3255 << " dn " << dn << " (old dn)" << dendl;
3256 }
3257
3258 if (in) { // link to inode
3259 InodeRef tmp_ref;
3260 // only one parent for directories!
3261 if (in->is_dir() && !in->dentries.empty()) {
3262 tmp_ref = in; // prevent unlink below from freeing the inode.
3263 Dentry *olddn = in->get_first_parent();
3264 ceph_assert(olddn->dir != dir || olddn->name != name);
3265 Inode *old_diri = olddn->dir->parent_inode;
3266 clear_dir_complete_and_ordered(old_diri, true);
3267 unlink(olddn, true, true); // keep dir, dentry
3268 }
3269
3270 dn->link(in);
3271 inc_dentry_nr();
3272 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3273 }
3274
3275 return dn;
3276 }
3277
3278 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3279 {
3280 InodeRef in(dn->inode);
3281 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3282 << " inode " << dn->inode << dendl;
3283
3284 // unlink from inode
3285 if (dn->inode) {
3286 dn->unlink();
3287 dec_dentry_nr();
3288 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3289 }
3290
3291 if (keepdentry) {
3292 dn->lease_mds = -1;
3293 } else {
3294 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3295
3296 // unlink from dir
3297 Dir *dir = dn->dir;
3298 dn->detach();
3299
3300 // delete den
3301 lru.lru_remove(dn);
3302 dn->put();
3303
3304 if (dir->is_empty() && !keepdir)
3305 close_dir(dir);
3306 }
3307 }
3308
3309 /**
3310 * For asynchronous flushes, check for errors from the IO and
3311 * update the inode if necessary
3312 */
3313 class C_Client_FlushComplete : public Context {
3314 private:
3315 Client *client;
3316 InodeRef inode;
3317 public:
3318 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3319 void finish(int r) override {
3320 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3321 if (r != 0) {
3322 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3323 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3324 << " 0x" << std::hex << inode->ino << std::dec
3325 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3326 inode->set_async_err(r);
3327 }
3328 }
3329 };
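/*
 * C_Client_FlushComplete follows the usual Context completion shape:
 * the IO path calls complete(r) exactly once, which runs finish(r)
 * with the operation's result and then frees the callback. A hedged,
 * simplified sketch of that contract (the real ceph Context carries
 * more machinery than this):
 *
 *   struct MiniContext {
 *     virtual ~MiniContext() = default;
 *     virtual void finish(int r) = 0;
 *     void complete(int r) { finish(r); delete this; }  // one-shot
 *   };
 *
 *   struct RecordError : MiniContext {
 *     int *err;
 *     explicit RecordError(int *e) : err(e) {}
 *     void finish(int r) override {
 *       if (r != 0)
 *         *err = r;   // surface the async error, as set_async_err does
 *     }
 *   };
 *
 *   // usage: hand `new RecordError(&async_err)` to the IO subsystem,
 *   // which calls complete(r) when the flush finishes.
 */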
3330
3331
3332 /****
3333 * caps
3334 */
3335
3336 void Client::get_cap_ref(Inode *in, int cap)
3337 {
3338 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3339 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3340 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3341 in->get();
3342 }
3343 if ((cap & CEPH_CAP_FILE_CACHE) &&
3344 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3345 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3346 in->get();
3347 }
3348 in->get_cap_ref(cap);
3349 }
3350
3351 void Client::put_cap_ref(Inode *in, int cap)
3352 {
3353 int last = in->put_cap_ref(cap);
3354 if (last) {
3355 int put_nref = 0;
3356 int drop = last & ~in->caps_issued();
3357 if (in->snapid == CEPH_NOSNAP) {
3358 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3359 !in->cap_snaps.empty() &&
3360 in->cap_snaps.rbegin()->second.writing) {
3361 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3362 in->cap_snaps.rbegin()->second.writing = 0;
3363 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3364 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3365 }
3366 if (last & CEPH_CAP_FILE_BUFFER) {
3367 for (auto &p : in->cap_snaps)
3368 p.second.dirty_data = 0;
3369 signal_cond_list(in->waitfor_commit);
3370 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3371 ++put_nref;
3372 }
3373 }
3374 if (last & CEPH_CAP_FILE_CACHE) {
3375 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3376 ++put_nref;
3377 }
3378 if (drop)
3379 check_caps(in, 0);
3380 if (put_nref)
3381 put_inode(in, put_nref);
3382 }
3383 }
3384
3385 // Get caps for a given file handle -- the inode should have @need caps
3386 // issued by the MDS and @want caps not revoked (or not under revocation).
3387 // This routine blocks until the cap requirement is satisfied, and also
3388 // accounts (tracks) a capability hit when the cap requirement succeeds.
3389 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3390 {
3391 Inode *in = fh->inode.get();
3392
3393 int r = check_pool_perm(in, need);
3394 if (r < 0)
3395 return r;
3396
3397 while (1) {
3398 int file_wanted = in->caps_file_wanted();
3399 if ((file_wanted & need) != need) {
3400 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3401 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3402 << dendl;
3403 return -CEPHFS_EBADF;
3404 }
3405
3406 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3407 return -CEPHFS_EBADF;
3408
3409 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3410 return -CEPHFS_EIO;
3411
3412 int implemented;
3413 int have = in->caps_issued(&implemented);
3414
3415 bool waitfor_caps = false;
3416 bool waitfor_commit = false;
3417
3418 if (have & need & CEPH_CAP_FILE_WR) {
3419 if (endoff > 0) {
3420 if ((endoff >= (loff_t)in->max_size ||
3421 endoff > (loff_t)(in->size << 1)) &&
3422 endoff > (loff_t)in->wanted_max_size) {
3423 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3424 in->wanted_max_size = endoff;
3425 }
3426 if (in->wanted_max_size > in->max_size &&
3427 in->wanted_max_size > in->requested_max_size)
3428 check_caps(in, 0);
3429 }
3430
3431 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3432 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3433 waitfor_caps = true;
3434 }
3435 if (!in->cap_snaps.empty()) {
3436 if (in->cap_snaps.rbegin()->second.writing) {
3437 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3438 waitfor_caps = true;
3439 }
3440 for (auto &p : in->cap_snaps) {
3441 if (p.second.dirty_data) {
3442 waitfor_commit = true;
3443 break;
3444 }
3445 }
3446 if (waitfor_commit) {
3447 _flush(in, new C_Client_FlushComplete(this, in));
3448 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3449 }
3450 }
3451 }
3452
3453 if (!waitfor_caps && !waitfor_commit) {
3454 if ((have & need) == need) {
3455 int revoking = implemented & ~have;
3456 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3457 << " need " << ccap_string(need) << " want " << ccap_string(want)
3458 << " revoking " << ccap_string(revoking)
3459 << dendl;
3460 if ((revoking & want) == 0) {
3461 *phave = need | (have & want);
3462 in->get_cap_ref(need);
3463 cap_hit();
3464 return 0;
3465 }
3466 }
3467 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3468 waitfor_caps = true;
3469 }
3470
3471 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3472 in->auth_cap->session->readonly)
3473 return -CEPHFS_EROFS;
3474
3475 if (in->flags & I_CAP_DROPPED) {
3476 int mds_wanted = in->caps_mds_wanted();
3477 if ((mds_wanted & need) != need) {
3478 int ret = _renew_caps(in);
3479 if (ret < 0)
3480 return ret;
3481 continue;
3482 }
3483 if (!(file_wanted & ~mds_wanted))
3484 in->flags &= ~I_CAP_DROPPED;
3485 }
3486
3487 if (waitfor_caps)
3488 wait_on_list(in->waitfor_caps);
3489 else if (waitfor_commit)
3490 wait_on_list(in->waitfor_commit);
3491 }
3492 }
3493
3494 int Client::get_caps_used(Inode *in)
3495 {
3496 unsigned used = in->caps_used();
3497 if (!(used & CEPH_CAP_FILE_CACHE) &&
3498 !objectcacher->set_is_empty(&in->oset))
3499 used |= CEPH_CAP_FILE_CACHE;
3500 return used;
3501 }
3502
3503 void Client::cap_delay_requeue(Inode *in)
3504 {
3505 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3506 in->hold_caps_until = ceph_clock_now();
3507 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3508 delayed_list.push_back(&in->delay_cap_item);
3509 }
3510
3511 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3512 int flags, int used, int want, int retain,
3513 int flush, ceph_tid_t flush_tid)
3514 {
3515 int held = cap->issued | cap->implemented;
3516 int revoking = cap->implemented & ~cap->issued;
3517 retain &= ~revoking;
3518 int dropping = cap->issued & ~retain;
3519 int op = CEPH_CAP_OP_UPDATE;
3520
3521 ldout(cct, 10) << __func__ << " " << *in
3522 << " mds." << session->mds_num << " seq " << cap->seq
3523 << " used " << ccap_string(used)
3524 << " want " << ccap_string(want)
3525 << " flush " << ccap_string(flush)
3526 << " retain " << ccap_string(retain)
3527 << " held "<< ccap_string(held)
3528 << " revoking " << ccap_string(revoking)
3529 << " dropping " << ccap_string(dropping)
3530 << dendl;
3531
3532 if (cct->_conf->client_inject_release_failure && revoking) {
3533 const int would_have_issued = cap->issued & retain;
3534 const int would_have_implemented = cap->implemented & (cap->issued | used);
3535 // Simulated bug:
3536 // - tell the server we think issued is whatever they issued plus whatever we implemented
3537 // - leave what we have implemented in place
3538 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3539 cap->issued = cap->issued | cap->implemented;
3540
3541 // Make an exception for revoking xattr caps: we are injecting
3542 // failure to release other caps, but allow xattr because client
3543 // will block on xattr ops if it can't release these to MDS (#9800)
3544 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3545 cap->issued ^= xattr_mask & revoking;
3546 cap->implemented ^= xattr_mask & revoking;
3547
3548 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3549 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3550 } else {
3551 // Normal behaviour
3552 cap->issued &= retain;
3553 cap->implemented &= cap->issued | used;
3554 }
3555
3556 snapid_t follows = 0;
3557
3558 if (flush)
3559 follows = in->snaprealm->get_snap_context().seq;
3560
3561 auto m = make_message<MClientCaps>(op,
3562 in->ino,
3563 0,
3564 cap->cap_id, cap->seq,
3565 cap->implemented,
3566 want,
3567 flush,
3568 cap->mseq,
3569 cap_epoch_barrier);
3570 m->caller_uid = in->cap_dirtier_uid;
3571 m->caller_gid = in->cap_dirtier_gid;
3572
3573 m->head.issue_seq = cap->issue_seq;
3574 m->set_tid(flush_tid);
3575
3576 m->head.uid = in->uid;
3577 m->head.gid = in->gid;
3578 m->head.mode = in->mode;
3579
3580 m->head.nlink = in->nlink;
3581
3582 if (flush & CEPH_CAP_XATTR_EXCL) {
3583 encode(in->xattrs, m->xattrbl);
3584 m->head.xattr_version = in->xattr_version;
3585 }
3586
3587 m->size = in->size;
3588 m->max_size = in->max_size;
3589 m->truncate_seq = in->truncate_seq;
3590 m->truncate_size = in->truncate_size;
3591 m->mtime = in->mtime;
3592 m->atime = in->atime;
3593 m->ctime = in->ctime;
3594 m->btime = in->btime;
3595 m->time_warp_seq = in->time_warp_seq;
3596 m->change_attr = in->change_attr;
3597
3598 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3599 !in->cap_snaps.empty() &&
3600 in->cap_snaps.rbegin()->second.flush_tid == 0)
3601 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3602 m->flags = flags;
3603
3604 if (flush & CEPH_CAP_FILE_WR) {
3605 m->inline_version = in->inline_version;
3606 m->inline_data = in->inline_data;
3607 }
3608
3609 in->reported_size = in->size;
3610 m->set_snap_follows(follows);
3611 cap->wanted = want;
3612 if (cap == in->auth_cap) {
3613 if (want & CEPH_CAP_ANY_FILE_WR) {
3614 m->set_max_size(in->wanted_max_size);
3615 in->requested_max_size = in->wanted_max_size;
3616 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3617 } else {
3618 in->requested_max_size = 0;
3619 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3620 }
3621 }
3622
3623 if (!session->flushing_caps_tids.empty())
3624 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3625
3626 session->con->send_message2(std::move(m));
3627 }
3628
3629 static bool is_max_size_approaching(Inode *in)
3630 {
3631 /* mds will adjust max size according to the reported size */
3632 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3633 return false;
3634 if (in->size >= in->max_size)
3635 return true;
3636 /* half of previous max_size increment has been used */
3637 if (in->max_size > in->reported_size &&
3638 (in->size << 1) >= in->max_size + in->reported_size)
3639 return true;
3640 return false;
3641 }
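//
// Worked example of the threshold above (illustrative numbers): with
// reported_size = 100 and max_size = 200, the MDS granted a 100-byte
// increment. Once size reaches 150, (size << 1) = 300 >= max_size +
// reported_size = 300, i.e. half of the increment has been consumed and
// we ask for more room early, before writers stall at max_size.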
3642
3643 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3644 {
3645 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3646 return used;
3647 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3648 return used;
3649
3650 if (issued & CEPH_CAP_FILE_LAZYIO) {
3651 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3652 used &= ~CEPH_CAP_FILE_CACHE;
3653 used |= CEPH_CAP_FILE_LAZYIO;
3654 }
3655 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3656 used &= ~CEPH_CAP_FILE_BUFFER;
3657 used |= CEPH_CAP_FILE_LAZYIO;
3658 }
3659 } else {
3660 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3661 used &= ~CEPH_CAP_FILE_CACHE;
3662 used |= CEPH_CAP_FILE_LAZYIO;
3663 }
3664 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3665 used &= ~CEPH_CAP_FILE_BUFFER;
3666 used |= CEPH_CAP_FILE_LAZYIO;
3667 }
3668 }
3669 return used;
3670 }
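//
// Illustrative walk-through of the remapping above: suppose used = Fc,
// and the MDS has revoked Fc but still issues Fl (issued contains Fl but
// not Fc; implemented contains both). The first branch rewrites used
// from Fc to Fl, so the cache usage is accounted against the still-issued
// LAZYIO cap and the pending Fc revocation is not treated as blocked by
// this inode.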
3671
3672 /**
3673 * check_caps
3674 *
3675 * Examine currently used and wanted versus held caps. Release, flush or ack
3676 * revoked caps to the MDS as appropriate.
3677 *
3678 * @param in the inode to check
3679 * @param flags flags to apply to cap check
3680 */
3681 void Client::check_caps(Inode *in, unsigned flags)
3682 {
3683 unsigned wanted = in->caps_wanted();
3684 unsigned used = get_caps_used(in);
3685 unsigned cap_used;
3686
3687 int implemented;
3688 int issued = in->caps_issued(&implemented);
3689 int revoking = implemented & ~issued;
3690
3691 int orig_used = used;
3692 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3693
3694 int retain = wanted | used | CEPH_CAP_PIN;
3695 if (!is_unmounting() && in->nlink > 0) {
3696 if (wanted) {
3697 retain |= CEPH_CAP_ANY;
3698 } else if (in->is_dir() &&
3699 (issued & CEPH_CAP_FILE_SHARED) &&
3700 (in->flags & I_COMPLETE)) {
3701 // we do this here because we don't want to drop to Fs (and then
3702 // drop the Fs if we do a create!) if that alone makes us send lookups
3703 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3704 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3705 retain |= wanted;
3706 } else {
3707 retain |= CEPH_CAP_ANY_SHARED;
3708 // keep RD only if we didn't have the file open RW,
3709 // because then the mds would revoke it anyway to
3710 // journal max_size=0.
3711 if (in->max_size == 0)
3712 retain |= CEPH_CAP_ANY_RD;
3713 }
3714 }
3715
3716 ldout(cct, 10) << __func__ << " on " << *in
3717 << " wanted " << ccap_string(wanted)
3718 << " used " << ccap_string(used)
3719 << " issued " << ccap_string(issued)
3720 << " revoking " << ccap_string(revoking)
3721 << " flags=" << flags
3722 << dendl;
3723
3724 if (in->snapid != CEPH_NOSNAP)
3725 return; // snap caps last forever, can't write
3726
3727 if (in->caps.empty())
3728 return; // guard if at end of func
3729
3730 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3731 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3732 if (_release(in))
3733 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3734 }
3735
3736
3737 for (auto &p : in->caps) {
3738 mds_rank_t mds = p.first;
3739 Cap &cap = p.second;
3740
3741 MetaSession *session = &mds_sessions.at(mds);
3742
3743 cap_used = used;
3744 if (in->auth_cap && &cap != in->auth_cap)
3745 cap_used &= ~in->auth_cap->issued;
3746
3747 revoking = cap.implemented & ~cap.issued;
3748
3749 ldout(cct, 10) << " cap mds." << mds
3750 << " issued " << ccap_string(cap.issued)
3751 << " implemented " << ccap_string(cap.implemented)
3752 << " revoking " << ccap_string(revoking) << dendl;
3753
3754 if (in->wanted_max_size > in->max_size &&
3755 in->wanted_max_size > in->requested_max_size &&
3756 &cap == in->auth_cap)
3757 goto ack;
3758
3759 /* approaching file_max? */
3760 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3761 &cap == in->auth_cap &&
3762 is_max_size_approaching(in)) {
3763 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3764 << ", reported " << in->reported_size << dendl;
3765 goto ack;
3766 }
3767
3768 /* completed revocation? */
3769 if (revoking && (revoking & cap_used) == 0) {
3770 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3771 goto ack;
3772 }
3773
3774 /* want more caps from mds? */
3775 if (wanted & ~(cap.wanted | cap.issued))
3776 goto ack;
3777
3778 if (!revoking && is_unmounting() && (cap_used == 0))
3779 goto ack;
3780
3781 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3782 !in->dirty_caps) // and we have no dirty caps
3783 continue;
3784
3785 if (!(flags & CHECK_CAPS_NODELAY)) {
3786 ldout(cct, 10) << "delaying cap release" << dendl;
3787 cap_delay_requeue(in);
3788 continue;
3789 }
3790
3791 ack:
3792 if (&cap == in->auth_cap) {
3793 if (in->flags & I_KICK_FLUSH) {
3794 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3795 << " to mds." << mds << dendl;
3796 kick_flushing_caps(in, session);
3797 }
3798 if (!in->cap_snaps.empty() &&
3799 in->cap_snaps.rbegin()->second.flush_tid == 0)
3800 flush_snaps(in);
3801 }
3802
3803 int flushing;
3804 int msg_flags = 0;
3805 ceph_tid_t flush_tid;
3806 if (in->auth_cap == &cap && in->dirty_caps) {
3807 flushing = mark_caps_flushing(in, &flush_tid);
3808 if (flags & CHECK_CAPS_SYNCHRONOUS)
3809 msg_flags |= MClientCaps::FLAG_SYNC;
3810 } else {
3811 flushing = 0;
3812 flush_tid = 0;
3813 }
3814
3815 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3816 flushing, flush_tid);
3817 }
3818 }
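//
// Usage sketch: callers pass CHECK_CAPS_NODELAY to force an immediate
// decision; without it, caps that could be released are merely requeued
// via cap_delay_requeue() above. flush_caps_sync() further below also
// sets CHECK_CAPS_SYNCHRONOUS on the last inode it visits, so the final
// flush is sent with MClientCaps::FLAG_SYNC.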
3819
3820
3821 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3822 {
3823 int used = get_caps_used(in);
3824 int dirty = in->caps_dirty();
3825 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3826
3827 if (in->cap_snaps.size() &&
3828 in->cap_snaps.rbegin()->second.writing) {
3829 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3830 return;
3831 } else if (in->caps_dirty() ||
3832 (used & CEPH_CAP_FILE_WR) ||
3833 (dirty & CEPH_CAP_ANY_WR)) {
3834 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3835 ceph_assert(capsnapem.second); /* element inserted */
3836 CapSnap &capsnap = capsnapem.first->second;
3837 capsnap.context = old_snapc;
3838 capsnap.issued = in->caps_issued();
3839 capsnap.dirty = in->caps_dirty();
3840
3841 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3842
3843 capsnap.uid = in->uid;
3844 capsnap.gid = in->gid;
3845 capsnap.mode = in->mode;
3846 capsnap.btime = in->btime;
3847 capsnap.xattrs = in->xattrs;
3848 capsnap.xattr_version = in->xattr_version;
3849 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3850 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3851
3852 if (used & CEPH_CAP_FILE_WR) {
3853 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3854 capsnap.writing = 1;
3855 } else {
3856 finish_cap_snap(in, capsnap, used);
3857 }
3858 } else {
3859 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3860 }
3861 }
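//
// CapSnap lifecycle sketch: queue_cap_snap() creates the CapSnap when a
// newer snap context supersedes dirty state. If Fw is in use, the snap
// stays in the "writing" state until the writer finishes, at which point
// finish_cap_snap() below freezes size/mtime/etc.; unless buffered data
// is still pending writeback (Fb), flush_snaps() then assigns a
// flush_tid and sends CEPH_CAP_OP_FLUSHSNAP, whose ack is processed by
// handle_cap_flushsnap_ack() further below.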
3862
3863 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3864 {
3865 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3866 capsnap.size = in->size;
3867 capsnap.mtime = in->mtime;
3868 capsnap.atime = in->atime;
3869 capsnap.ctime = in->ctime;
3870 capsnap.time_warp_seq = in->time_warp_seq;
3871 capsnap.change_attr = in->change_attr;
3872 capsnap.dirty |= in->caps_dirty();
3873
3874 /* Only reset it if it wasn't set before */
3875 if (capsnap.cap_dirtier_uid == -1) {
3876 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3877 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3878 }
3879
3880 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3881 capsnap.inline_data = in->inline_data;
3882 capsnap.inline_version = in->inline_version;
3883 }
3884
3885 if (used & CEPH_CAP_FILE_BUFFER) {
3886 capsnap.writing = 1;
3887 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3888 << " WRBUFFER, delaying" << dendl;
3889 } else {
3890 capsnap.dirty_data = 0;
3891 flush_snaps(in);
3892 }
3893 }
3894
3895 void Client::send_flush_snap(Inode *in, MetaSession *session,
3896 snapid_t follows, CapSnap& capsnap)
3897 {
3898 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3899 in->ino, in->snaprealm->ino, 0,
3900 in->auth_cap->mseq, cap_epoch_barrier);
3901 m->caller_uid = capsnap.cap_dirtier_uid;
3902 m->caller_gid = capsnap.cap_dirtier_gid;
3903
3904 m->set_client_tid(capsnap.flush_tid);
3905 m->head.snap_follows = follows;
3906
3907 m->head.caps = capsnap.issued;
3908 m->head.dirty = capsnap.dirty;
3909
3910 m->head.uid = capsnap.uid;
3911 m->head.gid = capsnap.gid;
3912 m->head.mode = capsnap.mode;
3913 m->btime = capsnap.btime;
3914
3915 m->size = capsnap.size;
3916
3917 m->head.xattr_version = capsnap.xattr_version;
3918 encode(capsnap.xattrs, m->xattrbl);
3919
3920 m->ctime = capsnap.ctime;
3921 m->btime = capsnap.btime;
3922 m->mtime = capsnap.mtime;
3923 m->atime = capsnap.atime;
3924 m->time_warp_seq = capsnap.time_warp_seq;
3925 m->change_attr = capsnap.change_attr;
3926
3927 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3928 m->inline_version = in->inline_version;
3929 m->inline_data = in->inline_data;
3930 }
3931
3932 ceph_assert(!session->flushing_caps_tids.empty());
3933 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3934
3935 session->con->send_message2(std::move(m));
3936 }
3937
3938 void Client::flush_snaps(Inode *in)
3939 {
3940 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3941 ceph_assert(in->cap_snaps.size());
3942
3943 // pick auth mds
3944 ceph_assert(in->auth_cap);
3945 MetaSession *session = in->auth_cap->session;
3946
3947 for (auto &p : in->cap_snaps) {
3948 CapSnap &capsnap = p.second;
3949 // only do new flush
3950 if (capsnap.flush_tid > 0)
3951 continue;
3952
3953 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3954 << " follows " << p.first
3955 << " size " << capsnap.size
3956 << " mtime " << capsnap.mtime
3957 << " dirty_data=" << capsnap.dirty_data
3958 << " writing=" << capsnap.writing
3959 << " on " << *in << dendl;
3960 if (capsnap.dirty_data || capsnap.writing)
3961 break;
3962
3963 capsnap.flush_tid = ++last_flush_tid;
3964 session->flushing_caps_tids.insert(capsnap.flush_tid);
3965 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3966 if (!in->flushing_cap_item.is_on_list())
3967 session->flushing_caps.push_back(&in->flushing_cap_item);
3968
3969 send_flush_snap(in, session, p.first, capsnap);
3970 }
3971 }
3972
3973 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3974 {
3975 ceph::condition_variable cond;
3976 ls.push_back(&cond);
3977 std::unique_lock l{client_lock, std::adopt_lock};
3978 cond.wait(l);
3979 l.release();
3980 ls.remove(&cond);
3981 }
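//
// The adopt_lock/release() pair above is the standard pattern for
// waiting on a condition variable with a mutex that is already held and
// must still be held on return; a minimal generic sketch:
//
//   std::unique_lock l{m, std::adopt_lock}; // take ownership, no lock()
//   cv.wait(l);                             // unlocks m while blocked
//   l.release();                            // drop ownership so the
//                                           // destructor does not unlock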
3982
3983 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3984 {
3985 for (auto cond : ls) {
3986 cond->notify_all();
3987 }
3988 }
3989
3990 void Client::wait_on_context_list(list<Context*>& ls)
3991 {
3992 ceph::condition_variable cond;
3993 bool done = false;
3994 int r;
3995 ls.push_back(new C_Cond(cond, &done, &r));
3996 std::unique_lock l{client_lock, std::adopt_lock};
3997 cond.wait(l, [&done] { return done;});
3998 l.release();
3999 }
4000
4001 void Client::signal_context_list(list<Context*>& ls)
4002 {
4003 while (!ls.empty()) {
4004 ls.front()->complete(0);
4005 ls.pop_front();
4006 }
4007 }
4008
4009 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4010 {
4011 for (const auto &cap : s->caps) {
4012 auto &in = cap->inode;
4013 if (reconnect) {
4014 in.requested_max_size = 0;
4015 in.wanted_max_size = 0;
4016 } else {
4017 if (cap->gen < s->cap_gen) {
4018 // mds did not re-issue stale cap.
4019 cap->issued = cap->implemented = CEPH_CAP_PIN;
4020 // make sure mds knows what we want.
4021 if (in.caps_file_wanted() & ~cap->wanted)
4022 in.flags |= I_CAP_DROPPED;
4023 }
4024 }
4025 signal_cond_list(in.waitfor_caps);
4026 }
4027 }
4028
4029
4030 // flush dirty data (from objectcache)
4031
4032 class C_Client_CacheInvalidate : public Context {
4033 private:
4034 Client *client;
4035 vinodeno_t ino;
4036 int64_t offset, length;
4037 public:
4038 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4039 client(c), offset(off), length(len) {
4040 if (client->use_faked_inos())
4041 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4042 else
4043 ino = in->vino();
4044 }
4045 void finish(int r) override {
4046 // _async_invalidate takes the lock when it needs to; call this back from outside of the lock.
4047 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4048 client->_async_invalidate(ino, offset, length);
4049 }
4050 };
4051
4052 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4053 {
4054 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4055 if (!mref_reader.is_state_satisfied())
4056 return;
4057
4058 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4059 ino_invalidate_cb(callback_handle, ino, off, len);
4060 }
4061
4062 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4063
4064 if (ino_invalidate_cb)
4065 // we queue the invalidate, which calls the callback and decrements the ref
4066 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4067 }
4068
4069 void Client::_invalidate_inode_cache(Inode *in)
4070 {
4071 ldout(cct, 10) << __func__ << " " << *in << dendl;
4072
4073 // invalidate our userspace inode cache
4074 if (cct->_conf->client_oc) {
4075 objectcacher->release_set(&in->oset);
4076 if (!objectcacher->set_is_empty(&in->oset))
4077 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4078 }
4079
4080 _schedule_invalidate_callback(in, 0, 0);
4081 }
4082
4083 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4084 {
4085 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4086
4087 // invalidate our userspace inode cache
4088 if (cct->_conf->client_oc) {
4089 vector<ObjectExtent> ls;
4090 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4091 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4092 }
4093
4094 _schedule_invalidate_callback(in, off, len);
4095 }
4096
4097 bool Client::_release(Inode *in)
4098 {
4099 ldout(cct, 20) << "_release " << *in << dendl;
4100 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4101 _invalidate_inode_cache(in);
4102 return true;
4103 }
4104 return false;
4105 }
4106
4107 bool Client::_flush(Inode *in, Context *onfinish)
4108 {
4109 ldout(cct, 10) << "_flush " << *in << dendl;
4110
4111 if (!in->oset.dirty_or_tx) {
4112 ldout(cct, 10) << " nothing to flush" << dendl;
4113 onfinish->complete(0);
4114 return true;
4115 }
4116
4117 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4118 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4119 objectcacher->purge_set(&in->oset);
4120 if (onfinish) {
4121 onfinish->complete(-CEPHFS_ENOSPC);
4122 }
4123 return true;
4124 }
4125
4126 return objectcacher->flush_set(&in->oset, onfinish);
4127 }
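//
// Caller sketch (mirrors _flush_range() below): onfinish may fire
// immediately, so waiters must tolerate an already-completed context.
//
//   C_SaferCond flushed("example");   // hypothetical caller
//   _flush(in, &flushed);
//   client_lock.unlock();
//   int r = flushed.wait();           // 0, or -CEPHFS_ENOSPC if the
//   client_lock.lock();               // pool was full and we purged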
4128
4129 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4130 {
4131 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4132 if (!in->oset.dirty_or_tx) {
4133 ldout(cct, 10) << " nothing to flush" << dendl;
4134 return;
4135 }
4136
4137 C_SaferCond onflush("Client::_flush_range flock");
4138 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4139 offset, size, &onflush);
4140 if (!ret) {
4141 // wait for flush
4142 client_lock.unlock();
4143 onflush.wait();
4144 client_lock.lock();
4145 }
4146 }
4147
4148 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4149 {
4150 // std::scoped_lock l(client_lock);
4151 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4152 Inode *in = static_cast<Inode *>(oset->parent);
4153 ceph_assert(in);
4154 _flushed(in);
4155 }
4156
4157 void Client::_flushed(Inode *in)
4158 {
4159 ldout(cct, 10) << "_flushed " << *in << dendl;
4160
4161 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4162 }
4163
4164
4165
4166 // checks common to add_update_cap, handle_cap_grant
4167 void Client::check_cap_issue(Inode *in, unsigned issued)
4168 {
4169 unsigned had = in->caps_issued();
4170
4171 if ((issued & CEPH_CAP_FILE_CACHE) &&
4172 !(had & CEPH_CAP_FILE_CACHE))
4173 in->cache_gen++;
4174
4175 if ((issued & CEPH_CAP_FILE_SHARED) !=
4176 (had & CEPH_CAP_FILE_SHARED)) {
4177 if (issued & CEPH_CAP_FILE_SHARED)
4178 in->shared_gen++;
4179 if (in->is_dir())
4180 clear_dir_complete_and_ordered(in, true);
4181 }
4182 }
4183
4184 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4185 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4186 inodeno_t realm, int flags, const UserPerm& cap_perms)
4187 {
4188 if (!in->is_any_caps()) {
4189 ceph_assert(in->snaprealm == 0);
4190 in->snaprealm = get_snap_realm(realm);
4191 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4192 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4193 } else {
4194 ceph_assert(in->snaprealm);
4195 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4196 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4197 in->snaprealm_item.remove_myself();
4198 auto oldrealm = in->snaprealm;
4199 in->snaprealm = get_snap_realm(realm);
4200 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4201 put_snap_realm(oldrealm);
4202 }
4203 }
4204
4205 mds_rank_t mds = mds_session->mds_num;
4206 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4207 Cap &cap = capem.first->second;
4208 if (!capem.second) {
4209 if (cap.gen < mds_session->cap_gen)
4210 cap.issued = cap.implemented = CEPH_CAP_PIN;
4211
4212 /*
4213 * auth mds of the inode changed. we received the cap export
4214 * message, but still haven't received the cap import message.
4215 * handle_cap_export() updated the new auth MDS' cap.
4216 *
4217 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4218 * a message that was sent before the cap import message. So
4219 * don't remove caps.
4220 */
4221 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4222 if (&cap != in->auth_cap)
4223 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4224
4225 ceph_assert(cap.cap_id == cap_id);
4226 seq = cap.seq;
4227 mseq = cap.mseq;
4228 issued |= cap.issued;
4229 flags |= CEPH_CAP_FLAG_AUTH;
4230 }
4231 } else {
4232 inc_pinned_icaps();
4233 }
4234
4235 check_cap_issue(in, issued);
4236
4237 if (flags & CEPH_CAP_FLAG_AUTH) {
4238 if (in->auth_cap != &cap &&
4239 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4240 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4241 ldout(cct, 10) << __func__ << " changing auth cap: "
4242 << "add myself to new auth MDS' flushing caps list" << dendl;
4243 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4244 }
4245 in->auth_cap = &cap;
4246 }
4247 }
4248
4249 unsigned old_caps = cap.issued;
4250 cap.cap_id = cap_id;
4251 cap.issued = issued;
4252 cap.implemented |= issued;
4253 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4254 cap.wanted = wanted;
4255 else
4256 cap.wanted |= wanted;
4257 cap.seq = seq;
4258 cap.issue_seq = seq;
4259 cap.mseq = mseq;
4260 cap.gen = mds_session->cap_gen;
4261 cap.latest_perms = cap_perms;
4262 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4263 << " from mds." << mds
4264 << " on " << *in
4265 << dendl;
4266
4267 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4268 // non-auth MDS is revoking the newly grant caps ?
4269 for (auto &p : in->caps) {
4270 if (&p.second == &cap)
4271 continue;
4272 if (p.second.implemented & ~p.second.issued & issued) {
4273 check_caps(in, CHECK_CAPS_NODELAY);
4274 break;
4275 }
4276 }
4277 }
4278
4279 if (issued & ~old_caps)
4280 signal_cond_list(in->waitfor_caps);
4281 }
4282
4283 void Client::remove_cap(Cap *cap, bool queue_release)
4284 {
4285 auto &in = cap->inode;
4286 MetaSession *session = cap->session;
4287 mds_rank_t mds = cap->session->mds_num;
4288
4289 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4290
4291 if (queue_release) {
4292 session->enqueue_cap_release(
4293 in.ino,
4294 cap->cap_id,
4295 cap->issue_seq,
4296 cap->mseq,
4297 cap_epoch_barrier);
4298 } else {
4299 dec_pinned_icaps();
4300 }
4301
4302
4303 if (in.auth_cap == cap) {
4304 if (in.flushing_cap_item.is_on_list()) {
4305 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4306 in.flushing_cap_item.remove_myself();
4307 }
4308 in.auth_cap = NULL;
4309 }
4310 size_t n = in.caps.erase(mds);
4311 ceph_assert(n == 1);
4312 cap = nullptr;
4313
4314 if (!in.is_any_caps()) {
4315 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4316 in.snaprealm_item.remove_myself();
4317 put_snap_realm(in.snaprealm);
4318 in.snaprealm = 0;
4319 }
4320 }
4321
4322 void Client::remove_all_caps(Inode *in)
4323 {
4324 while (!in->caps.empty())
4325 remove_cap(&in->caps.begin()->second, true);
4326 }
4327
4328 void Client::remove_session_caps(MetaSession *s, int err)
4329 {
4330 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4331
4332 while (s->caps.size()) {
4333 Cap *cap = *s->caps.begin();
4334 InodeRef in(&cap->inode);
4335 bool dirty_caps = false;
4336 if (in->auth_cap == cap) {
4337 dirty_caps = in->dirty_caps | in->flushing_caps;
4338 in->wanted_max_size = 0;
4339 in->requested_max_size = 0;
4340 if (in->has_any_filelocks())
4341 in->flags |= I_ERROR_FILELOCK;
4342 }
4343 auto caps = cap->implemented;
4344 if (cap->wanted | cap->issued)
4345 in->flags |= I_CAP_DROPPED;
4346 remove_cap(cap, false);
4347 in->cap_snaps.clear();
4348 if (dirty_caps) {
4349 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4350 if (in->flushing_caps) {
4351 num_flushing_caps--;
4352 in->flushing_cap_tids.clear();
4353 }
4354 in->flushing_caps = 0;
4355 in->mark_caps_clean();
4356 put_inode(in.get());
4357 }
4358 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4359 if (caps && !in->caps_issued_mask(caps, true)) {
4360 if (err == -CEPHFS_EBLOCKLISTED) {
4361 if (in->oset.dirty_or_tx) {
4362 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4363 in->set_async_err(err);
4364 }
4365 objectcacher->purge_set(&in->oset);
4366 } else {
4367 objectcacher->release_set(&in->oset);
4368 }
4369 _schedule_invalidate_callback(in.get(), 0, 0);
4370 }
4371
4372 signal_cond_list(in->waitfor_caps);
4373 }
4374 s->flushing_caps_tids.clear();
4375 sync_cond.notify_all();
4376 }
4377
4378 int Client::_do_remount(bool retry_on_error)
4379 {
4380 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4381
4382 errno = 0;
4383 int r = remount_cb(callback_handle);
4384 if (r == 0) {
4385 retries_on_invalidate = 0;
4386 } else {
4387 int e = errno;
4388 client_t whoami = get_nodeid();
4389 if (r == -1) {
4390 lderr(cct) <<
4391 "failed to remount (to trim kernel dentries): "
4392 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4393 } else {
4394 lderr(cct) <<
4395 "failed to remount (to trim kernel dentries): "
4396 "return code = " << r << dendl;
4397 }
4398 bool should_abort =
4399 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4400 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4401 !(retry_on_error && (++retries_on_invalidate < max_retries));
4402 if (should_abort && !is_unmounting()) {
4403 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4404 ceph_abort();
4405 }
4406 }
4407 return r;
4408 }
4409
4410 class C_Client_Remount : public Context {
4411 private:
4412 Client *client;
4413 public:
4414 explicit C_Client_Remount(Client *c) : client(c) {}
4415 void finish(int r) override {
4416 ceph_assert(r == 0);
4417 client->_do_remount(true);
4418 }
4419 };
4420
4421 void Client::_invalidate_kernel_dcache()
4422 {
4423 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4424 if (!mref_reader.is_state_satisfied())
4425 return;
4426
4427 if (can_invalidate_dentries) {
4428 if (dentry_invalidate_cb && root->dir) {
4429 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4430 p != root->dir->dentries.end();
4431 ++p) {
4432 if (p->second->inode)
4433 _schedule_invalidate_dentry_callback(p->second, false);
4434 }
4435 }
4436 } else if (remount_cb) {
4437 // Hacky:
4438 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4439 remount_finisher.queue(new C_Client_Remount(this));
4440 }
4441 }
4442
4443 void Client::_trim_negative_child_dentries(InodeRef& in)
4444 {
4445 if (!in->is_dir())
4446 return;
4447
4448 Dir* dir = in->dir;
4449 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4450 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4451 Dentry *dn = p->second;
4452 ++p;
4453 ceph_assert(!dn->inode);
4454 if (dn->lru_is_expireable())
4455 unlink(dn, true, false); // keep dir, drop dentry
4456 }
4457 if (dir->dentries.empty()) {
4458 close_dir(dir);
4459 }
4460 }
4461
4462 if (in->flags & I_SNAPDIR_OPEN) {
4463 InodeRef snapdir = open_snapdir(in.get());
4464 _trim_negative_child_dentries(snapdir);
4465 }
4466 }
4467
4468 class C_Client_CacheRelease : public Context {
4469 private:
4470 Client *client;
4471 vinodeno_t ino;
4472 public:
4473 C_Client_CacheRelease(Client *c, Inode *in) :
4474 client(c) {
4475 if (client->use_faked_inos())
4476 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4477 else
4478 ino = in->vino();
4479 }
4480 void finish(int r) override {
4481 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4482 client->_async_inode_release(ino);
4483 }
4484 };
4485
4486 void Client::_async_inode_release(vinodeno_t ino)
4487 {
4488 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4489 if (!mref_reader.is_state_satisfied())
4490 return;
4491
4492 ldout(cct, 10) << __func__ << " " << ino << dendl;
4493 ino_release_cb(callback_handle, ino);
4494 }
4495
4496 void Client::_schedule_ino_release_callback(Inode *in) {
4497
4498 if (ino_release_cb)
4499 // we queue the release, which calls the callback and decrements the ref
4500 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4501 }
4502
4503 void Client::trim_caps(MetaSession *s, uint64_t max)
4504 {
4505 mds_rank_t mds = s->mds_num;
4506 size_t caps_size = s->caps.size();
4507 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4508 << " caps " << caps_size << dendl;
4509
4510 uint64_t trimmed = 0;
4511 auto p = s->caps.begin();
4512 std::set<Dentry *> to_trim; /* defer dentry trimming so that caps other
4513 * than the one we're looking at don't get deleted during traversal. */
4514 while ((caps_size - trimmed) > max && !p.end()) {
4515 Cap *cap = *p;
4516 InodeRef in(&cap->inode);
4517
4518 // Increment p early because it will be invalidated if cap
4519 // is deleted inside remove_cap
4520 ++p;
4521
4522 if (in->caps.size() > 1 && cap != in->auth_cap) {
4523 int mine = cap->issued | cap->implemented;
4524 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4525 // disposable non-auth cap
4526 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4527 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4528 cap = (remove_cap(cap, true), nullptr);
4529 trimmed++;
4530 }
4531 } else {
4532 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4533 _trim_negative_child_dentries(in);
4534 bool all = true;
4535 auto q = in->dentries.begin();
4536 while (q != in->dentries.end()) {
4537 Dentry *dn = *q;
4538 ++q;
4539 if (dn->lru_is_expireable()) {
4540 if (can_invalidate_dentries &&
4541 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4542 // Only issue one of these per DN for inodes in root: handle
4543 // others more efficiently by calling for root-child DNs at
4544 // the end of this function.
4545 _schedule_invalidate_dentry_callback(dn, true);
4546 }
4547 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4548 to_trim.insert(dn);
4549 } else {
4550 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4551 all = false;
4552 }
4553 }
4554 if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
4555 _schedule_ino_release_callback(in.get());
4556 }
4557 if (all && in->ino != MDS_INO_ROOT) {
4558 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4559 trimmed++;
4560 }
4561 }
4562 }
4563 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4564 for (const auto &dn : to_trim) {
4565 trim_dentry(dn);
4566 }
4567 to_trim.clear();
4568
4569 caps_size = s->caps.size();
4570 if (caps_size > (size_t)max)
4571 _invalidate_kernel_dcache();
4572 }
4573
4574 void Client::force_session_readonly(MetaSession *s)
4575 {
4576 s->readonly = true;
4577 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4578 auto &in = (*p)->inode;
4579 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4580 signal_cond_list(in.waitfor_caps);
4581 }
4582 }
4583
4584 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4585 {
4586 MetaSession *session = in->auth_cap->session;
4587
4588 int flushing = in->dirty_caps;
4589 ceph_assert(flushing);
4590
4591 ceph_tid_t flush_tid = ++last_flush_tid;
4592 in->flushing_cap_tids[flush_tid] = flushing;
4593
4594 if (!in->flushing_caps) {
4595 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4596 num_flushing_caps++;
4597 } else {
4598 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4599 }
4600
4601 in->flushing_caps |= flushing;
4602 in->mark_caps_clean();
4603
4604 if (!in->flushing_cap_item.is_on_list())
4605 session->flushing_caps.push_back(&in->flushing_cap_item);
4606 session->flushing_caps_tids.insert(flush_tid);
4607
4608 *ptid = flush_tid;
4609 return flushing;
4610 }
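//
// Bookkeeping sketch: if Fw|Fx are dirty, one call moves both bits from
// dirty_caps into flushing_caps under a fresh tid, e.g.
// flushing_cap_tids[42] = Fw|Fx; tid 42 also joins the session-wide
// flushing_caps_tids set, whose minimum is advertised to the MDS as the
// oldest flush tid in every cap message (see send_cap() above).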
4611
4612 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4613 {
4614 for (auto &p : in->cap_snaps) {
4615 CapSnap &capsnap = p.second;
4616 if (capsnap.flush_tid > 0) {
4617 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4618 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4619 }
4620 }
4621 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4622 it != in->flushing_cap_tids.end();
4623 ++it) {
4624 old_s->flushing_caps_tids.erase(it->first);
4625 new_s->flushing_caps_tids.insert(it->first);
4626 }
4627 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4628 }
4629
4630 /*
4631 * Flush all caps back to the MDS. Because the callers generally wait on the
4632 * result of this function (syncfs and umount cases), we set
4633 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4634 */
4635 void Client::flush_caps_sync()
4636 {
4637 ldout(cct, 10) << __func__ << dendl;
4638 xlist<Inode*>::iterator p = delayed_list.begin();
4639 while (!p.end()) {
4640 unsigned flags = CHECK_CAPS_NODELAY;
4641 Inode *in = *p;
4642
4643 ++p;
4644 delayed_list.pop_front();
4645 if (p.end() && dirty_list.empty())
4646 flags |= CHECK_CAPS_SYNCHRONOUS;
4647 check_caps(in, flags);
4648 }
4649
4650 // other caps, too
4651 p = dirty_list.begin();
4652 while (!p.end()) {
4653 unsigned flags = CHECK_CAPS_NODELAY;
4654 Inode *in = *p;
4655
4656 ++p;
4657 if (p.end())
4658 flags |= CHECK_CAPS_SYNCHRONOUS;
4659 check_caps(in, flags);
4660 }
4661 }
4662
4663 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4664 {
4665 while (in->flushing_caps) {
4666 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4667 ceph_assert(it != in->flushing_cap_tids.end());
4668 if (it->first > want)
4669 break;
4670 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4671 << ccap_string(it->second) << " want " << want
4672 << " last " << it->first << dendl;
4673 wait_on_list(in->waitfor_caps);
4674 }
4675 }
4676
4677 void Client::wait_sync_caps(ceph_tid_t want)
4678 {
4679 retry:
4680 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4681 << num_flushing_caps << " total flushing)" << dendl;
4682 for (auto &p : mds_sessions) {
4683 MetaSession *s = &p.second;
4684 if (s->flushing_caps_tids.empty())
4685 continue;
4686 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4687 if (oldest_tid <= want) {
4688 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4689 << " (want " << want << ")" << dendl;
4690 std::unique_lock l{client_lock, std::adopt_lock};
4691 sync_cond.wait(l);
4692 l.release();
4693 goto retry;
4694 }
4695 }
4696 }
4697
4698 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4699 {
4700 in->flags &= ~I_KICK_FLUSH;
4701
4702 Cap *cap = in->auth_cap;
4703 ceph_assert(cap->session == session);
4704
4705 ceph_tid_t last_snap_flush = 0;
4706 for (auto p = in->flushing_cap_tids.rbegin();
4707 p != in->flushing_cap_tids.rend();
4708 ++p) {
4709 if (!p->second) {
4710 last_snap_flush = p->first;
4711 break;
4712 }
4713 }
4714
4715 int wanted = in->caps_wanted();
4716 int used = get_caps_used(in) | in->caps_dirty();
4717 auto it = in->cap_snaps.begin();
4718 for (auto& p : in->flushing_cap_tids) {
4719 if (p.second) {
4720 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4721 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4722 p.second, p.first);
4723 } else {
4724 ceph_assert(it != in->cap_snaps.end());
4725 ceph_assert(it->second.flush_tid == p.first);
4726 send_flush_snap(in, session, it->first, it->second);
4727 ++it;
4728 }
4729 }
4730 }
4731
4732 void Client::kick_flushing_caps(MetaSession *session)
4733 {
4734 mds_rank_t mds = session->mds_num;
4735 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4736
4737 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4738 Inode *in = *p;
4739 if (in->flags & I_KICK_FLUSH) {
4740 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4741 kick_flushing_caps(in, session);
4742 }
4743 }
4744 }
4745
4746 void Client::early_kick_flushing_caps(MetaSession *session)
4747 {
4748 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4749 Inode *in = *p;
4750 Cap *cap = in->auth_cap;
4751 ceph_assert(cap);
4752
4753 // if flushing caps were revoked, we re-send the cap flush in the client
4754 // reconnect stage. This guarantees that the MDS processes the cap flush
4755 // message before issuing the flushing caps to other clients.
4756 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4757 in->flags |= I_KICK_FLUSH;
4758 continue;
4759 }
4760
4761 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4762 << " to mds." << session->mds_num << dendl;
4763 // send_reconnect() also will reset these sequence numbers. make sure
4764 // sequence numbers in cap flush message match later reconnect message.
4765 cap->seq = 0;
4766 cap->issue_seq = 0;
4767 cap->mseq = 0;
4768 cap->issued = cap->implemented;
4769
4770 kick_flushing_caps(in, session);
4771 }
4772 }
4773
4774 void SnapRealm::build_snap_context()
4775 {
4776 set<snapid_t> snaps;
4777 snapid_t max_seq = seq;
4778
4779 // start with prior_parents?
4780 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4781 snaps.insert(prior_parent_snaps[i]);
4782
4783 // current parent's snaps
4784 if (pparent) {
4785 const SnapContext& psnapc = pparent->get_snap_context();
4786 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4787 if (psnapc.snaps[i] >= parent_since)
4788 snaps.insert(psnapc.snaps[i]);
4789 if (psnapc.seq > max_seq)
4790 max_seq = psnapc.seq;
4791 }
4792
4793 // my snaps
4794 for (unsigned i=0; i<my_snaps.size(); i++)
4795 snaps.insert(my_snaps[i]);
4796
4797 // ok!
4798 cached_snap_context.seq = max_seq;
4799 cached_snap_context.snaps.resize(0);
4800 cached_snap_context.snaps.reserve(snaps.size());
4801 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4802 cached_snap_context.snaps.push_back(*p);
4803 }
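//
// Note: the reverse iteration above leaves cached_snap_context.snaps in
// descending order (newest snapshot first), which is the ordering that
// SnapContext consumers such as has_new_snaps() below depend on.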
4804
4805 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4806 {
4807 list<SnapRealm*> q;
4808 q.push_back(realm);
4809
4810 while (!q.empty()) {
4811 realm = q.front();
4812 q.pop_front();
4813
4814 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4815 realm->invalidate_cache();
4816
4817 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4818 p != realm->pchildren.end();
4819 ++p)
4820 q.push_back(*p);
4821 }
4822 }
4823
4824 SnapRealm *Client::get_snap_realm(inodeno_t r)
4825 {
4826 SnapRealm *realm = snap_realms[r];
4827 if (!realm)
4828 snap_realms[r] = realm = new SnapRealm(r);
4829 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4830 realm->nref++;
4831 return realm;
4832 }
4833
4834 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4835 {
4836 if (snap_realms.count(r) == 0) {
4837 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4838 return NULL;
4839 }
4840 SnapRealm *realm = snap_realms[r];
4841 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4842 realm->nref++;
4843 return realm;
4844 }
4845
4846 void Client::put_snap_realm(SnapRealm *realm)
4847 {
4848 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4849 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4850 if (--realm->nref == 0) {
4851 snap_realms.erase(realm->ino);
4852 if (realm->pparent) {
4853 realm->pparent->pchildren.erase(realm);
4854 put_snap_realm(realm->pparent);
4855 }
4856 delete realm;
4857 }
4858 }
4859
4860 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4861 {
4862 if (realm->parent != parent) {
4863 ldout(cct, 10) << __func__ << " " << *realm
4864 << " " << realm->parent << " -> " << parent << dendl;
4865 realm->parent = parent;
4866 if (realm->pparent) {
4867 realm->pparent->pchildren.erase(realm);
4868 put_snap_realm(realm->pparent);
4869 }
4870 realm->pparent = get_snap_realm(parent);
4871 realm->pparent->pchildren.insert(realm);
4872 return true;
4873 }
4874 return false;
4875 }
4876
4877 static bool has_new_snaps(const SnapContext& old_snapc,
4878 const SnapContext& new_snapc)
4879 {
4880 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4881 }
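//
// Since snapc.snaps is newest-first (see build_snap_context above),
// snaps[0] is the most recent snapshot; the new context contains a snap
// the old one never saw exactly when snaps[0] > old_snapc.seq.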
4882
4883
4884 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4885 {
4886 SnapRealm *first_realm = NULL;
4887 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4888
4889 map<SnapRealm*, SnapContext> dirty_realms;
4890
4891 auto p = bl.cbegin();
4892 while (!p.end()) {
4893 SnapRealmInfo info;
4894 decode(info, p);
4895 SnapRealm *realm = get_snap_realm(info.ino());
4896
4897 bool invalidate = false;
4898
4899 if (info.seq() > realm->seq) {
4900 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4901 << dendl;
4902
4903 if (flush) {
4904 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4905 // flush me + children
4906 list<SnapRealm*> q;
4907 q.push_back(realm);
4908 while (!q.empty()) {
4909 SnapRealm *realm = q.front();
4910 q.pop_front();
4911
4912 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4913 p != realm->pchildren.end();
4914 ++p)
4915 q.push_back(*p);
4916
4917 if (dirty_realms.count(realm) == 0) {
4918 realm->nref++;
4919 dirty_realms[realm] = realm->get_snap_context();
4920 }
4921 }
4922 }
4923
4924 // update
4925 realm->seq = info.seq();
4926 realm->created = info.created();
4927 realm->parent_since = info.parent_since();
4928 realm->prior_parent_snaps = info.prior_parent_snaps;
4929 realm->my_snaps = info.my_snaps;
4930 invalidate = true;
4931 }
4932
4933 // _always_ verify parent
4934 if (adjust_realm_parent(realm, info.parent()))
4935 invalidate = true;
4936
4937 if (invalidate) {
4938 invalidate_snaprealm_and_children(realm);
4939 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4940 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4941 } else {
4942 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4943 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4944 }
4945
4946 if (!first_realm)
4947 first_realm = realm;
4948 else
4949 put_snap_realm(realm);
4950 }
4951
4952 for (auto &[realm, snapc] : dirty_realms) {
4953 // if there are new snaps ?
4954 if (has_new_snaps(snapc, realm->get_snap_context())) {
4955 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4956 for (auto&& in : realm->inodes_with_caps) {
4957 queue_cap_snap(in, snapc);
4958 }
4959 } else {
4960 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4961 }
4962 put_snap_realm(realm);
4963 }
4964
4965 if (realm_ret)
4966 *realm_ret = first_realm;
4967 else
4968 put_snap_realm(first_realm);
4969 }
4970
4971 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4972 {
4973 ldout(cct, 10) << __func__ << " " << *m << dendl;
4974 mds_rank_t mds = mds_rank_t(m->get_source().num());
4975
4976 std::scoped_lock cl(client_lock);
4977 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4978 if (!session) {
4979 return;
4980 }
4981
4982 got_mds_push(session);
4983
4984 map<Inode*, SnapContext> to_move;
4985 SnapRealm *realm = 0;
4986
4987 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4988 ceph_assert(m->head.split);
4989 SnapRealmInfo info;
4990 auto p = m->bl.cbegin();
4991 decode(info, p);
4992 ceph_assert(info.ino() == m->head.split);
4993
4994 // flush, then move, inos.
4995 realm = get_snap_realm(info.ino());
4996 ldout(cct, 10) << " splitting off " << *realm << dendl;
4997 for (auto& ino : m->split_inos) {
4998 vinodeno_t vino(ino, CEPH_NOSNAP);
4999 if (inode_map.count(vino)) {
5000 Inode *in = inode_map[vino];
5001 if (!in->snaprealm || in->snaprealm == realm)
5002 continue;
5003 if (in->snaprealm->created > info.created()) {
5004 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5005 << *in->snaprealm << dendl;
5006 continue;
5007 }
5008 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5009
5010
5011 in->snaprealm_item.remove_myself();
5012 to_move[in] = in->snaprealm->get_snap_context();
5013 put_snap_realm(in->snaprealm);
5014 }
5015 }
5016
5017 // move child snaprealms, too
5018 for (auto& child_realm : m->split_realms) {
5019 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5020 SnapRealm *child = get_snap_realm_maybe(child_realm);
5021 if (!child)
5022 continue;
5023 adjust_realm_parent(child, realm->ino);
5024 put_snap_realm(child);
5025 }
5026 }
5027
5028 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5029
5030 if (realm) {
5031 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5032 Inode *in = p->first;
5033 in->snaprealm = realm;
5034 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5035 realm->nref++;
5036 // queue for snap writeback
5037 if (has_new_snaps(p->second, realm->get_snap_context()))
5038 queue_cap_snap(in, p->second);
5039 }
5040 put_snap_realm(realm);
5041 }
5042 }
5043
5044 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5045 {
5046 mds_rank_t mds = mds_rank_t(m->get_source().num());
5047
5048 std::scoped_lock cl(client_lock);
5049 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5050 if (!session) {
5051 return;
5052 }
5053
5054 got_mds_push(session);
5055
5056 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5057
5058 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5059 if (inode_map.count(vino)) {
5060 Inode *in = NULL;
5061 in = inode_map[vino];
5062
5063 if (in) {
5064 in->quota = m->quota;
5065 in->rstat = m->rstat;
5066 }
5067 }
5068 }
5069
5070 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5071 {
5072 mds_rank_t mds = mds_rank_t(m->get_source().num());
5073
5074 std::scoped_lock cl(client_lock);
5075 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5076 if (!session) {
5077 return;
5078 }
5079
5080 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5081 // Pause RADOS operations until we see the required epoch
5082 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5083 }
5084
5085 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5086 // Record the barrier so that we will transmit it to MDS when releasing
5087 set_cap_epoch_barrier(m->osd_epoch_barrier);
5088 }
5089
5090 got_mds_push(session);
5091
5092 Inode *in;
5093 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5094 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5095 in = it->second;
5096 } else {
5097 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5098 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5099 session->enqueue_cap_release(
5100 m->get_ino(),
5101 m->get_cap_id(),
5102 m->get_seq(),
5103 m->get_mseq(),
5104 cap_epoch_barrier);
5105 } else {
5106 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5107 }
5108
5109 // in case the mds is waiting on e.g. a revocation
5110 flush_cap_releases();
5111 return;
5112 }
5113
5114 switch (m->get_op()) {
5115 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
5116 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
5117 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
5118 }
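// Note: IMPORT deliberately falls through: handle_cap_import() above
// installs or updates the cap, then the switch below applies the newly
// granted caps via handle_cap_grant().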
5119
5120 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5121 Cap &cap = in->caps.at(mds);
5122
5123 switch (m->get_op()) {
5124 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
5125 case CEPH_CAP_OP_IMPORT:
5126 case CEPH_CAP_OP_REVOKE:
5127 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
5128 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
5129 }
5130 } else {
5131 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5132 return;
5133 }
5134 }
5135
5136 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5137 {
5138 mds_rank_t mds = session->mds_num;
5139
5140 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5141 << " IMPORT from mds." << mds << dendl;
5142
5143 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5144 Cap *cap = NULL;
5145 UserPerm cap_perms;
5146 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5147 cap = &it->second;
5148 cap_perms = cap->latest_perms;
5149 }
5150
5151 // add/update it
5152 SnapRealm *realm = NULL;
5153 update_snap_trace(m->snapbl, &realm);
5154
5155 int issued = m->get_caps();
5156 int wanted = m->get_wanted();
5157 add_update_cap(in, session, m->get_cap_id(),
5158 issued, wanted, m->get_seq(), m->get_mseq(),
5159 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5160
5161 if (cap && cap->cap_id == m->peer.cap_id) {
5162 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5163 }
5164
5165 if (realm)
5166 put_snap_realm(realm);
5167
5168 if (in->auth_cap && in->auth_cap->session == session) {
5169 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5170 in->requested_max_size > m->get_max_size()) {
5171 in->requested_max_size = 0;
5172 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5173 }
5174 // reflush any/all caps (if we are now the auth_cap)
5175 kick_flushing_caps(in, session);
5176 }
5177 }
5178
5179 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5180 {
5181 mds_rank_t mds = session->mds_num;
5182
5183 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5184 << " EXPORT from mds." << mds << dendl;
5185
5186 auto it = in->caps.find(mds);
5187 if (it != in->caps.end()) {
5188 Cap &cap = it->second;
5189 if (cap.cap_id == m->get_cap_id()) {
5190 if (m->peer.cap_id) {
5191 const auto peer_mds = mds_rank_t(m->peer.mds);
5192 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
5193 auto it = in->caps.find(peer_mds);
5194 if (it != in->caps.end()) {
5195 Cap &tcap = it->second;
5196 if (tcap.cap_id == m->peer.cap_id &&
5197 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5198 tcap.cap_id = m->peer.cap_id;
5199 tcap.seq = m->peer.seq - 1;
5200 tcap.issue_seq = tcap.seq;
5201 tcap.issued |= cap.issued;
5202 tcap.implemented |= cap.issued;
5203 if (&cap == in->auth_cap)
5204 in->auth_cap = &tcap;
5205 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5206 adjust_session_flushing_caps(in, session, tsession);
5207 }
5208 } else {
5209 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
5210 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5211 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5212 cap.latest_perms);
5213 }
5214 } else {
5215 if (cap.wanted | cap.issued)
5216 in->flags |= I_CAP_DROPPED;
5217 }
5218
5219 remove_cap(&cap, false);
5220 }
5221 }
5222 }
5223
5224 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5225 {
5226 mds_rank_t mds = session->mds_num;
5227 ceph_assert(in->caps.count(mds));
5228
5229 ldout(cct, 10) << __func__ << " on ino " << *in
5230 << " size " << in->size << " -> " << m->get_size()
5231 << dendl;
5232
5233 int issued;
5234 in->caps_issued(&issued);
5235 issued |= in->caps_dirty();
5236 update_inode_file_size(in, issued, m->get_size(),
5237 m->get_truncate_seq(), m->get_truncate_size());
5238 }
5239
5240 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5241 {
5242 ceph_tid_t flush_ack_tid = m->get_client_tid();
5243 int dirty = m->get_dirty();
5244 int cleaned = 0;
5245 int flushed = 0;
5246
5247 auto it = in->flushing_cap_tids.begin();
5248 if (it->first < flush_ack_tid) {
5249 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5250 << " got unexpected flush ack tid " << flush_ack_tid
5251 << " expected is " << it->first << dendl;
5252 }
5253 for (; it != in->flushing_cap_tids.end(); ) {
5254 if (!it->second) {
5255 // cap snap
5256 ++it;
5257 continue;
5258 }
5259 if (it->first == flush_ack_tid)
5260 cleaned = it->second;
5261 if (it->first <= flush_ack_tid) {
5262 session->flushing_caps_tids.erase(it->first);
5263 in->flushing_cap_tids.erase(it++);
5264 ++flushed;
5265 continue;
5266 }
5267 cleaned &= ~it->second;
5268 if (!cleaned)
5269 break;
5270 ++it;
5271 }
5272
5273 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5274 << " cleaned " << ccap_string(cleaned) << " on " << *in
5275 << " with " << ccap_string(dirty) << dendl;
5276
5277 if (flushed) {
5278 signal_cond_list(in->waitfor_caps);
5279 if (session->flushing_caps_tids.empty() ||
5280 *session->flushing_caps_tids.begin() > flush_ack_tid)
5281 sync_cond.notify_all();
5282 }
5283
5284 if (!dirty) {
5285 in->cap_dirtier_uid = -1;
5286 in->cap_dirtier_gid = -1;
5287 }
5288
5289 if (!cleaned) {
5290 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5291 } else {
5292 if (in->flushing_caps) {
5293 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5294 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5295 in->flushing_caps &= ~cleaned;
5296 if (in->flushing_caps == 0) {
5297 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5298 num_flushing_caps--;
5299 if (in->flushing_cap_tids.empty())
5300 in->flushing_cap_item.remove_myself();
5301 }
5302 if (!in->caps_dirty())
5303 put_inode(in);
5304 }
5305 }
5306 }
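//
// Worked example (illustrative): flushing_cap_tids = {5:Fw, 8:0, 9:Fx}
// and the ack for tid 5 arrives. tid 5 sets cleaned = Fw and is erased;
// tid 8 is skipped (a capsnap flush); tid 9 only covers Fx, so cleaned
// stays Fw and flushing_caps drops Fw. Had tid 9 carried Fw|Fx instead,
// cleaned would be masked down to 0: Fw would still be in flight under
// the later tid and must not be marked clean yet.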
5307
5308
5309 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5310 {
5311 ceph_tid_t flush_ack_tid = m->get_client_tid();
5312 mds_rank_t mds = session->mds_num;
5313 ceph_assert(in->caps.count(mds));
5314 snapid_t follows = m->get_snap_follows();
5315
5316 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5317 auto& capsnap = it->second;
5318 if (flush_ack_tid != capsnap.flush_tid) {
5319 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5320 } else {
5321 InodeRef tmp_ref(in);
5322 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5323 << " on " << *in << dendl;
5324 session->flushing_caps_tids.erase(capsnap.flush_tid);
5325 in->flushing_cap_tids.erase(capsnap.flush_tid);
5326 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5327 in->flushing_cap_item.remove_myself();
5328 in->cap_snaps.erase(it);
5329
5330 signal_cond_list(in->waitfor_caps);
5331 if (session->flushing_caps_tids.empty() ||
5332 *session->flushing_caps_tids.begin() > flush_ack_tid)
5333 sync_cond.notify_all();
5334 }
5335 } else {
5336 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5337 << " on " << *in << dendl;
5338 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5339 }
5340 }
5341
5342 class C_Client_DentryInvalidate : public Context {
5343 private:
5344 Client *client;
5345 vinodeno_t dirino;
5346 vinodeno_t ino;
5347 string name;
5348 public:
5349 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5350 client(c), name(dn->name) {
5351 if (client->use_faked_inos()) {
5352 dirino.ino = dn->dir->parent_inode->faked_ino;
5353 if (del)
5354 ino.ino = dn->inode->faked_ino;
5355 } else {
5356 dirino = dn->dir->parent_inode->vino();
5357 if (del)
5358 ino = dn->inode->vino();
5359 }
5360 if (!del)
5361 ino.ino = inodeno_t();
5362 }
5363 void finish(int r) override {
5364 // _async_dentry_invalidate is responsible for its own locking
5365 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5366 client->_async_dentry_invalidate(dirino, ino, name);
5367 }
5368 };
5369
5370 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5371 {
5372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5373 if (!mref_reader.is_state_satisfied())
5374 return;
5375
5376 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5377 << " in dir " << dirino << dendl;
5378 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5379 }
5380
5381 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5382 {
5383 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5384 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5385 }
5386
5387 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5388 {
5389 int ref = in->get_num_ref();
5390 ldout(cct, 5) << __func__ << " in " << *in << dendl;
5391
5392 if (in->dir && !in->dir->dentries.empty()) {
5393 for (auto p = in->dir->dentries.begin();
5394 p != in->dir->dentries.end(); ) {
5395 Dentry *dn = p->second;
5396 ++p;
5397 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5398 * we don't need to invalidate dentries recursively, because
5399 * invalidating a directory dentry effectively invalidates the
5400 * whole subtree */
5401 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5402 _try_to_trim_inode(dn->inode.get(), false);
5403
5404 if (dn->lru_is_expireable())
5405 unlink(dn, true, false); // keep dir, drop dentry
5406 }
5407 if (in->dir->dentries.empty()) {
5408 close_dir(in->dir);
5409 --ref;
5410 }
5411 }
5412
5413 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5414 InodeRef snapdir = open_snapdir(in);
5415 _try_to_trim_inode(snapdir.get(), false);
5416 --ref;
5417 }
5418
5419 if (ref > 0) {
5420 auto q = in->dentries.begin();
5421 while (q != in->dentries.end()) {
5422 Dentry *dn = *q;
5423 ++q;
5424 if (in->ll_ref > 0 && sched_inval) {
5425 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5426 // so in->dentries doesn't always reflect the state of kernel's dcache.
5427 _schedule_invalidate_dentry_callback(dn, true);
5428 }
5429 unlink(dn, true, true);
5430 }
5431 }
5432 }
5433
5434 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5435 {
5436 mds_rank_t mds = session->mds_num;
5437 int used = get_caps_used(in);
5438 int wanted = in->caps_wanted();
5439
5440 const unsigned new_caps = m->get_caps();
5441 const bool was_stale = session->cap_gen > cap->gen;
5442 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5443 << " mds." << mds << " seq " << m->get_seq()
5444 << " caps now " << ccap_string(new_caps)
5445 << " was " << ccap_string(cap->issued)
5446 << (was_stale ? " (stale)" : "") << dendl;
5447
5448 if (was_stale)
5449 cap->issued = cap->implemented = CEPH_CAP_PIN;
5450 cap->seq = m->get_seq();
5451 cap->gen = session->cap_gen;
5452
5453 check_cap_issue(in, new_caps);
5454
5455 // update inode
5456 int issued;
5457 in->caps_issued(&issued);
5458 issued |= in->caps_dirty();
5459
5460 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5461 !(issued & CEPH_CAP_AUTH_EXCL)) {
5462 in->mode = m->head.mode;
5463 in->uid = m->head.uid;
5464 in->gid = m->head.gid;
5465 in->btime = m->btime;
5466 }
5467 bool deleted_inode = false;
5468 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5469 !(issued & CEPH_CAP_LINK_EXCL)) {
5470 in->nlink = m->head.nlink;
5471 if (in->nlink == 0 &&
5472 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5473 deleted_inode = true;
5474 }
5475 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5476 m->xattrbl.length() &&
5477 m->head.xattr_version > in->xattr_version) {
5478 auto p = m->xattrbl.cbegin();
5479 decode(in->xattrs, p);
5480 in->xattr_version = m->head.xattr_version;
5481 }
5482
5483 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5484 in->dirstat.nfiles = m->get_nfiles();
5485 in->dirstat.nsubdirs = m->get_nsubdirs();
5486 }
5487
5488 if (new_caps & CEPH_CAP_ANY_RD) {
5489 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5490 m->get_ctime(), m->get_mtime(), m->get_atime());
5491 }
5492
5493 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5494 in->layout = m->get_layout();
5495 update_inode_file_size(in, issued, m->get_size(),
5496 m->get_truncate_seq(), m->get_truncate_size());
5497 }
5498
5499 if (m->inline_version > in->inline_version) {
5500 in->inline_data = m->inline_data;
5501 in->inline_version = m->inline_version;
5502 }
5503
5504 /* always take a newer change attr */
5505 if (m->get_change_attr() > in->change_attr)
5506 in->change_attr = m->get_change_attr();
5507
5508 // max_size
5509 if (cap == in->auth_cap &&
5510 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5511 (m->get_max_size() != in->max_size)) {
5512 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5513 in->max_size = m->get_max_size();
5514 if (in->max_size > in->wanted_max_size) {
5515 in->wanted_max_size = 0;
5516 in->requested_max_size = 0;
5517 }
5518 }
5519
5520 bool check = false;
5521 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5522 (wanted & ~(cap->wanted | new_caps))) {
5523 // If the mds is importing the cap, prior cap messages that update 'wanted'
5524 // may get dropped by the mds (migrate seq mismatch).
5525 //
5526 // We don't send a cap message to update 'wanted' if what we want is
5527 // already issued. If the mds revokes caps, the cap message that releases
5528 // caps also tells the mds what we want. But if caps were forcibly revoked
5529 // by the mds (stale session), we may not have told the mds what we want.
5530 check = true;
5531 }
5532
5533
5534 // update caps
5535 auto revoked = cap->issued & ~new_caps;
5536 if (revoked) {
5537 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5538 cap->issued = new_caps;
5539 cap->implemented |= new_caps;
5540
5541 // recall delegations if we're losing caps necessary for them
5542 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5543 in->recall_deleg(false);
5544 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5545 in->recall_deleg(true);
5546
5547 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5548 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5549 !_flush(in, new C_Client_FlushComplete(this, in))) {
5550 // waitin' for flush
5551 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5552 if (_release(in))
5553 check = true;
5554 } else {
5555 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5556 check = true;
5557 }
5558 } else if (cap->issued == new_caps) {
5559 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5560 } else {
5561 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5562 cap->issued = new_caps;
5563 cap->implemented |= new_caps;
5564
5565 if (cap == in->auth_cap) {
5566 // is a non-auth MDS revoking the newly granted caps?
5567 for (const auto &p : in->caps) {
5568 if (&p.second == cap)
5569 continue;
5570 if (p.second.implemented & ~p.second.issued & new_caps) {
5571 check = true;
5572 break;
5573 }
5574 }
5575 }
5576 }
5577
5578 if (check)
5579 check_caps(in, 0);
5580
5581 // wake up waiters
5582 if (new_caps)
5583 signal_cond_list(in->waitfor_caps);
5584
5585 // may drop inode's last ref
5586 if (deleted_inode)
5587 _try_to_trim_inode(in, true);
5588 }
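// Revocation-handling sketch for the grant path above: losing
// CEPH_CAP_FILE_BUFFER while buffered dirty data is in use kicks off an
// async _flush() and the ack is deferred to C_Client_FlushComplete;
// losing CEPH_CAP_FILE_CACHE drops clean cached data via _release();
// otherwise check_caps() acks the revocation immediately so the MDS is
// not left waiting on this client.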
5589
5590 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5591 {
5592 if (perms.uid() == 0)
5593 return 0;
5594
5595 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5596 int ret = _posix_acl_permission(in, perms, want);
5597 if (ret != -CEPHFS_EAGAIN)
5598 return ret;
5599 }
5600
5601 // check permissions before doing anything else
5602 if (!in->check_mode(perms, want))
5603 return -CEPHFS_EACCES;
5604 return 0;
5605 }
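// A worked sketch of the check order above, with hypothetical values: for
// a non-root caller whose uid differs from the inode owner on an inode
// with any group bits set (in->mode & S_IRWXG), the POSIX ACL is consulted
// first; only when the ACL check answers -CEPHFS_EAGAIN (no applicable
// entry) do we fall back to the classic check_mode() bit test:
//
//   UserPerm perms(1000, 1000);                        // hypothetical caller
//   int r = inode_permission(in, perms, MAY_READ | MAY_EXEC);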
5606
5607 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5608 const UserPerm& perms)
5609 {
5610 int r = _getattr_for_perm(in, perms);
5611 if (r < 0)
5612 goto out;
5613
5614 r = 0;
5615 if (strncmp(name, "system.", 7) == 0) {
5616 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5617 r = -CEPHFS_EPERM;
5618 } else {
5619 r = inode_permission(in, perms, want);
5620 }
5621 out:
5622 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5623 return r;
5624 }
5625
5626 ostream& operator<<(ostream &out, const UserPerm& perm) {
5627 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5628 return out;
5629 }
5630
5631 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5632 const UserPerm& perms)
5633 {
5634 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5635 int r = _getattr_for_perm(in, perms);
5636 if (r < 0)
5637 goto out;
5638
5639 if (mask & CEPH_SETATTR_SIZE) {
5640 r = inode_permission(in, perms, MAY_WRITE);
5641 if (r < 0)
5642 goto out;
5643 }
5644
5645 r = -CEPHFS_EPERM;
5646 if (mask & CEPH_SETATTR_UID) {
5647 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5648 goto out;
5649 }
5650 if (mask & CEPH_SETATTR_GID) {
5651 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5652 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5653 goto out;
5654 }
5655
5656 if (mask & CEPH_SETATTR_MODE) {
5657 if (perms.uid() != 0 && perms.uid() != in->uid)
5658 goto out;
5659
5660 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5661 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5662 stx->stx_mode &= ~S_ISGID;
5663 }
5664
5665 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5666 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5667 if (perms.uid() != 0 && perms.uid() != in->uid) {
5668 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5669 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5670 check_mask |= CEPH_SETATTR_MTIME;
5671 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5672 check_mask |= CEPH_SETATTR_ATIME;
5673 if (check_mask & mask) {
5674 goto out;
5675 } else {
5676 r = inode_permission(in, perms, MAY_WRITE);
5677 if (r < 0)
5678 goto out;
5679 }
5680 }
5681 }
5682 r = 0;
5683 out:
5684 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5685 return r;
5686 }
5687
5688 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5689 {
5690 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5691 unsigned want = 0;
5692
5693 if ((flags & O_ACCMODE) == O_WRONLY)
5694 want = MAY_WRITE;
5695 else if ((flags & O_ACCMODE) == O_RDWR)
5696 want = MAY_READ | MAY_WRITE;
5697 else if ((flags & O_ACCMODE) == O_RDONLY)
5698 want = MAY_READ;
5699 if (flags & O_TRUNC)
5700 want |= MAY_WRITE;
5701
5702 int r = 0;
5703 switch (in->mode & S_IFMT) {
5704 case S_IFLNK:
5705 r = -CEPHFS_ELOOP;
5706 goto out;
5707 case S_IFDIR:
5708 if (want & MAY_WRITE) {
5709 r = -CEPHFS_EISDIR;
5710 goto out;
5711 }
5712 break;
5713 }
5714
5715 r = _getattr_for_perm(in, perms);
5716 if (r < 0)
5717 goto out;
5718
5719 r = inode_permission(in, perms, want);
5720 out:
5721 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5722 return r;
5723 }
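// Flag-mapping sketch (hypothetical call): open(O_RDWR | O_TRUNC) maps to
// want = MAY_READ | MAY_WRITE, so on a directory it fails early with
// -CEPHFS_EISDIR, and on a symlink inode it returns -CEPHFS_ELOOP before
// any permission bits are consulted:
//
//   int r = may_open(in, O_RDWR | O_TRUNC, perms);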
5724
5725 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5726 {
5727 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5728 int r = _getattr_for_perm(dir, perms);
5729 if (r < 0)
5730 goto out;
5731
5732 r = inode_permission(dir, perms, MAY_EXEC);
5733 out:
5734 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5735 return r;
5736 }
5737
5738 int Client::may_create(Inode *dir, const UserPerm& perms)
5739 {
5740 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5741 int r = _getattr_for_perm(dir, perms);
5742 if (r < 0)
5743 goto out;
5744
5745 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5746 out:
5747 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5748 return r;
5749 }
5750
5751 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5752 {
5753 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5754 int r = _getattr_for_perm(dir, perms);
5755 if (r < 0)
5756 goto out;
5757
5758 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5759 if (r < 0)
5760 goto out;
5761
5762 /* 'name == NULL' means rmsnap w/o permission checks */
5763 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5764 InodeRef otherin;
5765 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5766 if (r < 0)
5767 goto out;
5768 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5769 r = -CEPHFS_EPERM;
5770 }
5771 out:
5772 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5773 return r;
5774 }
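// Sticky-bit sketch: in a directory with S_ISVTX set (/tmp-style
// semantics), a non-root caller may only unlink entries when it owns
// either the directory or the entry itself; e.g. uid 1000 removing uid
// 2000's file from a sticky directory owned by root gets -CEPHFS_EPERM.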
5775
5776 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5777 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5778
5779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5780 if (!mref_reader.is_state_satisfied())
5781 return -CEPHFS_ENOTCONN;
5782
5783 filepath path(relpath);
5784 string name = path.last_dentry();
5785 path.pop_dentry();
5786 InodeRef dir;
5787
5788 std::scoped_lock lock(client_lock);
5789 int r = path_walk(path, &dir, perms);
5790 if (r < 0)
5791 return r;
5792 if (cct->_conf->client_permissions) {
5793 int r = may_delete(dir.get(), name.c_str(), perms);
5794 if (r < 0)
5795 return r;
5796 }
5797
5798 return 0;
5799 }
5800
5801 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5802 {
5803 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5804 int r = _getattr_for_perm(in, perms);
5805 if (r < 0)
5806 goto out;
5807
5808 if (perms.uid() == 0 || perms.uid() == in->uid) {
5809 r = 0;
5810 goto out;
5811 }
5812
5813 r = -CEPHFS_EPERM;
5814 if (!S_ISREG(in->mode))
5815 goto out;
5816
5817 if (in->mode & S_ISUID)
5818 goto out;
5819
5820 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5821 goto out;
5822
5823 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5824 out:
5825 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5826 return r;
5827 }
5828
5829 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5830 {
5831 int mask = CEPH_STAT_CAP_MODE;
5832 bool force = false;
5833 if (acl_type != NO_ACL) {
5834 mask |= CEPH_STAT_CAP_XATTR;
5835 force = in->xattr_version == 0;
5836 }
5837 return _getattr(in, mask, perms, force);
5838 }
5839
5840 vinodeno_t Client::_get_vino(Inode *in)
5841 {
5842 /* The caller must hold the client lock */
5843 return vinodeno_t(in->ino, in->snapid);
5844 }
5845
5846 /**
5847 * Resolve an MDS spec to a list of MDS daemon GIDs.
5848 *
5849 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5850 * It may be '*' in which case it matches all GIDs.
5851 *
5852 * If no error is returned, the `targets` vector will be populated with at least
5853 * one MDS.
5854 */
5855 int Client::resolve_mds(
5856 const std::string &mds_spec,
5857 std::vector<mds_gid_t> *targets)
5858 {
5859 ceph_assert(fsmap);
5860 ceph_assert(targets != nullptr);
5861
5862 mds_role_t role;
5863 CachedStackStringStream css;
5864 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5865 if (role_r == 0) {
5866 // We got a role, resolve it to a GID
5867 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5868 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5869 << role << "' aka " << info.human_name() << dendl;
5870 targets->push_back(info.global_id);
5871 return 0;
5872 }
5873
5874 std::string strtol_err;
5875 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5876 if (strtol_err.empty()) {
5877 // It is a possible GID
5878 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5879 if (fsmap->gid_exists(mds_gid)) {
5880 auto& info = fsmap->get_info_gid(mds_gid);
5881 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5882 << info.human_name() << dendl;
5883 targets->push_back(mds_gid);
5884 return 0;
5885 } else {
5886 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5887 << dendl;
5888 lderr(cct) << "FSMap: " << *fsmap << dendl;
5889 return -CEPHFS_ENOENT;
5890 }
5891 } else if (mds_spec == "*") {
5892 // It is a wildcard: use all MDSs
5893 const auto& mds_info = fsmap->get_mds_info();
5894
5895 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5896 if (mds_info.empty()) {
5897 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5898 lderr(cct) << "FSMap: " << *fsmap << dendl;
5899 return -CEPHFS_ENOENT;
5900 }
5901
5902 for (const auto& [gid, info] : mds_info) {
5903 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
5904 targets->push_back(gid);
5905 }
5906 return 0;
5907 } else {
5908 // It did not parse as an integer, it is not a wildcard, it must be a name
5909 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5910 if (mds_gid == 0) {
5911 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
5912 lderr(cct) << "FSMap: " << *fsmap << dendl;
5913 return -CEPHFS_ENOENT;
5914 } else {
5915 auto& info = fsmap->get_info_gid(mds_gid);
5916 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
5917 << "' to " << info.human_name() << dendl;
5918 targets->push_back(mds_gid);
5919 }
5920 return 0;
5921 }
5922 }
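// Spec-resolution sketch with hypothetical inputs: "cephfs:0" (or a bare
// rank) resolves through the role parser; "4123" is treated as a GID if
// one exists; "*" expands to every MDS in the FSMap; anything else is
// looked up as a daemon name:
//
//   std::vector<mds_gid_t> targets;
//   int r = resolve_mds("*", &targets);  // 0 on success, -CEPHFS_ENOENT if none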
5923
5924
5925 /**
5926 * Authenticate with mon and establish global ID
5927 */
5928 int Client::authenticate()
5929 {
5930 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5931
5932 if (monclient->is_authenticated()) {
5933 return 0;
5934 }
5935
5936 client_lock.unlock();
5937 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5938 client_lock.lock();
5939 if (r < 0) {
5940 return r;
5941 }
5942
5943 whoami = monclient->get_global_id();
5944 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5945
5946 return 0;
5947 }
5948
5949 int Client::fetch_fsmap(bool user)
5950 {
5951 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5952
5953 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5954 // rather than MDSMap because no one MDSMap contains all the daemons, and
5955 // a `tell` can address any daemon.
5956 version_t fsmap_latest;
5957 bs::error_code ec;
5958 do {
5959 client_lock.unlock();
5960 std::tie(fsmap_latest, std::ignore) =
5961 monclient->get_version("fsmap", ca::use_blocked[ec]);
5962 client_lock.lock();
5963 } while (ec == bs::errc::resource_unavailable_try_again);
5964
5965 if (ec) {
5966 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
5967 return ceph::from_error_code(ec);
5968 }
5969
5970 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5971
5972 if (user) {
5973 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5974 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5975 monclient->renew_subs();
5976 wait_on_list(waiting_for_fsmap);
5977 }
5978 ceph_assert(fsmap_user);
5979 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5980 } else {
5981 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5982 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5983 monclient->renew_subs();
5984 wait_on_list(waiting_for_fsmap);
5985 }
5986 ceph_assert(fsmap);
5987 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5988 }
5989 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5990 << fsmap_latest << dendl;
5991 return 0;
5992 }
5993
5994 /**
5995 * Send a command to MDS daemons matching the spec.
5996 *
5997 * @mds_spec one of ID, rank, GID, "*"
5998 */
5999 int Client::mds_command(
6000 const std::string &mds_spec,
6001 const vector<string>& cmd,
6002 const bufferlist& inbl,
6003 bufferlist *outbl,
6004 string *outs,
6005 Context *onfinish)
6006 {
6007 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6008 if (!iref_reader.is_state_satisfied())
6009 return -CEPHFS_ENOTCONN;
6010
6011 std::unique_lock cl(client_lock);
6012
6013 int r;
6014 r = authenticate();
6015 if (r < 0) {
6016 return r;
6017 }
6018
6019 r = fetch_fsmap(false);
6020 if (r < 0) {
6021 return r;
6022 }
6023
6024 // Look up MDS target(s) of the command
6025 std::vector<mds_gid_t> targets;
6026 r = resolve_mds(mds_spec, &targets);
6027 if (r < 0) {
6028 return r;
6029 }
6030
6031 // If daemons are laggy, we won't send them commands. If all
6032 // are laggy then we fail.
6033 std::vector<mds_gid_t> non_laggy;
6034 for (const auto& gid : targets) {
6035 const auto info = fsmap->get_info_gid(gid);
6036 if (!info.laggy()) {
6037 non_laggy.push_back(gid);
6038 }
6039 }
6040 if (non_laggy.empty()) {
6041 *outs = "All targeted MDS daemons are laggy";
6042 return -CEPHFS_ENOENT;
6043 }
6044
6045 if (metadata.empty()) {
6046 // We are called on an unmounted client, so metadata
6047 // won't be initialized yet.
6048 populate_metadata("");
6049 }
6050
6051 // Send commands to targets
6052 C_GatherBuilder gather(cct, onfinish);
6053 for (const auto& target_gid : non_laggy) {
6054 const auto info = fsmap->get_info_gid(target_gid);
6055
6056 // Open a connection to the target MDS
6057 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6058
6059 cl.unlock();
6060 {
6061 std::scoped_lock cmd_lock(command_lock);
6062 // Generate MDSCommandOp state
6063 auto &op = command_table.start_command();
6064
6065 op.on_finish = gather.new_sub();
6066 op.cmd = cmd;
6067 op.outbl = outbl;
6068 op.outs = outs;
6069 op.inbl = inbl;
6070 op.mds_gid = target_gid;
6071 op.con = conn;
6072
6073 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6074 << " tid=" << op.tid << cmd << dendl;
6075
6076 // Construct and send MCommand
6077 MessageRef m = op.get_message(monclient->get_fsid());
6078 conn->send_message2(std::move(m));
6079 }
6080 cl.lock();
6081 }
6082 gather.activate();
6083
6084 return 0;
6085 }
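// From user space this path is normally reached through the libcephfs
// wrapper; a minimal sketch (error handling and buffer cleanup elided):
//
//   const char *cmd[] = { "{\"prefix\": \"session ls\"}" };
//   char *outb = NULL, *outs = NULL;
//   size_t outb_len = 0, outs_len = 0;
//   int r = ceph_mds_command(cmount, "*", cmd, 1, NULL, 0,
//                            &outb, &outb_len, &outs, &outs_len);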
6086
6087 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6088 {
6089 ceph_tid_t const tid = m->get_tid();
6090
6091 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6092
6093 std::scoped_lock cmd_lock(command_lock);
6094 if (!command_table.exists(tid)) {
6095 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6096 return;
6097 }
6098
6099 auto &op = command_table.get_command(tid);
6100 if (op.outbl) {
6101 *op.outbl = m->get_data();
6102 }
6103 if (op.outs) {
6104 *op.outs = m->rs;
6105 }
6106
6107 if (op.on_finish) {
6108 op.on_finish->complete(m->r);
6109 }
6110
6111 command_table.erase(tid);
6112 }
6113
6114 // -------------------
6115 // MOUNT
6116
6117 int Client::subscribe_mdsmap(const std::string &fs_name)
6118 {
6119 int r = authenticate();
6120 if (r < 0) {
6121 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6122 return r;
6123 }
6124
6125 std::string resolved_fs_name;
6126 if (fs_name.empty()) {
6127 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6128 if (resolved_fs_name.empty())
6129 // Try the backwards compatibility fs name option
6130 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6131 } else {
6132 resolved_fs_name = fs_name;
6133 }
6134
6135 std::string want = "mdsmap";
6136 if (!resolved_fs_name.empty()) {
6137 r = fetch_fsmap(true);
6138 if (r < 0)
6139 return r;
6140 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6141 if (fscid == FS_CLUSTER_ID_NONE) {
6142 return -CEPHFS_ENOENT;
6143 }
6144
6145 std::ostringstream oss;
6146 oss << want << "." << fscid;
6147 want = oss.str();
6148 }
6149 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6150
6151 monclient->sub_want(want, 0, 0);
6152 monclient->renew_subs();
6153
6154 return 0;
6155 }
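// Subscription-key sketch: with client_fs (or the legacy
// client_mds_namespace option) naming a filesystem whose fscid is 1, the
// monitor subscription becomes "mdsmap.1"; with no filesystem name it
// stays the plain "mdsmap".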
6156
6157 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6158 bool require_mds, const std::string &fs_name)
6159 {
6160 ceph_assert(is_initialized());
6161
6162 /*
6163 * Ensure that _unmount() waits until this mount() has
6164 * finished.
6165 */
6166 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6167 if (!mref_writer.is_first_writer()) // already mounting or mounted
6168 return 0;
6169
6170 std::unique_lock cl(client_lock);
6171
6172 int r = subscribe_mdsmap(fs_name);
6173 if (r < 0) {
6174 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6175 return r;
6176 }
6177
6178 start_tick_thread(); // start tick thread
6179
6180 if (require_mds) {
6181 while (1) {
6182 auto availability = mdsmap->is_cluster_available();
6183 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6184 // Error out
6185 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6186 return CEPH_FUSE_NO_MDS_UP;
6187 } else if (availability == MDSMap::AVAILABLE) {
6188 // Continue to mount
6189 break;
6190 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6191 // Else, wait. MDSMonitor will update the map to bring
6192 // us to a conclusion eventually.
6193 wait_on_list(waiting_for_mdsmap);
6194 } else {
6195 // Unexpected value!
6196 ceph_abort();
6197 }
6198 }
6199 }
6200
6201 populate_metadata(mount_root.empty() ? "/" : mount_root);
6202
6203 filepath fp(CEPH_INO_ROOT);
6204 if (!mount_root.empty()) {
6205 fp = filepath(mount_root.c_str());
6206 }
6207 while (true) {
6208 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6209 req->set_filepath(fp);
6210 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6211 int res = make_request(req, perms);
6212 if (res < 0) {
6213 if (res == -CEPHFS_EACCES && root) {
6214 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6215 break;
6216 }
6217 return res;
6218 }
6219
6220 if (fp.depth())
6221 fp.pop_dentry();
6222 else
6223 break;
6224 }
6225
6226 ceph_assert(root);
6227 _ll_get(root);
6228
6229 // trace?
6230 if (!cct->_conf->client_trace.empty()) {
6231 traceout.open(cct->_conf->client_trace.c_str());
6232 if (traceout.is_open()) {
6233 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6234 } else {
6235 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6236 }
6237 }
6238
6239 /*
6240 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6241 ldout(cct, 3) << "op: struct stat st;" << dendl;
6242 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6243 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6244 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6245 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6246 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6247 ldout(cct, 3) << "op: int fd;" << dendl;
6248 */
6249
6250 mref_writer.update_state(CLIENT_MOUNTED);
6251 return 0;
6252 }
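// Mount usage sketch (hypothetical caller, named filesystem):
//
//   UserPerm perms(0, 0);
//   int r = client->mount("/", perms, /*require_mds=*/true, "cephfs");
//   if (r == 0) {
//     // mounted; the root inode is pinned via _ll_get() above
//   }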
6253
6254 // UNMOUNT
6255
6256 void Client::_close_sessions()
6257 {
6258 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6259 if (it->second.state == MetaSession::STATE_REJECTED)
6260 mds_sessions.erase(it++);
6261 else
6262 ++it;
6263 }
6264
6265 while (!mds_sessions.empty()) {
6266 // send session closes!
6267 for (auto &p : mds_sessions) {
6268 if (p.second.state != MetaSession::STATE_CLOSING) {
6269 _close_mds_session(&p.second);
6270 mds_ranks_closing.insert(p.first);
6271 }
6272 }
6273
6274 // wait for sessions to close
6275 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6276 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6277 << timo << "s)" << dendl;
6278 std::unique_lock l{client_lock, std::adopt_lock};
6279 if (!timo) {
6280 mount_cond.wait(l);
6281 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6282 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6283 while (!mds_ranks_closing.empty()) {
6284 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6285 // this prunes entry from mds_sessions and mds_ranks_closing
6286 _closed_mds_session(&session, -CEPHFS_ETIMEDOUT);
6287 }
6288 }
6289
6290 mds_ranks_closing.clear();
6291 l.release();
6292 }
6293 }
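// Shutdown-timeout sketch: client_shutdown_timeout (seconds) bounds the
// wait above; e.g. a value of 30 force-closes unresponsive sessions with
// -CEPHFS_ETIMEDOUT after 30s, while 0 waits indefinitely.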
6294
6295 void Client::flush_mdlog_sync()
6296 {
6297 if (mds_requests.empty())
6298 return;
6299 for (auto &p : mds_sessions) {
6300 flush_mdlog(&p.second);
6301 }
6302 }
6303
6304 void Client::flush_mdlog(MetaSession *session)
6305 {
6306 // Only send this to Luminous or newer MDS daemons, older daemons
6307 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6308 const uint64_t features = session->con->get_features();
6309 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6310 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6311 session->con->send_message2(std::move(m));
6312 }
6313 }
6314
6315
6316 void Client::_abort_mds_sessions(int err)
6317 {
6318 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6319 auto req = p->second;
6320 ++p;
6321 // unsafe requests will be removed during close session below.
6322 if (req->got_unsafe)
6323 continue;
6324
6325 req->abort(err);
6326 if (req->caller_cond) {
6327 req->kick = true;
6328 req->caller_cond->notify_all();
6329 }
6330 }
6331
6332 // Process aborts on any requests that were on this waitlist.
6333 // Any requests that were on a waiting_for_open session waitlist
6334 // will get kicked during close session below.
6335 signal_cond_list(waiting_for_mdsmap);
6336
6337 // Force-close all sessions
6338 while(!mds_sessions.empty()) {
6339 auto& session = mds_sessions.begin()->second;
6340 _closed_mds_session(&session, err);
6341 }
6342 }
6343
6344 void Client::_unmount(bool abort)
6345 {
6346 /*
6347 * We are unmounting the client.
6348 *
6349 * Declare the state to be CLIENT_UNMOUNTING to block and fail
6350 * any newly arriving "readers", then wait for all the in-flight
6351 * "readers" to finish.
6352 */
6353 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6354 if (!mref_writer.is_first_writer())
6355 return;
6356 mref_writer.wait_readers_done();
6357
6358 std::unique_lock lock{client_lock};
6359
6360 if (abort || blocklisted) {
6361 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6362 } else {
6363 ldout(cct, 2) << "unmounting" << dendl;
6364 }
6365
6366 deleg_timeout = 0;
6367
6368 if (abort) {
6369 mount_aborted = true;
6370 // Abort all mds sessions
6371 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6372
6373 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6374 } else {
6375 // flush the mdlog for pending requests, if any
6376 flush_mdlog_sync();
6377 }
6378
6379 mount_cond.wait(lock, [this] {
6380 if (!mds_requests.empty()) {
6381 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6382 << dendl;
6383 }
6384 return mds_requests.empty();
6385 });
6386
6387 cwd.reset();
6388
6389 // clean up any unclosed files
6390 while (!fd_map.empty()) {
6391 Fh *fh = fd_map.begin()->second;
6392 fd_map.erase(fd_map.begin());
6393 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6394 _release_fh(fh);
6395 }
6396
6397 while (!ll_unclosed_fh_set.empty()) {
6398 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6399 Fh *fh = *it;
6400 ll_unclosed_fh_set.erase(fh);
6401 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6402 _release_fh(fh);
6403 }
6404
6405 while (!opened_dirs.empty()) {
6406 dir_result_t *dirp = *opened_dirs.begin();
6407 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6408 _closedir(dirp);
6409 }
6410
6411 _ll_drop_pins();
6412
6413 if (cct->_conf->client_oc) {
6414 // flush/release all buffered data
6415 std::list<InodeRef> anchor;
6416 for (auto& p : inode_map) {
6417 Inode *in = p.second;
6418 if (!in) {
6419 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6420 ceph_assert(in);
6421 }
6422
6423 // prevent inode from getting freed
6424 anchor.emplace_back(in);
6425
6426 if (abort || blocklisted) {
6427 objectcacher->purge_set(&in->oset);
6428 } else if (!in->caps.empty()) {
6429 _release(in);
6430 _flush(in, new C_Client_FlushComplete(this, in));
6431 }
6432 }
6433 }
6434
6435 if (abort || blocklisted) {
6436 for (auto p = dirty_list.begin(); !p.end(); ) {
6437 Inode *in = *p;
6438 ++p;
6439 if (in->dirty_caps) {
6440 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6441 in->mark_caps_clean();
6442 put_inode(in);
6443 }
6444 }
6445 } else {
6446 flush_caps_sync();
6447 wait_sync_caps(last_flush_tid);
6448 }
6449
6450 // empty lru cache
6451 trim_cache();
6452
6453 delay_put_inodes();
6454
6455 while (lru.lru_get_size() > 0 ||
6456 !inode_map.empty()) {
6457 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6458 << "+" << inode_map.size() << " items"
6459 << ", waiting (for caps to release?)"
6460 << dendl;
6461
6462 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6463 r == std::cv_status::timeout) {
6464 dump_cache(NULL);
6465 }
6466 }
6467 ceph_assert(lru.lru_get_size() == 0);
6468 ceph_assert(inode_map.empty());
6469
6470 // stop tracing
6471 if (!cct->_conf->client_trace.empty()) {
6472 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6473 traceout.close();
6474 }
6475
6476 // stop the tick thread
6477 tick_thread_stopped = true;
6478 upkeep_cond.notify_one();
6479
6480 _close_sessions();
6481
6482 mref_writer.update_state(CLIENT_UNMOUNTED);
6483
6484 ldout(cct, 2) << "unmounted." << dendl;
6485 }
6486
6487 void Client::unmount()
6488 {
6489 _unmount(false);
6490 }
6491
6492 void Client::abort_conn()
6493 {
6494 _unmount(true);
6495 }
6496
6497 void Client::flush_cap_releases()
6498 {
6499 uint64_t nr_caps = 0;
6500
6501 // send any cap releases
6502 for (auto &p : mds_sessions) {
6503 auto &session = p.second;
6504 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6505 p.first)) {
6506 nr_caps += session.release->caps.size();
6507 if (cct->_conf->client_inject_release_failure) {
6508 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6509 } else {
6510 session.con->send_message2(std::move(session.release));
6511 }
6512 session.release.reset();
6513 }
6514 }
6515
6516 if (nr_caps > 0) {
6517 dec_pinned_icaps(nr_caps);
6518 }
6519 }
6520
6521 void Client::renew_and_flush_cap_releases()
6522 {
6523 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6524
6525 if (!mount_aborted && mdsmap->get_epoch()) {
6526 // renew caps?
6527 utime_t el = ceph_clock_now() - last_cap_renew;
6528 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6529 renew_caps();
6530
6531 flush_cap_releases();
6532 }
6533 }
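// Renewal-cadence sketch: renewals are sent once more than a third of the
// MDS session timeout has elapsed, so with a 60s mds_session_timeout caps
// are renewed roughly every 20s while the mount is healthy.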
6534
6535 void Client::tick()
6536 {
6537 ldout(cct, 20) << "tick" << dendl;
6538
6539 utime_t now = ceph_clock_now();
6540
6541 /*
6542 * If the mount() has not finished yet, time out stuck requests
6543 */
6544 if (is_mounting() && !mds_requests.empty()) {
6545 MetaRequest *req = mds_requests.begin()->second;
6546
6547 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6548 req->abort(-CEPHFS_ETIMEDOUT);
6549 if (req->caller_cond) {
6550 req->kick = true;
6551 req->caller_cond->notify_all();
6552 }
6553 signal_cond_list(waiting_for_mdsmap);
6554 for (auto &p : mds_sessions) {
6555 signal_context_list(p.second.waiting_for_open);
6556 }
6557 }
6558 }
6559
6560 renew_and_flush_cap_releases();
6561
6562 // delayed caps
6563 xlist<Inode*>::iterator p = delayed_list.begin();
6564 while (!p.end()) {
6565 Inode *in = *p;
6566 ++p;
6567 if (!mount_aborted && in->hold_caps_until > now)
6568 break;
6569 delayed_list.pop_front();
6570 if (!mount_aborted)
6571 check_caps(in, CHECK_CAPS_NODELAY);
6572 }
6573
6574 if (!mount_aborted)
6575 collect_and_send_metrics();
6576
6577 delay_put_inodes(is_unmounting());
6578 trim_cache(true);
6579
6580 if (blocklisted && (is_mounted() || is_unmounting()) &&
6581 last_auto_reconnect + 30 * 60 < now &&
6582 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6583 messenger->client_reset();
6584 fd_gen++; // invalidate open files
6585 blocklisted = false;
6586 _kick_stale_sessions();
6587 last_auto_reconnect = now;
6588 }
6589 }
6590
6591 void Client::start_tick_thread()
6592 {
6593 upkeeper = std::thread([this]() {
6594 using time = ceph::coarse_mono_time;
6595 using sec = std::chrono::seconds;
6596
6597 auto last_tick = time::min();
6598
6599 std::unique_lock cl(client_lock);
6600 while (!tick_thread_stopped) {
6601 auto now = clock::now();
6602 auto since = now - last_tick;
6603
6604 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6605 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6606
6607 auto interval = std::max(t_interval, d_interval);
6608 if (likely(since >= interval * .90)) {
6609 tick();
6610 last_tick = clock::now();
6611 } else {
6612 interval -= since;
6613 }
6614
6615 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6616 if (!tick_thread_stopped)
6617 upkeep_cond.wait_for(cl, interval);
6618 }
6619 });
6620 }
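// Tick-interval sketch: the effective period is
// max(client_tick_interval, client_debug_inject_tick_delay), so with a 1s
// tick interval and no injected delay the loop ticks about once per
// second; injecting a larger debug delay stretches the period to match.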
6621
6622 void Client::collect_and_send_metrics() {
6623 ldout(cct, 20) << __func__ << dendl;
6624
6625 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6626
6627 // right now, we only track and send global metrics. it's sufficient
6628 // to send these metrics to MDS rank 0.
6629 collect_and_send_global_metrics();
6630 }
6631
6632 void Client::collect_and_send_global_metrics() {
6633 ldout(cct, 20) << __func__ << dendl;
6634 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6635
6636 if (!have_open_session((mds_rank_t)0)) {
6637 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6638 << dendl;
6639 return;
6640 }
6641 auto session = _get_or_open_mds_session((mds_rank_t)0);
6642 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6643 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6644 return;
6645 }
6646
6647 ClientMetricMessage metric;
6648 std::vector<ClientMetricMessage> message;
6649
6650 // read latency
6651 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6652 message.push_back(metric);
6653
6654 // write latency
6655 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6656 message.push_back(metric);
6657
6658 // metadata latency
6659 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6660 message.push_back(metric);
6661
6662 // cap hit ratio -- nr_caps is unused right now
6663 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6664 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6665 message.push_back(metric);
6666
6667 // dentry lease hit ratio
6668 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6669 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6670 message.push_back(metric);
6671
6672 // opened files
6673 {
6674 auto [opened_files, total_inodes] = get_opened_files_rates();
6675 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6676 }
6677 message.push_back(metric);
6678
6679 // pinned i_caps
6680 {
6681 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6682 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6683 }
6684 message.push_back(metric);
6685
6686 // opened inodes
6687 {
6688 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6689 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6690 }
6691 message.push_back(metric);
6692
6693 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6694 }
6695
6696 void Client::renew_caps()
6697 {
6698 ldout(cct, 10) << "renew_caps()" << dendl;
6699 last_cap_renew = ceph_clock_now();
6700
6701 for (auto &p : mds_sessions) {
6702 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6703 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6704 renew_caps(&p.second);
6705 }
6706 }
6707
6708 void Client::renew_caps(MetaSession *session)
6709 {
6710 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6711 session->last_cap_renew_request = ceph_clock_now();
6712 uint64_t seq = ++session->cap_renew_seq;
6713 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6714 }
6715
6716
6717 // ===============================================================
6718 // high level (POSIXy) interface
6719
6720 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6721 InodeRef *target, const UserPerm& perms)
6722 {
6723 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6724 MetaRequest *req = new MetaRequest(op);
6725 filepath path;
6726 dir->make_nosnap_relative_path(path);
6727 path.push_dentry(name);
6728 req->set_filepath(path);
6729 req->set_inode(dir);
6730 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6731 mask |= DEBUG_GETATTR_CAPS;
6732 req->head.args.getattr.mask = mask;
6733
6734 ldout(cct, 10) << __func__ << " on " << path << dendl;
6735
6736 int r = make_request(req, perms, target);
6737 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6738 return r;
6739 }
6740
6741 bool Client::_dentry_valid(const Dentry *dn)
6742 {
6743 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6744
6745 // is dn lease valid?
6746 utime_t now = ceph_clock_now();
6747 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6748 mds_sessions.count(dn->lease_mds)) {
6749 MetaSession &s = mds_sessions.at(dn->lease_mds);
6750 if (s.cap_ttl > now && s.cap_gen == dn->lease_gen) {
6751 dlease_hit();
6752 return true;
6753 }
6754
6755 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6756 << " vs lease_gen " << dn->lease_gen << dendl;
6757 }
6758
6759 dlease_miss();
6760 return false;
6761 }
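// Lease-validity sketch: a dentry lease from mds.N only counts as valid
// while its own ttl is in the future, we still hold a session to mds.N,
// that session's cap_ttl has not expired, and the session cap_gen still
// matches the lease_gen recorded when the lease was issued; anything else
// is scored as a dlease miss.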
6762
6763 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6764 const UserPerm& perms, std::string* alternate_name)
6765 {
6766 int r = 0;
6767 Dentry *dn = NULL;
6768 bool did_lookup_request = false;
6769 // can only request shared caps
6770 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6771
6772 if (dname == "..") {
6773 if (dir->dentries.empty()) {
6774 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6775 filepath path(dir->ino);
6776 req->set_filepath(path);
6777
6778 InodeRef tmptarget;
6779 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6780
6781 if (r == 0) {
6782 *target = std::move(tmptarget);
6783 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6784 } else {
6785 *target = dir;
6786 }
6787 }
6788 else
6789 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6790 goto done;
6791 }
6792
6793 if (dname == ".") {
6794 *target = dir;
6795 goto done;
6796 }
6797
6798 if (!dir->is_dir()) {
6799 r = -CEPHFS_ENOTDIR;
6800 goto done;
6801 }
6802
6803 if (dname.length() > NAME_MAX) {
6804 r = -CEPHFS_ENAMETOOLONG;
6805 goto done;
6806 }
6807
6808 if (dname == cct->_conf->client_snapdir &&
6809 dir->snapid == CEPH_NOSNAP) {
6810 *target = open_snapdir(dir);
6811 goto done;
6812 }
6813
6814 relookup:
6815 if (dir->dir &&
6816 dir->dir->dentries.count(dname)) {
6817 dn = dir->dir->dentries[dname];
6818
6819 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6820 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6821
6822 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6823 if (_dentry_valid(dn)) {
6824 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6825 // make trim_caps() behave.
6826 dir->try_touch_cap(dn->lease_mds);
6827 goto hit_dn;
6828 }
6829 // dir shared caps?
6830 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6831 if (dn->cap_shared_gen == dir->shared_gen &&
6832 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6833 goto hit_dn;
6834 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6835 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6836 << *dir << " dn '" << dname << "'" << dendl;
6837 return -CEPHFS_ENOENT;
6838 }
6839 }
6840 } else {
6841 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6842 }
6843 } else {
6844 // can we conclude ENOENT locally?
6845 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6846 (dir->flags & I_COMPLETE)) {
6847 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6848 return -CEPHFS_ENOENT;
6849 }
6850 }
6851
6852 if (did_lookup_request) {
6853 r = 0;
6854 goto done;
6855 }
6856 r = _do_lookup(dir, dname, mask, target, perms);
6857 did_lookup_request = true;
6858 if (r == 0) {
6859 /* complete lookup to get dentry for alternate_name */
6860 goto relookup;
6861 } else {
6862 goto done;
6863 }
6864
6865 hit_dn:
6866 if (dn->inode) {
6867 *target = dn->inode;
6868 if (alternate_name)
6869 *alternate_name = dn->alternate_name;
6870 } else {
6871 r = -CEPHFS_ENOENT;
6872 }
6873 touch_dn(dn);
6874 goto done;
6875
6876 done:
6877 if (r < 0)
6878 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6879 else
6880 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6881 return r;
6882 }
6883
6884 int Client::get_or_create(Inode *dir, const char* name,
6885 Dentry **pdn, bool expect_null)
6886 {
6887 // lookup
6888 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6889 dir->open_dir();
6890 if (dir->dir->dentries.count(name)) {
6891 Dentry *dn = dir->dir->dentries[name];
6892 if (_dentry_valid(dn)) {
6893 if (expect_null)
6894 return -CEPHFS_EEXIST;
6895 }
6896 *pdn = dn;
6897 } else {
6898 // otherwise link up a new one
6899 *pdn = link(dir->dir, name, NULL, NULL);
6900 }
6901
6902 // success
6903 return 0;
6904 }
6905
6906 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
6907 {
6908 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6909 if (!mref_reader.is_state_satisfied())
6910 return -CEPHFS_ENOTCONN;
6911
6912 ldout(cct, 10) << __func__ << ": " << path << dendl;
6913
6914 std::scoped_lock lock(client_lock);
6915
6916 return path_walk(path, wdr, perms, followsym);
6917 }
6918
6919 int Client::path_walk(const filepath& origpath, InodeRef *end,
6920 const UserPerm& perms, bool followsym, int mask)
6921 {
6922 walk_dentry_result wdr;
6923 int rc = path_walk(origpath, &wdr, perms, followsym, mask);
6924 *end = std::move(wdr.in);
6925 return rc;
6926 }
6927
6928 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms, bool followsym, int mask)
6929 {
6930 filepath path = origpath;
6931 InodeRef cur;
6932 std::string alternate_name;
6933 if (origpath.absolute())
6934 cur = root;
6935 else
6936 cur = cwd;
6937 ceph_assert(cur);
6938
6939 ldout(cct, 10) << __func__ << " " << path << dendl;
6940
6941 int symlinks = 0;
6942
6943 unsigned i=0;
6944 while (i < path.depth() && cur) {
6945 int caps = 0;
6946 const string &dname = path[i];
6947 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6948 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6949 InodeRef next;
6950 if (cct->_conf->client_permissions) {
6951 int r = may_lookup(cur.get(), perms);
6952 if (r < 0)
6953 return r;
6954 caps = CEPH_CAP_AUTH_SHARED;
6955 }
6956
6957 /* Get extra requested caps on the last component */
6958 if (i == (path.depth() - 1))
6959 caps |= mask;
6960 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
6961 if (r < 0)
6962 return r;
6963 // only follow a trailing symlink if followsym; always follow
6964 // intermediate 'directory' symlinks.
6965 if (next && next->is_symlink()) {
6966 symlinks++;
6967 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6968 if (symlinks > MAXSYMLINKS) {
6969 return -CEPHFS_ELOOP;
6970 }
6971
6972 if (i < path.depth() - 1) {
6973 // dir symlink
6974 // replace consumed components of path with symlink dir target
6975 filepath resolved(next->symlink.c_str());
6976 resolved.append(path.postfixpath(i + 1));
6977 path = resolved;
6978 i = 0;
6979 if (next->symlink[0] == '/') {
6980 cur = root;
6981 }
6982 continue;
6983 } else if (followsym) {
6984 if (next->symlink[0] == '/') {
6985 path = next->symlink.c_str();
6986 i = 0;
6987 // reset position
6988 cur = root;
6989 } else {
6990 filepath more(next->symlink.c_str());
6991 // we need to remove the symlink component from the path
6992 // before appending the target that the symlink points to,
6993 // remaining at the same position in the path.
6994 path.pop_dentry();
6995 path.append(more);
6996 }
6997 continue;
6998 }
6999 }
7000 cur.swap(next);
7001 i++;
7002 }
7003 if (!cur)
7004 return -CEPHFS_ENOENT;
7005 if (result) {
7006 result->in = std::move(cur);
7007 result->alternate_name = std::move(alternate_name);
7008 }
7009 return 0;
7010 }
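// Symlink-resolution sketch: walking "a/b/c" where "b" is a symlink to
// "/x" rewrites the remaining path to "/x/c" and restarts from the root;
// a relative target is spliced in at the current position instead. A
// trailing symlink ("c" itself) is only followed when followsym is true,
// and more than MAXSYMLINKS expansions yields -CEPHFS_ELOOP.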
7011
7012
7013 // namespace ops
7014
7015 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7016 {
7017 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7018 if (!mref_reader.is_state_satisfied())
7019 return -CEPHFS_ENOTCONN;
7020
7021 tout(cct) << "link" << std::endl;
7022 tout(cct) << relexisting << std::endl;
7023 tout(cct) << relpath << std::endl;
7024
7025 filepath existing(relexisting);
7026
7027 InodeRef in, dir;
7028
7029 std::scoped_lock lock(client_lock);
7030 int r = path_walk(existing, &in, perm, true);
7031 if (r < 0)
7032 return r;
7033 if (std::string(relpath) == "/") {
7034 r = -CEPHFS_EEXIST;
7035 return r;
7036 }
7037 filepath path(relpath);
7038 string name = path.last_dentry();
7039 path.pop_dentry();
7040
7041 r = path_walk(path, &dir, perm, true);
7042 if (r < 0)
7043 return r;
7044 if (cct->_conf->client_permissions) {
7045 if (S_ISDIR(in->mode)) {
7046 r = -CEPHFS_EPERM;
7047 return r;
7048 }
7049 r = may_hardlink(in.get(), perm);
7050 if (r < 0)
7051 return r;
7052 r = may_create(dir.get(), perm);
7053 if (r < 0)
7054 return r;
7055 }
7056 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7057 return r;
7058 }
7059
7060 int Client::unlink(const char *relpath, const UserPerm& perm)
7061 {
7062 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7063 if (!mref_reader.is_state_satisfied())
7064 return -CEPHFS_ENOTCONN;
7065
7066 tout(cct) << __func__ << std::endl;
7067 tout(cct) << relpath << std::endl;
7068
7069 if (std::string(relpath) == "/")
7070 return -CEPHFS_EISDIR;
7071
7072 filepath path(relpath);
7073 string name = path.last_dentry();
7074 path.pop_dentry();
7075 InodeRef dir;
7076
7077 std::scoped_lock lock(client_lock);
7078 int r = path_walk(path, &dir, perm);
7079 if (r < 0)
7080 return r;
7081 if (cct->_conf->client_permissions) {
7082 r = may_delete(dir.get(), name.c_str(), perm);
7083 if (r < 0)
7084 return r;
7085 }
7086 return _unlink(dir.get(), name.c_str(), perm);
7087 }
7088
7089 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7090 {
7091 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7092 if (!mref_reader.is_state_satisfied())
7093 return -CEPHFS_ENOTCONN;
7094
7095 tout(cct) << __func__ << std::endl;
7096 tout(cct) << relfrom << std::endl;
7097 tout(cct) << relto << std::endl;
7098
7099 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7100 return -CEPHFS_EBUSY;
7101
7102 filepath from(relfrom);
7103 filepath to(relto);
7104 string fromname = from.last_dentry();
7105 from.pop_dentry();
7106 string toname = to.last_dentry();
7107 to.pop_dentry();
7108
7109 InodeRef fromdir, todir;
7110
7111 std::scoped_lock lock(client_lock);
7112 int r = path_walk(from, &fromdir, perm);
7113 if (r < 0)
7114 goto out;
7115 r = path_walk(to, &todir, perm);
7116 if (r < 0)
7117 goto out;
7118
7119 if (cct->_conf->client_permissions) {
7120 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7121 if (r < 0)
7122 return r;
7123 r = may_delete(todir.get(), toname.c_str(), perm);
7124 if (r < 0 && r != -CEPHFS_ENOENT)
7125 return r;
7126 }
7127 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7128 out:
7129 return r;
7130 }
7131
7132 // dirs
7133
7134 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7135 {
7136 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7137 if (!mref_reader.is_state_satisfied())
7138 return -CEPHFS_ENOTCONN;
7139
7140 tout(cct) << __func__ << std::endl;
7141 tout(cct) << relpath << std::endl;
7142 tout(cct) << mode << std::endl;
7143 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7144
7145 if (std::string(relpath) == "/")
7146 return -CEPHFS_EEXIST;
7147
7148 filepath path(relpath);
7149 string name = path.last_dentry();
7150 path.pop_dentry();
7151 InodeRef dir;
7152
7153 std::scoped_lock lock(client_lock);
7154 int r = path_walk(path, &dir, perm);
7155 if (r < 0)
7156 return r;
7157 if (cct->_conf->client_permissions) {
7158 r = may_create(dir.get(), perm);
7159 if (r < 0)
7160 return r;
7161 }
7162 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7163 }
7164
7165 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7166 {
7167 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7168 if (!mref_reader.is_state_satisfied())
7169 return -CEPHFS_ENOTCONN;
7170
7171 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7172 tout(cct) << __func__ << std::endl;
7173 tout(cct) << relpath << std::endl;
7174 tout(cct) << mode << std::endl;
7175
7176 //get through existing parts of path
7177 filepath path(relpath);
7178 unsigned int i;
7179 int r = 0, caps = 0;
7180 InodeRef cur, next;
7181
7182 std::scoped_lock lock(client_lock);
7183 cur = cwd;
7184 for (i=0; i<path.depth(); ++i) {
7185 if (cct->_conf->client_permissions) {
7186 r = may_lookup(cur.get(), perms);
7187 if (r < 0)
7188 break;
7189 caps = CEPH_CAP_AUTH_SHARED;
7190 }
7191 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7192 if (r < 0)
7193 break;
7194 cur.swap(next);
7195 }
7196 if (r != -CEPHFS_ENOENT) return r;
7197 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7198 //make new directory at each level
7199 for (; i<path.depth(); ++i) {
7200 if (cct->_conf->client_permissions) {
7201 r = may_create(cur.get(), perms);
7202 if (r < 0)
7203 return r;
7204 }
7205 //make new dir
7206 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7207
7208 //check proper creation/existence
7209 if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
7210 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7211 }
7212 if (r < 0)
7213 return r;
7214 //move to new dir and continue
7215 cur.swap(next);
7216 ldout(cct, 20) << __func__ << ": successfully created directory "
7217 << filepath(cur->ino).get_path() << dendl;
7218 }
7219 return 0;
7220 }
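// mkdir -p style usage sketch (hypothetical caller):
//
//   int r = client->mkdirs("a/b/c", 0755, perms);
//   // existing prefix components are skipped; each missing component is
//   // created in turn, and a racing creator is tolerated via the
//   // -CEPHFS_EEXIST lookup fallback above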
7221
7222 int Client::rmdir(const char *relpath, const UserPerm& perms)
7223 {
7224 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7225 if (!mref_reader.is_state_satisfied())
7226 return -CEPHFS_ENOTCONN;
7227
7228 tout(cct) << __func__ << std::endl;
7229 tout(cct) << relpath << std::endl;
7230
7231 if (std::string(relpath) == "/")
7232 return -CEPHFS_EBUSY;
7233
7234 filepath path(relpath);
7235 string name = path.last_dentry();
7236 path.pop_dentry();
7237 InodeRef dir;
7238
7239 std::scoped_lock lock(client_lock);
7240 int r = path_walk(path, &dir, perms);
7241 if (r < 0)
7242 return r;
7243 if (cct->_conf->client_permissions) {
7244 int r = may_delete(dir.get(), name.c_str(), perms);
7245 if (r < 0)
7246 return r;
7247 }
7248 return _rmdir(dir.get(), name.c_str(), perms);
7249 }
7250
7251 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7252 {
7253 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7254 if (!mref_reader.is_state_satisfied())
7255 return -CEPHFS_ENOTCONN;
7256
7257 tout(cct) << __func__ << std::endl;
7258 tout(cct) << relpath << std::endl;
7259 tout(cct) << mode << std::endl;
7260 tout(cct) << rdev << std::endl;
7261
7262 if (std::string(relpath) == "/")
7263 return -CEPHFS_EEXIST;
7264
7265 filepath path(relpath);
7266 string name = path.last_dentry();
7267 path.pop_dentry();
7268 InodeRef dir;
7269
7270 std::scoped_lock lock(client_lock);
7271 int r = path_walk(path, &dir, perms);
7272 if (r < 0)
7273 return r;
7274 if (cct->_conf->client_permissions) {
7275 int r = may_create(dir.get(), perms);
7276 if (r < 0)
7277 return r;
7278 }
7279 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7280 }
7281
7282 // symlinks
7283
7284 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7285 {
7286 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7287 if (!mref_reader.is_state_satisfied())
7288 return -CEPHFS_ENOTCONN;
7289
7290 tout(cct) << __func__ << std::endl;
7291 tout(cct) << target << std::endl;
7292 tout(cct) << relpath << std::endl;
7293
7294 if (std::string(relpath) == "/")
7295 return -CEPHFS_EEXIST;
7296
7297 filepath path(relpath);
7298 string name = path.last_dentry();
7299 path.pop_dentry();
7300 InodeRef dir;
7301
7302 std::scoped_lock lock(client_lock);
7303 int r = path_walk(path, &dir, perms);
7304 if (r < 0)
7305 return r;
7306 if (cct->_conf->client_permissions) {
7307 int r = may_create(dir.get(), perms);
7308 if (r < 0)
7309 return r;
7310 }
7311 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7312 }
7313
7314 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7315 {
7316 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7317 if (!mref_reader.is_state_satisfied())
7318 return -CEPHFS_ENOTCONN;
7319
7320 tout(cct) << __func__ << std::endl;
7321 tout(cct) << relpath << std::endl;
7322
7323 filepath path(relpath);
7324 InodeRef in;
7325
7326 std::scoped_lock lock(client_lock);
7327 int r = path_walk(path, &in, perms, false);
7328 if (r < 0)
7329 return r;
7330
7331 return _readlink(in.get(), buf, size);
7332 }
7333
7334 int Client::_readlink(Inode *in, char *buf, size_t size)
7335 {
7336 if (!in->is_symlink())
7337 return -CEPHFS_EINVAL;
7338
7339 // copy into buf (at most size bytes)
7340 int r = in->symlink.length();
7341 if (r > (int)size)
7342 r = size;
7343 memcpy(buf, in->symlink.c_str(), r);
7344 return r;
7345 }
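// readlink(2)-style contract sketch: the target is copied without a
// trailing NUL and silently truncated to the buffer size, so a return
// value equal to `size` may indicate truncation:
//
//   char buf[64];
//   int n = client->readlink("mylink", buf, sizeof(buf), perms);
//   // n == sizeof(buf) => possibly truncated; buf is not NUL-terminated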
7346
7347
7348 // inode stuff
7349
7350 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7351 {
7352 bool yes = in->caps_issued_mask(mask, true);
7353
7354 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7355 if (yes && !force)
7356 return 0;
7357
7358 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7359 filepath path;
7360 in->make_nosnap_relative_path(path);
7361 req->set_filepath(path);
7362 req->set_inode(in);
7363 req->head.args.getattr.mask = mask;
7364
7365 int res = make_request(req, perms);
7366 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7367 return res;
7368 }
7369
7370 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7371 const UserPerm& perms, InodeRef *inp)
7372 {
7373 int issued = in->caps_issued();
7374
7375 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7376 ccap_string(issued) << dendl;
7377
7378 if (in->snapid != CEPH_NOSNAP) {
7379 return -CEPHFS_EROFS;
7380 }
7381 if ((mask & CEPH_SETATTR_SIZE) &&
7382 (uint64_t)stx->stx_size > in->size &&
7383 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7384 perms)) {
7385 return -CEPHFS_EDQUOT;
7386 }
7387
7388 // make the change locally?
7389 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7390 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7391 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7392 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7393 << in->cap_dirtier_gid << ", forcing sync setattr"
7394 << dendl;
7395 /*
7396 * This works because we implicitly flush the caps as part of the
7397 * request, so the cap update check will happen with the writeback
7398 * cap context, and then the setattr check will happen with the
7399 * caller's context.
7400 *
7401 * In reality this pattern is likely pretty rare (different users
7402 * setattr'ing the same file). If that turns out not to be the
7403 * case later, we can build a more complex pipelined cap writeback
7404 * infrastructure...
7405 */
7406 if (!mask)
7407 mask |= CEPH_SETATTR_CTIME;
7408 goto force_request;
7409 }
7410
7411 if (!mask) {
7412 // caller just needs us to bump the ctime
7413 in->ctime = ceph_clock_now();
7414 in->cap_dirtier_uid = perms.uid();
7415 in->cap_dirtier_gid = perms.gid();
7416 if (issued & CEPH_CAP_AUTH_EXCL)
7417 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7418 else if (issued & CEPH_CAP_FILE_EXCL)
7419 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7420 else if (issued & CEPH_CAP_XATTR_EXCL)
7421 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7422 else
7423 mask |= CEPH_SETATTR_CTIME;
7424 }
7425
7426 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7427 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7428
7429 mask &= ~CEPH_SETATTR_KILL_SGUID;
7430
7431 if (mask & CEPH_SETATTR_UID) {
7432 in->ctime = ceph_clock_now();
7433 in->cap_dirtier_uid = perms.uid();
7434 in->cap_dirtier_gid = perms.gid();
7435 in->uid = stx->stx_uid;
7436 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7437 mask &= ~CEPH_SETATTR_UID;
7438 kill_sguid = true;
7439 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7440 }
7441 if (mask & CEPH_SETATTR_GID) {
7442 in->ctime = ceph_clock_now();
7443 in->cap_dirtier_uid = perms.uid();
7444 in->cap_dirtier_gid = perms.gid();
7445 in->gid = stx->stx_gid;
7446 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7447 mask &= ~CEPH_SETATTR_GID;
7448 kill_sguid = true;
7449 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7450 }
7451
7452 if (mask & CEPH_SETATTR_MODE) {
7453 in->ctime = ceph_clock_now();
7454 in->cap_dirtier_uid = perms.uid();
7455 in->cap_dirtier_gid = perms.gid();
7456 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7457 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7458 mask &= ~CEPH_SETATTR_MODE;
7459 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7460 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7461 /* Must squash any setuid/setgid bits with an ownership change */
7462 in->mode &= ~(S_ISUID|S_ISGID);
7463 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7464 }
7465
7466 if (mask & CEPH_SETATTR_BTIME) {
7467 in->ctime = ceph_clock_now();
7468 in->cap_dirtier_uid = perms.uid();
7469 in->cap_dirtier_gid = perms.gid();
7470 in->btime = utime_t(stx->stx_btime);
7471 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7472 mask &= ~CEPH_SETATTR_BTIME;
7473 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7474 }
7475 } else if (mask & CEPH_SETATTR_SIZE) {
7476 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7477 mask |= CEPH_SETATTR_KILL_SGUID;
7478 }
7479
7480 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7481 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7482 if (mask & CEPH_SETATTR_MTIME)
7483 in->mtime = utime_t(stx->stx_mtime);
7484 if (mask & CEPH_SETATTR_ATIME)
7485 in->atime = utime_t(stx->stx_atime);
7486 in->ctime = ceph_clock_now();
7487 in->cap_dirtier_uid = perms.uid();
7488 in->cap_dirtier_gid = perms.gid();
7489 in->time_warp_seq++;
7490 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7491 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7492 }
7493 }
7494 if (!mask) {
7495 in->change_attr++;
7496 return 0;
7497 }
7498
7499 force_request:
7500 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7501
7502 filepath path;
7503
7504 in->make_nosnap_relative_path(path);
7505 req->set_filepath(path);
7506 req->set_inode(in);
7507
7508 if (mask & CEPH_SETATTR_KILL_SGUID) {
7509 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7510 }
7511 if (mask & CEPH_SETATTR_MODE) {
7512 req->head.args.setattr.mode = stx->stx_mode;
7513 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7514 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7515 }
7516 if (mask & CEPH_SETATTR_UID) {
7517 req->head.args.setattr.uid = stx->stx_uid;
7518 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7519 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7520 }
7521 if (mask & CEPH_SETATTR_GID) {
7522 req->head.args.setattr.gid = stx->stx_gid;
7523 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7524 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7525 }
7526 if (mask & CEPH_SETATTR_BTIME) {
7527 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7528 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7529 }
7530 if (mask & CEPH_SETATTR_MTIME) {
7531 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7532 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7533 CEPH_CAP_FILE_WR;
7534 }
7535 if (mask & CEPH_SETATTR_ATIME) {
7536 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7537 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7538 CEPH_CAP_FILE_WR;
7539 }
7540 if (mask & CEPH_SETATTR_SIZE) {
7541 if ((uint64_t)stx->stx_size < mdsmap->get_max_filesize()) {
7542 req->head.args.setattr.size = stx->stx_size;
7543 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7544 } else { //too big!
7545 put_request(req);
7546 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7547 return -CEPHFS_EFBIG;
7548 }
7549 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7550 CEPH_CAP_FILE_WR;
7551 }
7552 req->head.args.setattr.mask = mask;
7553
7554 req->regetattr_mask = mask;
7555
7556 int res = make_request(req, perms, inp);
7557 ldout(cct, 10) << "_setattr result=" << res << dendl;
7558 return res;
7559 }
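// Summary sketch of the paths chosen above (illustrative): when this client
// holds CEPH_CAP_AUTH_EXCL (and the caller matches the cap dirtier), a
// chown-like change is applied purely in memory and flushed later; without
// Ax the same change becomes a synchronous MDS SETATTR request, e.g.
//
//   struct ceph_statx stx = {};
//   stx.stx_uid = 1000;                        // hypothetical value
//   _do_setattr(in, &stx, CEPH_SETATTR_UID, perms, nullptr);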
7560
7561 /* Note that we only care about attrs that setattr cares about */
7562 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7563 {
7564 stx->stx_size = st->st_size;
7565 stx->stx_mode = st->st_mode;
7566 stx->stx_uid = st->st_uid;
7567 stx->stx_gid = st->st_gid;
7568 #ifdef __APPLE__
7569 stx->stx_mtime = st->st_mtimespec;
7570 stx->stx_atime = st->st_atimespec;
7571 #elif defined(_WIN32)
7572 stx->stx_mtime.tv_sec = st->st_mtime;
7573 stx->stx_atime.tv_sec = st->st_atime;
7574 #else
7575 stx->stx_mtime = st->st_mtim;
7576 stx->stx_atime = st->st_atim;
7577 #endif
7578 }
7579
7580 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7581 const UserPerm& perms, InodeRef *inp)
7582 {
7583 int ret = _do_setattr(in, stx, mask, perms, inp);
7584 if (ret < 0)
7585 return ret;
7586 if (mask & CEPH_SETATTR_MODE)
7587 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7588 return ret;
7589 }
7590
7591 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7592 const UserPerm& perms)
7593 {
7594 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7595 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7596 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7597 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7598 if (cct->_conf->client_permissions) {
7599 int r = may_setattr(in.get(), stx, mask, perms);
7600 if (r < 0)
7601 return r;
7602 }
7603 return __setattrx(in.get(), stx, mask, perms);
7604 }
7605
7606 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7607 const UserPerm& perms)
7608 {
7609 struct ceph_statx stx;
7610
7611 stat_to_statx(attr, &stx);
7612 mask &= ~CEPH_SETATTR_BTIME;
7613
7614 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7615 mask &= ~CEPH_SETATTR_UID;
7616 }
7617 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7618 mask &= ~CEPH_SETATTR_GID;
7619 }
7620
7621 return _setattrx(in, &stx, mask, perms);
7622 }
7623
7624 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7625 const UserPerm& perms)
7626 {
7627 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7628 if (!mref_reader.is_state_satisfied())
7629 return -CEPHFS_ENOTCONN;
7630
7631 tout(cct) << __func__ << std::endl;
7632 tout(cct) << relpath << std::endl;
7633 tout(cct) << mask << std::endl;
7634
7635 filepath path(relpath);
7636 InodeRef in;
7637
7638 std::scoped_lock lock(client_lock);
7639 int r = path_walk(path, &in, perms);
7640 if (r < 0)
7641 return r;
7642 return _setattr(in, attr, mask, perms);
7643 }
7644
7645 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7646 const UserPerm& perms, int flags)
7647 {
7648 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7649 if (!mref_reader.is_state_satisfied())
7650 return -CEPHFS_ENOTCONN;
7651
7652 tout(cct) << __func__ << std::endl;
7653 tout(cct) << relpath << std::endl;
7654 tout(cct) << mask << std::endl;
7655
7656 filepath path(relpath);
7657 InodeRef in;
7658
7659 std::scoped_lock lock(client_lock);
7660 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7661 if (r < 0)
7662 return r;
7663 return _setattrx(in, stx, mask, perms);
7664 }
7665
7666 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7667 {
7668 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7669 if (!mref_reader.is_state_satisfied())
7670 return -CEPHFS_ENOTCONN;
7671
7672 tout(cct) << __func__ << std::endl;
7673 tout(cct) << fd << std::endl;
7674 tout(cct) << mask << std::endl;
7675
7676 std::scoped_lock lock(client_lock);
7677 Fh *f = get_filehandle(fd);
7678 if (!f)
7679 return -CEPHFS_EBADF;
7680 #if defined(__linux__) && defined(O_PATH)
7681 if (f->flags & O_PATH)
7682 return -CEPHFS_EBADF;
7683 #endif
7684 return _setattr(f->inode, attr, mask, perms);
7685 }
7686
7687 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7688 {
7689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7690 if (!mref_reader.is_state_satisfied())
7691 return -CEPHFS_ENOTCONN;
7692
7693 tout(cct) << __func__ << std::endl;
7694 tout(cct) << fd << std::endl;
7695 tout(cct) << mask << std::endl;
7696
7697 std::scoped_lock lock(client_lock);
7698 Fh *f = get_filehandle(fd);
7699 if (!f)
7700 return -CEPHFS_EBADF;
7701 #if defined(__linux__) && defined(O_PATH)
7702 if (f->flags & O_PATH)
7703 return -CEPHFS_EBADF;
7704 #endif
7705 return _setattrx(f->inode, stx, mask, perms);
7706 }
7707
7708 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7709 frag_info_t *dirstat, int mask)
7710 {
7711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7712 if (!mref_reader.is_state_satisfied())
7713 return -CEPHFS_ENOTCONN;
7714
7715 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7716 tout(cct) << "stat" << std::endl;
7717 tout(cct) << relpath << std::endl;
7718
7719 filepath path(relpath);
7720 InodeRef in;
7721
7722 std::scoped_lock lock(client_lock);
7723 int r = path_walk(path, &in, perms, true, mask);
7724 if (r < 0)
7725 return r;
7726 r = _getattr(in, mask, perms);
7727 if (r < 0) {
7728 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7729 return r;
7730 }
7731 fill_stat(in, stbuf, dirstat);
7732 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7733 return r;
7734 }
7735
7736 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7737 {
7738 unsigned mask = 0;
7739
7740 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7741 if (flags & AT_NO_ATTR_SYNC)
7742 goto out;
7743
7744 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7745 mask |= CEPH_CAP_PIN;
7746 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7747 mask |= CEPH_CAP_AUTH_SHARED;
7748 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7749 mask |= CEPH_CAP_LINK_SHARED;
7750 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7751 mask |= CEPH_CAP_FILE_SHARED;
7752 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7753 mask |= CEPH_CAP_XATTR_SHARED;
7754 out:
7755 return mask;
7756 }
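// Worked example (assuming the mappings above): a caller asking only for
// CEPH_STATX_SIZE gets CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED, whereas
// AT_NO_ATTR_SYNC short-circuits to mask == 0 so cached attributes are used
// as-is:
//
//   unsigned m1 = statx_to_mask(0, CEPH_STATX_SIZE);        // PIN | Fs
//   unsigned m2 = statx_to_mask(AT_NO_ATTR_SYNC, ~0u);      // 0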
7757
7758 int Client::statx(const char *relpath, struct ceph_statx *stx,
7759 const UserPerm& perms,
7760 unsigned int want, unsigned int flags)
7761 {
7762 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7763 if (!mref_reader.is_state_satisfied())
7764 return -CEPHFS_ENOTCONN;
7765
7766 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7767 tout(cct) << "statx" << std::endl;
7768 tout(cct) << relpath << std::endl;
7769
7770 filepath path(relpath);
7771 InodeRef in;
7772
7773 unsigned mask = statx_to_mask(flags, want);
7774
7775 std::scoped_lock lock(client_lock);
7776 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7777 if (r < 0)
7778 return r;
7779
7780 r = _getattr(in, mask, perms);
7781 if (r < 0) {
7782 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7783 return r;
7784 }
7785
7786 fill_statx(in, mask, stx);
7787 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7788 return r;
7789 }
7790
7791 int Client::lstat(const char *relpath, struct stat *stbuf,
7792 const UserPerm& perms, frag_info_t *dirstat, int mask)
7793 {
7794 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7795 if (!mref_reader.is_state_satisfied())
7796 return -CEPHFS_ENOTCONN;
7797
7798 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7799 tout(cct) << __func__ << std::endl;
7800 tout(cct) << relpath << std::endl;
7801
7802 filepath path(relpath);
7803 InodeRef in;
7804
7805 std::scoped_lock lock(client_lock);
7806 // don't follow symlinks
7807 int r = path_walk(path, &in, perms, false, mask);
7808 if (r < 0)
7809 return r;
7810 r = _getattr(in, mask, perms);
7811 if (r < 0) {
7812 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7813 return r;
7814 }
7815 fill_stat(in, stbuf, dirstat);
7816 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7817 return r;
7818 }
7819
7820 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7821 {
7822 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7823 << " mode 0" << oct << in->mode << dec
7824 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7825 memset(st, 0, sizeof(struct stat));
7826 if (use_faked_inos())
7827 st->st_ino = in->faked_ino;
7828 else
7829 st->st_ino = in->ino;
7830 st->st_dev = in->snapid;
7831 st->st_mode = in->mode;
7832 st->st_rdev = in->rdev;
7833 if (in->is_dir()) {
7834 switch (in->nlink) {
7835 case 0:
7836 st->st_nlink = 0; /* dir is unlinked */
7837 break;
7838 case 1:
7839 st->st_nlink = 1 /* parent dentry */
7840 + 1 /* <dir>/. self-reference */
7841 + in->dirstat.nsubdirs; /* one <subdir>/.. per subdirectory */
7842 break;
7843 default:
7844 ceph_abort();
7845 }
7846 } else {
7847 st->st_nlink = in->nlink;
7848 }
7849 st->st_uid = in->uid;
7850 st->st_gid = in->gid;
7851 if (in->ctime > in->mtime) {
7852 stat_set_ctime_sec(st, in->ctime.sec());
7853 stat_set_ctime_nsec(st, in->ctime.nsec());
7854 } else {
7855 stat_set_ctime_sec(st, in->mtime.sec());
7856 stat_set_ctime_nsec(st, in->mtime.nsec());
7857 }
7858 stat_set_atime_sec(st, in->atime.sec());
7859 stat_set_atime_nsec(st, in->atime.nsec());
7860 stat_set_mtime_sec(st, in->mtime.sec());
7861 stat_set_mtime_nsec(st, in->mtime.nsec());
7862 if (in->is_dir()) {
7863 if (cct->_conf->client_dirsize_rbytes)
7864 st->st_size = in->rstat.rbytes;
7865 else
7866 st->st_size = in->dirstat.size();
7867 // The Windows "stat" structure provides just a subset of the fields that are
7868 // available on Linux.
7869 #ifndef _WIN32
7870 st->st_blocks = 1;
7871 #endif
7872 } else {
7873 st->st_size = in->size;
7874 #ifndef _WIN32
7875 st->st_blocks = (in->size + 511) >> 9;
7876 #endif
7877 }
7878 #ifndef _WIN32
7879 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7880 #endif
7881
7882 if (dirstat)
7883 *dirstat = in->dirstat;
7884 if (rstat)
7885 *rstat = in->rstat;
7886
7887 return in->caps_issued();
7888 }
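// Worked example of the st_nlink rule above (illustrative): a live directory
// with 3 subdirectories reports st_nlink = 1 (parent dentry) + 1 (its own
// "." entry) + 3 (one ".." per subdirectory) = 5, matching the classic
// POSIX "2 + nsubdirs" convention.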
7889
7890 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7891 {
7892 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7893 << " mode 0" << oct << in->mode << dec
7894 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7895 memset(stx, 0, sizeof(struct ceph_statx));
7896
7897 /*
7898 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7899 * so that all bits are set.
7900 */
7901 if (!mask)
7902 mask = ~0;
7903
7904 /* These are always considered to be available */
7905 stx->stx_dev = in->snapid;
7906 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7907
7908 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7909 stx->stx_mode = S_IFMT & in->mode;
7910 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7911 stx->stx_rdev = in->rdev;
7912 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7913
7914 if (mask & CEPH_CAP_AUTH_SHARED) {
7915 stx->stx_uid = in->uid;
7916 stx->stx_gid = in->gid;
7917 stx->stx_mode = in->mode;
7918 in->btime.to_timespec(&stx->stx_btime);
7919 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7920 }
7921
7922 if (mask & CEPH_CAP_LINK_SHARED) {
7923 if (in->is_dir()) {
7924 switch (in->nlink) {
7925 case 0:
7926 stx->stx_nlink = 0; /* dir is unlinked */
7927 break;
7928 case 1:
7929 stx->stx_nlink = 1 /* parent dentry */
7930 + 1 /* <dir>/. self-reference */
7931 + in->dirstat.nsubdirs; /* one <subdir>/.. per subdirectory */
7932 break;
7933 default:
7934 ceph_abort();
7935 }
7936 } else {
7937 stx->stx_nlink = in->nlink;
7938 }
7939 stx->stx_mask |= CEPH_STATX_NLINK;
7940 }
7941
7942 if (mask & CEPH_CAP_FILE_SHARED) {
7943
7944 in->atime.to_timespec(&stx->stx_atime);
7945 in->mtime.to_timespec(&stx->stx_mtime);
7946
7947 if (in->is_dir()) {
7948 if (cct->_conf->client_dirsize_rbytes)
7949 stx->stx_size = in->rstat.rbytes;
7950 else
7951 stx->stx_size = in->dirstat.size();
7952 stx->stx_blocks = 1;
7953 } else {
7954 stx->stx_size = in->size;
7955 stx->stx_blocks = (in->size + 511) >> 9;
7956 }
7957 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7958 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7959 }
7960
7961 /* Change time and change_attr both require all shared caps to view */
7962 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7963 stx->stx_version = in->change_attr;
7964 if (in->ctime > in->mtime)
7965 in->ctime.to_timespec(&stx->stx_ctime);
7966 else
7967 in->mtime.to_timespec(&stx->stx_ctime);
7968 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7969 }
7970
7971 }
7972
7973 void Client::touch_dn(Dentry *dn)
7974 {
7975 lru.lru_touch(dn);
7976 }
7977
7978 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7979 {
7980 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7981 if (!mref_reader.is_state_satisfied())
7982 return -CEPHFS_ENOTCONN;
7983
7984 tout(cct) << __func__ << std::endl;
7985 tout(cct) << relpath << std::endl;
7986 tout(cct) << mode << std::endl;
7987
7988 filepath path(relpath);
7989 InodeRef in;
7990
7991 std::scoped_lock lock(client_lock);
7992 int r = path_walk(path, &in, perms);
7993 if (r < 0)
7994 return r;
7995 struct stat attr;
7996 attr.st_mode = mode;
7997 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7998 }
7999
8000 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8001 {
8002 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8003 if (!mref_reader.is_state_satisfied())
8004 return -CEPHFS_ENOTCONN;
8005
8006 tout(cct) << __func__ << std::endl;
8007 tout(cct) << fd << std::endl;
8008 tout(cct) << mode << std::endl;
8009
8010 std::scoped_lock lock(client_lock);
8011 Fh *f = get_filehandle(fd);
8012 if (!f)
8013 return -CEPHFS_EBADF;
8014 #if defined(__linux__) && defined(O_PATH)
8015 if (f->flags & O_PATH)
8016 return -CEPHFS_EBADF;
8017 #endif
8018 struct stat attr;
8019 attr.st_mode = mode;
8020 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8021 }
8022
8023 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8024 {
8025 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8026 if (!mref_reader.is_state_satisfied())
8027 return -CEPHFS_ENOTCONN;
8028
8029 tout(cct) << __func__ << std::endl;
8030 tout(cct) << relpath << std::endl;
8031 tout(cct) << mode << std::endl;
8032
8033 filepath path(relpath);
8034 InodeRef in;
8035
8036 std::scoped_lock lock(client_lock);
8037 // don't follow symlinks
8038 int r = path_walk(path, &in, perms, false);
8039 if (r < 0)
8040 return r;
8041 struct stat attr;
8042 attr.st_mode = mode;
8043 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8044 }
8045
8046 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8047 const UserPerm& perms)
8048 {
8049 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8050 if (!mref_reader.is_state_satisfied())
8051 return -CEPHFS_ENOTCONN;
8052
8053 tout(cct) << __func__ << std::endl;
8054 tout(cct) << relpath << std::endl;
8055 tout(cct) << new_uid << std::endl;
8056 tout(cct) << new_gid << std::endl;
8057
8058 filepath path(relpath);
8059 InodeRef in;
8060
8061 std::scoped_lock lock(client_lock);
8062 int r = path_walk(path, &in, perms);
8063 if (r < 0)
8064 return r;
8065 struct stat attr;
8066 attr.st_uid = new_uid;
8067 attr.st_gid = new_gid;
8068 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8069 }
8070
8071 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8072 {
8073 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8074 if (!mref_reader.is_state_satisfied())
8075 return -CEPHFS_ENOTCONN;
8076
8077 tout(cct) << __func__ << std::endl;
8078 tout(cct) << fd << std::endl;
8079 tout(cct) << new_uid << std::endl;
8080 tout(cct) << new_gid << std::endl;
8081
8082 std::scoped_lock lock(client_lock);
8083 Fh *f = get_filehandle(fd);
8084 if (!f)
8085 return -CEPHFS_EBADF;
8086 #if defined(__linux__) && defined(O_PATH)
8087 if (f->flags & O_PATH)
8088 return -CEPHFS_EBADF;
8089 #endif
8090 struct stat attr;
8091 attr.st_uid = new_uid;
8092 attr.st_gid = new_gid;
8093 int mask = 0;
8094 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8095 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8096 return _setattr(f->inode, &attr, mask, perms);
8097 }
8098
8099 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8100 const UserPerm& perms)
8101 {
8102 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8103 if (!mref_reader.is_state_satisfied())
8104 return -CEPHFS_ENOTCONN;
8105
8106 tout(cct) << __func__ << std::endl;
8107 tout(cct) << relpath << std::endl;
8108 tout(cct) << new_uid << std::endl;
8109 tout(cct) << new_gid << std::endl;
8110
8111 filepath path(relpath);
8112 InodeRef in;
8113
8114 std::scoped_lock lock(client_lock);
8115 // don't follow symlinks
8116 int r = path_walk(path, &in, perms, false);
8117 if (r < 0)
8118 return r;
8119 struct stat attr;
8120 attr.st_uid = new_uid;
8121 attr.st_gid = new_gid;
8122 int mask = 0;
8123 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8124 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8125 return _setattr(in, &attr, mask, perms);
8126 }
8127
8128 static void attr_set_atime_and_mtime(struct stat *attr,
8129 const utime_t &atime,
8130 const utime_t &mtime)
8131 {
8132 stat_set_atime_sec(attr, atime.tv.tv_sec);
8133 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8134 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8135 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8136 }
8137
8138 // for [l]utime(), invoke the timeval variant, as the timespec
8139 // variants are not yet implemented. for futime[s](), invoke
8140 // the timespec variant.
8141 int Client::utime(const char *relpath, struct utimbuf *buf,
8142 const UserPerm& perms)
8143 {
8144 struct timeval tv[2];
8145 tv[0].tv_sec = buf->actime;
8146 tv[0].tv_usec = 0;
8147 tv[1].tv_sec = buf->modtime;
8148 tv[1].tv_usec = 0;
8149
8150 return utimes(relpath, tv, perms);
8151 }
8152
8153 int Client::lutime(const char *relpath, struct utimbuf *buf,
8154 const UserPerm& perms)
8155 {
8156 struct timeval tv[2];
8157 tv[0].tv_sec = buf->actime;
8158 tv[0].tv_usec = 0;
8159 tv[1].tv_sec = buf->modtime;
8160 tv[1].tv_usec = 0;
8161
8162 return lutimes(relpath, tv, perms);
8163 }
8164
8165 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8166 {
8167 struct timespec ts[2];
8168 ts[0].tv_sec = buf->actime;
8169 ts[0].tv_nsec = 0;
8170 ts[1].tv_sec = buf->modtime;
8171 ts[1].tv_nsec = 0;
8172
8173 return futimens(fd, ts, perms);
8174 }
8175
8176 int Client::utimes(const char *relpath, struct timeval times[2],
8177 const UserPerm& perms)
8178 {
8179 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8180 if (!mref_reader.is_state_satisfied())
8181 return -CEPHFS_ENOTCONN;
8182
8183 tout(cct) << __func__ << std::endl;
8184 tout(cct) << relpath << std::endl;
8185 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8186 << std::endl;
8187 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8188 << std::endl;
8189
8190 filepath path(relpath);
8191 InodeRef in;
8192
8193 std::scoped_lock lock(client_lock);
8194 int r = path_walk(path, &in, perms);
8195 if (r < 0)
8196 return r;
8197 struct stat attr;
8198 utime_t atime(times[0]);
8199 utime_t mtime(times[1]);
8200
8201 attr_set_atime_and_mtime(&attr, atime, mtime);
8202 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8203 }
8204
8205 int Client::lutimes(const char *relpath, struct timeval times[2],
8206 const UserPerm& perms)
8207 {
8208 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8209 if (!mref_reader.is_state_satisfied())
8210 return -CEPHFS_ENOTCONN;
8211
8212 tout(cct) << __func__ << std::endl;
8213 tout(cct) << relpath << std::endl;
8214 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8215 << std::endl;
8216 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8217 << std::endl;
8218
8219 filepath path(relpath);
8220 InodeRef in;
8221
8222 std::scoped_lock lock(client_lock);
8223 int r = path_walk(path, &in, perms, false);
8224 if (r < 0)
8225 return r;
8226 struct stat attr;
8227 utime_t atime(times[0]);
8228 utime_t mtime(times[1]);
8229
8230 attr_set_atime_and_mtime(&attr, atime, mtime);
8231 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8232 }
8233
8234 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8235 {
8236 struct timespec ts[2];
8237 ts[0].tv_sec = times[0].tv_sec;
8238 ts[0].tv_nsec = times[0].tv_usec * 1000;
8239 ts[1].tv_sec = times[1].tv_sec;
8240 ts[1].tv_nsec = times[1].tv_usec * 1000;
8241
8242 return futimens(fd, ts, perms);
8243 }
8244
8245 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8246 {
8247 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8248 if (!mref_reader.is_state_satisfied())
8249 return -CEPHFS_ENOTCONN;
8250
8251 tout(cct) << __func__ << std::endl;
8252 tout(cct) << fd << std::endl;
8253 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8254 << std::endl;
8255 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8256 << std::endl;
8257
8258 std::scoped_lock lock(client_lock);
8259 Fh *f = get_filehandle(fd);
8260 if (!f)
8261 return -CEPHFS_EBADF;
8262 #if defined(__linux__) && defined(O_PATH)
8263 if (f->flags & O_PATH)
8264 return -CEPHFS_EBADF;
8265 #endif
8266 struct stat attr;
8267 utime_t atime(times[0]);
8268 utime_t mtime(times[1]);
8269
8270 attr_set_atime_and_mtime(&attr, atime, mtime);
8271 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8272 }
8273
8274 int Client::flock(int fd, int operation, uint64_t owner)
8275 {
8276 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8277 if (!mref_reader.is_state_satisfied())
8278 return -CEPHFS_ENOTCONN;
8279
8280 tout(cct) << __func__ << std::endl;
8281 tout(cct) << fd << std::endl;
8282 tout(cct) << operation << std::endl;
8283 tout(cct) << owner << std::endl;
8284
8285 std::scoped_lock lock(client_lock);
8286 Fh *f = get_filehandle(fd);
8287 if (!f)
8288 return -CEPHFS_EBADF;
8289
8290 return _flock(f, operation, owner);
8291 }
8292
8293 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8294 {
8295 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8296 if (!mref_reader.is_state_satisfied())
8297 return -CEPHFS_ENOTCONN;
8298
8299 tout(cct) << __func__ << std::endl;
8300 tout(cct) << relpath << std::endl;
8301
8302 filepath path(relpath);
8303 InodeRef in;
8304
8305 std::scoped_lock lock(client_lock);
8306 int r = path_walk(path, &in, perms, true);
8307 if (r < 0)
8308 return r;
8309 if (cct->_conf->client_permissions) {
8310 int r = may_open(in.get(), O_RDONLY, perms);
8311 if (r < 0)
8312 return r;
8313 }
8314 r = _opendir(in.get(), dirpp, perms);
8315 /* on ENOTDIR, *dirpp is left uninitialized, so it must not be dereferenced */
8316 if (r != -CEPHFS_ENOTDIR)
8317 tout(cct) << (uintptr_t)*dirpp << std::endl;
8318 return r;
8319 }
8320
8321 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8322 {
8323 if (!in->is_dir())
8324 return -CEPHFS_ENOTDIR;
8325 *dirpp = new dir_result_t(in, perms);
8326 opened_dirs.insert(*dirpp);
8327 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8328 return 0;
8329 }
8330
8331
8332 int Client::closedir(dir_result_t *dir)
8333 {
8334 tout(cct) << __func__ << std::endl;
8335 tout(cct) << (uintptr_t)dir << std::endl;
8336
8337 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8338 std::scoped_lock lock(client_lock);
8339 _closedir(dir);
8340 return 0;
8341 }
8342
8343 void Client::_closedir(dir_result_t *dirp)
8344 {
8345 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8346
8347 if (dirp->inode) {
8348 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8349 dirp->inode.reset();
8350 }
8351 _readdir_drop_dirp_buffer(dirp);
8352 opened_dirs.erase(dirp);
8353 delete dirp;
8354 }
8355
8356 void Client::rewinddir(dir_result_t *dirp)
8357 {
8358 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8359
8360 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8361 if (!mref_reader.is_state_satisfied())
8362 return;
8363
8364 std::scoped_lock lock(client_lock);
8365 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8366 _readdir_drop_dirp_buffer(d);
8367 d->reset();
8368 }
8369
8370 loff_t Client::telldir(dir_result_t *dirp)
8371 {
8372 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8373 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8374 return d->offset;
8375 }
8376
8377 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8378 {
8379 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8380
8381 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8382 if (!mref_reader.is_state_satisfied())
8383 return;
8384
8385 std::scoped_lock lock(client_lock);
8386
8387 if (offset == dirp->offset)
8388 return;
8389
8390 if (offset > dirp->offset)
8391 dirp->release_count = 0; // a forward seek invalidates cache-completeness tracking
8392 else
8393 dirp->ordered_count = 0; // disable filling readdir cache
8394
8395 if (dirp->hash_order()) {
8396 if (dirp->offset > offset) {
8397 _readdir_drop_dirp_buffer(dirp);
8398 dirp->reset();
8399 }
8400 } else {
8401 if (offset == 0 ||
8402 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8403 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8404 _readdir_drop_dirp_buffer(dirp);
8405 dirp->reset();
8406 }
8407 }
8408
8409 dirp->offset = offset;
8410 }
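// Illustrative sketch of the offset encoding consumed above: telldir()
// positions pack (fragment, within-frag index) via dir_result_t::make_fpos,
// so a round-trip looks like
//
//   loff_t pos = telldir(dirp);   // e.g. frag 0, index 2 = first real entry
//   ...                           // read some more entries
//   seekdir(dirp, pos);           // may drop the buffer per the rules above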
8411
8412
8413 //struct dirent {
8414 // ino_t d_ino; /* inode number */
8415 // off_t d_off; /* offset to the next dirent */
8416 // unsigned short d_reclen; /* length of this record */
8417 // unsigned char d_type; /* type of file */
8418 // char d_name[256]; /* filename */
8419 //};
8420 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8421 {
8422 strncpy(de->d_name, name, 255);
8423 de->d_name[255] = '\0';
8424 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8425 de->d_ino = ino;
8426 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8427 de->d_off = next_off;
8428 #endif
8429 de->d_reclen = 1;
8430 de->d_type = IFTODT(type);
8431 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8432 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8433 #endif
8434 }
8435
8436 void Client::_readdir_next_frag(dir_result_t *dirp)
8437 {
8438 frag_t fg = dirp->buffer_frag;
8439
8440 if (fg.is_rightmost()) {
8441 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8442 dirp->set_end();
8443 return;
8444 }
8445
8446 // advance
8447 fg = fg.next();
8448 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8449
8450 if (dirp->hash_order()) {
8451 // keep last_name
8452 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8453 if (dirp->offset < new_offset) // don't decrease offset
8454 dirp->offset = new_offset;
8455 } else {
8456 dirp->last_name.clear();
8457 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8458 _readdir_rechoose_frag(dirp);
8459 }
8460 }
8461
8462 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8463 {
8464 ceph_assert(dirp->inode);
8465
8466 if (dirp->hash_order())
8467 return;
8468
8469 frag_t cur = frag_t(dirp->offset_high());
8470 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8471 if (fg != cur) {
8472 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8473 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8474 dirp->last_name.clear();
8475 dirp->next_offset = 2;
8476 }
8477 }
8478
8479 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8480 {
8481 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8482 dirp->buffer.clear();
8483 }
8484
8485 int Client::_readdir_get_frag(dir_result_t *dirp)
8486 {
8487 ceph_assert(dirp);
8488 ceph_assert(dirp->inode);
8489
8490 // get the current frag.
8491 frag_t fg;
8492 if (dirp->hash_order())
8493 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8494 else
8495 fg = frag_t(dirp->offset_high());
8496
8497 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8498 << " offset " << hex << dirp->offset << dec << dendl;
8499
8500 int op = CEPH_MDS_OP_READDIR;
8501 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8502 op = CEPH_MDS_OP_LSSNAP;
8503
8504 InodeRef& diri = dirp->inode;
8505
8506 MetaRequest *req = new MetaRequest(op);
8507 filepath path;
8508 diri->make_nosnap_relative_path(path);
8509 req->set_filepath(path);
8510 req->set_inode(diri.get());
8511 req->head.args.readdir.frag = fg;
8512 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8513 if (dirp->last_name.length()) {
8514 req->path2.set_path(dirp->last_name);
8515 } else if (dirp->hash_order()) {
8516 req->head.args.readdir.offset_hash = dirp->offset_high();
8517 }
8518 req->dirp = dirp;
8519
8520 bufferlist dirbl;
8521 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8522
8523 if (res == -CEPHFS_EAGAIN) {
8524 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8525 _readdir_rechoose_frag(dirp);
8526 return _readdir_get_frag(dirp);
8527 }
8528
8529 if (res == 0) {
8530 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8531 << " size " << dirp->buffer.size() << dendl;
8532 } else {
8533 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8534 dirp->set_end();
8535 }
8536
8537 return res;
8538 }
8539
8540 struct dentry_off_lt {
8541 bool operator()(const Dentry* dn, int64_t off) const {
8542 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8543 }
8544 };
8545
8546 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8547 int caps, bool getref)
8548 {
8549 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8550 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8551 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8552 << dendl;
8553 Dir *dir = dirp->inode->dir;
8554
8555 if (!dir) {
8556 ldout(cct, 10) << " dir is empty" << dendl;
8557 dirp->set_end();
8558 return 0;
8559 }
8560
8561 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8562 dir->readdir_cache.end(),
8563 dirp->offset, dentry_off_lt());
8564
8565 string dn_name;
8566 while (true) {
8567 int mask = caps;
8568 if (!dirp->inode->is_complete_and_ordered())
8569 return -CEPHFS_EAGAIN;
8570 if (pd == dir->readdir_cache.end())
8571 break;
8572 Dentry *dn = *pd;
8573 if (dn->inode == NULL) {
8574 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8575 ++pd;
8576 continue;
8577 }
8578 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8579 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8580 ++pd;
8581 continue;
8582 }
8583
8584 int idx = pd - dir->readdir_cache.begin();
8585 if (dn->inode->is_dir()) {
8586 mask |= CEPH_STAT_RSTAT;
8587 }
8588 int r = _getattr(dn->inode, mask, dirp->perms);
8589 if (r < 0)
8590 return r;
8591
8592 // the contents of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8593 pd = dir->readdir_cache.begin() + idx;
8594 if (pd >= dir->readdir_cache.end() || *pd != dn)
8595 return -CEPHFS_EAGAIN;
8596
8597 struct ceph_statx stx;
8598 struct dirent de;
8599 fill_statx(dn->inode, caps, &stx);
8600
8601 uint64_t next_off = dn->offset + 1;
8602 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8603 ++pd;
8604 if (pd == dir->readdir_cache.end())
8605 next_off = dir_result_t::END;
8606
8607 Inode *in = NULL;
8608 if (getref) {
8609 in = dn->inode.get();
8610 _ll_get(in);
8611 }
8612
8613 dn_name = dn->name; // fill in name while we have lock
8614
8615 client_lock.unlock();
8616 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8617 client_lock.lock();
8618 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8619 << " = " << r << dendl;
8620 if (r < 0) {
8621 return r;
8622 }
8623
8624 dirp->offset = next_off;
8625 if (dirp->at_end())
8626 dirp->next_offset = 2;
8627 else
8628 dirp->next_offset = dirp->offset_low();
8629 dirp->last_name = dn_name; // we successfully returned this one; update!
8630 dirp->release_count = 0; // last_name no longer matches the cache index
8631 if (r > 0)
8632 return r;
8633 }
8634
8635 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8636 dirp->set_end();
8637 return 0;
8638 }
8639
8640 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8641 unsigned want, unsigned flags, bool getref)
8642 {
8643 int caps = statx_to_mask(flags, want);
8644
8645 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8646 if (!mref_reader.is_state_satisfied())
8647 return -CEPHFS_ENOTCONN;
8648
8649 std::unique_lock cl(client_lock);
8650
8651 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8652
8653 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8654 << dec << " at_end=" << dirp->at_end()
8655 << " hash_order=" << dirp->hash_order() << dendl;
8656
8657 struct dirent de;
8658 struct ceph_statx stx;
8659 memset(&de, 0, sizeof(de));
8660 memset(&stx, 0, sizeof(stx));
8661
8662 InodeRef& diri = dirp->inode;
8663
8664 if (dirp->at_end())
8665 return 0;
8666
8667 if (dirp->offset == 0) {
8668 ldout(cct, 15) << " including ." << dendl;
8669 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8670 uint64_t next_off = 1;
8671
8672 int r;
8673 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
8674 if (r < 0)
8675 return r;
8676
8677 fill_statx(diri, caps, &stx);
8678 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8679
8680 Inode *inode = NULL;
8681 if (getref) {
8682 inode = diri.get();
8683 _ll_get(inode);
8684 }
8685
8686 cl.unlock();
8687 r = cb(p, &de, &stx, next_off, inode);
8688 cl.lock();
8689 if (r < 0)
8690 return r;
8691
8692 dirp->offset = next_off;
8693 if (r > 0)
8694 return r;
8695 }
8696 if (dirp->offset == 1) {
8697 ldout(cct, 15) << " including .." << dendl;
8698 uint64_t next_off = 2;
8699 InodeRef in;
8700 if (diri->dentries.empty())
8701 in = diri;
8702 else
8703 in = diri->get_first_parent()->dir->parent_inode;
8704
8705 int r;
8706 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
8707 if (r < 0)
8708 return r;
8709
8710 fill_statx(in, caps, &stx);
8711 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8712
8713 Inode *inode = NULL;
8714 if (getref) {
8715 inode = in.get();
8716 _ll_get(inode);
8717 }
8718
8719 cl.unlock();
8720 r = cb(p, &de, &stx, next_off, inode);
8721 cl.lock();
8722 if (r < 0)
8723 return r;
8724
8725 dirp->offset = next_off;
8726 if (r > 0)
8727 return r;
8728 }
8729
8730 // can we read from our cache?
8731 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8732 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8733 << dirp->inode->is_complete_and_ordered()
8734 << " issued " << ccap_string(dirp->inode->caps_issued())
8735 << dendl;
8736 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8737 dirp->inode->is_complete_and_ordered() &&
8738 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8739 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8740 if (err != -CEPHFS_EAGAIN)
8741 return err;
8742 }
8743
8744 while (1) {
8745 if (dirp->at_end())
8746 return 0;
8747
8748 bool check_caps = true;
8749 if (!dirp->is_cached()) {
8750 int r = _readdir_get_frag(dirp);
8751 if (r)
8752 return r;
8753 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8754 // different from the requested one (i.e. our dirfragtree was outdated).
8755 check_caps = false;
8756 }
8757 frag_t fg = dirp->buffer_frag;
8758
8759 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8760 << " offset " << hex << dirp->offset << dendl;
8761
8762 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8763 dirp->offset, dir_result_t::dentry_off_lt());
8764 it != dirp->buffer.end();
8765 ++it) {
8766 dir_result_t::dentry &entry = *it;
8767
8768 uint64_t next_off = entry.offset + 1;
8769
8770 int r;
8771 if (check_caps) {
8772 int mask = caps;
8773 if (entry.inode->is_dir()) {
8774 mask |= CEPH_STAT_RSTAT;
8775 }
8776 r = _getattr(entry.inode, mask, dirp->perms);
8777 if (r < 0)
8778 return r;
8779 }
8780
8781 fill_statx(entry.inode, caps, &stx);
8782 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8783
8784 Inode *inode = NULL;
8785 if (getref) {
8786 inode = entry.inode.get();
8787 _ll_get(inode);
8788 }
8789
8790 cl.unlock();
8791 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8792 cl.lock();
8793
8794 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8795 << " = " << r << dendl;
8796 if (r < 0)
8797 return r;
8798
8799 dirp->offset = next_off;
8800 if (r > 0)
8801 return r;
8802 }
8803
8804 if (dirp->next_offset > 2) {
8805 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8806 _readdir_drop_dirp_buffer(dirp);
8807 continue; // more!
8808 }
8809
8810 if (!fg.is_rightmost()) {
8811 // next frag!
8812 _readdir_next_frag(dirp);
8813 continue;
8814 }
8815
8816 if (diri->shared_gen == dirp->start_shared_gen &&
8817 diri->dir_release_count == dirp->release_count) {
8818 if (diri->dir_ordered_count == dirp->ordered_count) {
8819 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8820 if (diri->dir) {
8821 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8822 diri->dir->readdir_cache.resize(dirp->cache_index);
8823 }
8824 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8825 } else {
8826 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8827 diri->flags |= I_COMPLETE;
8828 }
8829 }
8830
8831 dirp->set_end();
8832 return 0;
8833 }
8834 ceph_abort();
8835 return 0;
8836 }
8837
8838
8839 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8840 {
8841 return readdirplus_r(d, de, 0, 0, 0, NULL);
8842 }
8843
8844 /*
8845 * readdirplus_r
8846 *
8847 * returns
8848 * 1 if we got a dirent
8849 * 0 for end of directory
8850 * <0 on error
8851 */
8852
8853 struct single_readdir {
8854 struct dirent *de;
8855 struct ceph_statx *stx;
8856 Inode *inode;
8857 bool full;
8858 };
8859
8860 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8861 struct ceph_statx *stx, off_t off,
8862 Inode *in)
8863 {
8864 single_readdir *c = static_cast<single_readdir *>(p);
8865
8866 if (c->full)
8867 return -1; // already filled this dirent
8868
8869 *c->de = *de;
8870 if (c->stx)
8871 *c->stx = *stx;
8872 c->inode = in;
8873 c->full = true;
8874 return 1;
8875 }
8876
8877 struct dirent *Client::readdir(dir_result_t *d)
8878 {
8879 int ret;
8880 auto& de = d->de;
8881 single_readdir sr;
8882 sr.de = &de;
8883 sr.stx = NULL;
8884 sr.inode = NULL;
8885 sr.full = false;
8886
8887 // our callback fills the dirent and sets sr.full=true on first
8888 // call, and returns -1 the second time around.
8889 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8890 if (ret < -1) {
8891 errno = -ret; // this sucks.
8892 return (dirent *) NULL;
8893 }
8894 if (sr.full) {
8895 return &de;
8896 }
8897 return (dirent *) NULL;
8898 }
8899
8900 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8901 struct ceph_statx *stx, unsigned want,
8902 unsigned flags, Inode **out)
8903 {
8904 single_readdir sr;
8905 sr.de = de;
8906 sr.stx = stx;
8907 sr.inode = NULL;
8908 sr.full = false;
8909
8910 // our callback fills the dirent and sets sr.full=true on first
8911 // call, and returns -1 the second time around.
8912 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8913 if (r < -1)
8914 return r;
8915 if (out)
8916 *out = sr.inode;
8917 if (sr.full)
8918 return 1;
8919 return 0;
8920 }
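// Illustrative caller loop (sketch): readdirplus_r() returns 1 per entry and
// 0 at end-of-directory, so typical consumption looks like
//
//   struct dirent de;
//   struct ceph_statx stx;
//   while (client->readdirplus_r(d, &de, &stx, CEPH_STATX_SIZE, 0, nullptr) == 1)
//     process(de.d_name, stx.stx_size);  // process() is a hypothetical helper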
8921
8922
8923 /* getdents */
8924 struct getdents_result {
8925 char *buf;
8926 int buflen;
8927 int pos;
8928 bool fullent;
8929 };
8930
8931 static int _readdir_getdent_cb(void *p, struct dirent *de,
8932 struct ceph_statx *stx, off_t off, Inode *in)
8933 {
8934 struct getdents_result *c = static_cast<getdents_result *>(p);
8935
8936 int dlen;
8937 if (c->fullent)
8938 dlen = sizeof(*de);
8939 else
8940 dlen = strlen(de->d_name) + 1;
8941
8942 if (c->pos + dlen > c->buflen)
8943 return -1; // doesn't fit
8944
8945 if (c->fullent) {
8946 memcpy(c->buf + c->pos, de, sizeof(*de));
8947 } else {
8948 memcpy(c->buf + c->pos, de->d_name, dlen);
8949 }
8950 c->pos += dlen;
8951 return 0;
8952 }
8953
8954 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8955 {
8956 getdents_result gr;
8957 gr.buf = buf;
8958 gr.buflen = buflen;
8959 gr.fullent = fullent;
8960 gr.pos = 0;
8961
8962 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8963
8964 if (r < 0) { // some error
8965 if (r == -1) { // buffer ran out of space
8966 if (gr.pos) { // but we got some entries already!
8967 return gr.pos;
8968 } // or we need a larger buffer
8969 return -CEPHFS_ERANGE;
8970 } else { // actual error, return it
8971 return r;
8972 }
8973 }
8974 return gr.pos;
8975 }
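// Illustrative buffer-sizing note (sketch): a too-small buffer yields
// -CEPHFS_ERANGE only when not even one entry fit; otherwise the bytes
// already packed are returned, so callers typically loop:
//
//   char buf[4096];
//   int n;
//   while ((n = client->getdents(dirp, buf, sizeof(buf))) > 0)
//     consume(buf, n);   // consume() is a hypothetical caller helper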
8976
8977
8978 /* getdir */
8979 struct getdir_result {
8980 list<string> *contents;
8981 int num;
8982 };
8983
8984 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8985 {
8986 getdir_result *r = static_cast<getdir_result *>(p);
8987
8988 r->contents->push_back(de->d_name);
8989 r->num++;
8990 return 0;
8991 }
8992
8993 int Client::getdir(const char *relpath, list<string>& contents,
8994 const UserPerm& perms)
8995 {
8996 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8997 tout(cct) << "getdir" << std::endl;
8998 tout(cct) << relpath << std::endl;
8999
9000 dir_result_t *d;
9001 int r = opendir(relpath, &d, perms);
9002 if (r < 0)
9003 return r;
9004
9005 getdir_result gr;
9006 gr.contents = &contents;
9007 gr.num = 0;
9008 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9009
9010 closedir(d);
9011
9012 if (r < 0)
9013 return r;
9014 return gr.num;
9015 }
9016
9017
9018 /****** file i/o **********/
9019 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9020 mode_t mode, int stripe_unit, int stripe_count,
9021 int object_size, const char *data_pool, std::string alternate_name)
9022 {
9023 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9024 if (!mref_reader.is_state_satisfied())
9025 return -CEPHFS_ENOTCONN;
9026
9027 int cflags = ceph_flags_sys2wire(flags);
9028
9029 ldout(cct, 3) << "open enter(" << relpath << ", " << cflags << "," << mode << ")" << dendl;
9030 tout(cct) << "open" << std::endl;
9031 tout(cct) << relpath << std::endl;
9032 tout(cct) << cflags << std::endl;
9033
9034 Fh *fh = NULL;
9035
9036 #if defined(__linux__) && defined(O_PATH)
9037 /* When O_PATH is specified, flags other than O_DIRECTORY
9038 * and O_NOFOLLOW are ignored. Refer to the do_entry_open() function
9039 * in the kernel (fs/open.c). */
9040 if (flags & O_PATH)
9041 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9042 #endif
9043
9044 filepath path(relpath);
9045 InodeRef in;
9046 bool created = false;
9047 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9048 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9049 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9050
9051 std::scoped_lock lock(client_lock);
9052 int r = path_walk(path, &in, perms, followsym, mask);
9053
9054 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9055 return -CEPHFS_EEXIST;
9056
9057 #if defined(__linux__) && defined(O_PATH)
9058 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9059 #else
9060 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9061 #endif
9062 return -CEPHFS_ELOOP;
9063
9064 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9065 filepath dirpath = path;
9066 string dname = dirpath.last_dentry();
9067 dirpath.pop_dentry();
9068 InodeRef dir;
9069 r = path_walk(dirpath, &dir, perms, true,
9070 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
9071 if (r < 0)
9072 goto out;
9073 if (cct->_conf->client_permissions) {
9074 r = may_create(dir.get(), perms);
9075 if (r < 0)
9076 goto out;
9077 }
9078 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9079 stripe_count, object_size, data_pool, &created, perms,
9080 std::move(alternate_name));
9081 }
9082 if (r < 0)
9083 goto out;
9084
9085 if (!created) {
9086 // posix says we can only check permissions of existing files
9087 if (cct->_conf->client_permissions) {
9088 r = may_open(in.get(), flags, perms);
9089 if (r < 0)
9090 goto out;
9091 }
9092 }
9093
9094 if (!fh)
9095 r = _open(in.get(), flags, mode, &fh, perms);
9096 if (r >= 0) {
9097 // allocate an integer file descriptor
9098 ceph_assert(fh);
9099 r = get_fd();
9100 ceph_assert(fd_map.count(r) == 0);
9101 fd_map[r] = fh;
9102 }
9103
9104 out:
9105 tout(cct) << r << std::endl;
9106 ldout(cct, 3) << "open exit(" << path << ", " << cflags << ") = " << r << dendl;
9107 return r;
9108 }
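// Illustrative usage (sketch, assuming a data pool named "cephfs_data"
// exists and the layout parameters are acceptable to the MDS):
//
//   int fd = client->open("/a/b", O_CREAT | O_WRONLY, perms, 0644,
//                         1 << 22 /* stripe_unit */, 1 /* stripe_count */,
//                         1 << 22 /* object_size */, "cephfs_data");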
9109
9110 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9111 const UserPerm& perms)
9112 {
9113 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9114
9115 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9116 if (!mref_reader.is_state_satisfied())
9117 return -CEPHFS_ENOTCONN;
9118
9119 std::scoped_lock lock(client_lock);
9120 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9121 filepath path(ino);
9122 req->set_filepath(path);
9123
9124 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9125 char f[30];
9126 sprintf(f, "%u", h);
9127 filepath path2(dirino);
9128 path2.push_dentry(string(f));
9129 req->set_filepath2(path2);
9130
9131 int r = make_request(req, perms, NULL, NULL,
9132 rand() % mdsmap->get_num_in_mds());
9133 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9134 return r;
9135 }
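// Illustrative example of the path2 built above: for a dentry named "foo"
// under directory #0x100, path2 is "#0x100" plus one component holding the
// decimal RJENKINS hash of "foo", which lets the MDS locate a dentry when
// only its hash is known.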
9136
9137
9138 /**
9139 * Load inode into local cache.
9140 *
9141 * If the inode pointer is non-NULL, also take a reference on
9142 * the resulting Inode object in the same operation, so that the
9143 * caller can safely assume the inode will still be there after return.
9144 */
9145 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9146 {
9147 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9148
9149 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9150 if (!mref_reader.is_state_satisfied())
9151 return -CEPHFS_ENOTCONN;
9152
9153 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9154 filepath path(vino.ino);
9155 req->set_filepath(path);
9156
9157 /*
9158 * The MDS expects either a "real" snapid here or 0. The special value
9159 * carveouts for the snapid are all at the end of the range so we can
9160 * just look for any snapid below this value.
9161 */
9162 if (vino.snapid < CEPH_NOSNAP)
9163 req->head.args.lookupino.snapid = vino.snapid;
9164
9165 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9166 if (r == 0 && inode != NULL) {
9167 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9168 ceph_assert(p != inode_map.end());
9169 *inode = p->second;
9170 _ll_get(*inode);
9171 }
9172 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9173 return r;
9174 }
9175
9176 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9177 {
9178 vinodeno_t vino(ino, CEPH_NOSNAP);
9179 std::scoped_lock lock(client_lock);
9180 return _lookup_vino(vino, perms, inode);
9181 }
9182
9183 /**
9184 * Find the parent inode of `ino` and insert it into
9185 * our cache. Conditionally also set `parent` to a referenced
9186 * Inode* if caller provides non-NULL value.
9187 */
9188 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9189 {
9190 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9191
9192 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9193 filepath path(ino->ino);
9194 req->set_filepath(path);
9195
9196 InodeRef target;
9197 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9198 // Give caller a reference to the parent ino if they provided a pointer.
9199 if (parent != NULL) {
9200 if (r == 0) {
9201 *parent = target.get();
9202 _ll_get(*parent);
9203 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9204 } else {
9205 *parent = NULL;
9206 }
9207 }
9208 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9209 return r;
9210 }
9211
9212 /**
9213 * Populate the parent dentry for `ino`, provided it is
9214 * a child of `parent`.
9215 */
9216 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9217 {
9218 ceph_assert(parent->is_dir());
9219 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9220
9221 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9222 if (!mref_reader.is_state_satisfied())
9223 return -CEPHFS_ENOTCONN;
9224
9225 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9226 req->set_filepath2(filepath(parent->ino));
9227 req->set_filepath(filepath(ino->ino));
9228 req->set_inode(ino);
9229
9230 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9231 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9232 return r;
9233 }
9234
9235 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9236 {
9237 std::scoped_lock lock(client_lock);
9238 return _lookup_name(ino, parent, perms);
9239 }
9240
9241 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9242 {
9243 ceph_assert(in);
9244 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9245
9246 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9247
9248 if (in->snapid != CEPH_NOSNAP) {
9249 in->snap_cap_refs++;
9250 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9251 << ccap_string(in->caps_issued()) << dendl;
9252 }
9253
9254 const auto& conf = cct->_conf;
9255 f->readahead.set_trigger_requests(1);
9256 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9257 uint64_t max_readahead = Readahead::NO_LIMIT;
9258 if (conf->client_readahead_max_bytes) {
9259 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9260 }
9261 if (conf->client_readahead_max_periods) {
9262 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9263 }
9264 f->readahead.set_max_readahead_size(max_readahead);
9265 vector<uint64_t> alignments;
9266 alignments.push_back(in->layout.get_period());
9267 alignments.push_back(in->layout.stripe_unit);
9268 f->readahead.set_alignments(alignments);
9269
9270 return f;
9271 }
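// Worked example for the readahead sizing above (a sketch, assuming the
// default layout of 4 MiB objects with a single stripe, so
// in->layout.get_period() == 4 MiB): with client_readahead_max_bytes = 0
// (unlimited) and client_readahead_max_periods = 4, max_readahead ends up
// min(Readahead::NO_LIMIT, 4 MiB * 4) = 16 MiB, and readahead extents are
// aligned to both the 4 MiB period and the stripe unit.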
9272
9273 int Client::_release_fh(Fh *f)
9274 {
9275 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9276 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9277 Inode *in = f->inode.get();
9278 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9279
9280 in->unset_deleg(f);
9281
9282 if (in->snapid == CEPH_NOSNAP) {
9283 if (in->put_open_ref(f->mode)) {
9284 _flush(in, new C_Client_FlushComplete(this, in));
9285 check_caps(in, 0);
9286 }
9287 } else {
9288 ceph_assert(in->snap_cap_refs > 0);
9289 in->snap_cap_refs--;
9290 }
9291
9292 _release_filelocks(f);
9293
9294 // Finally, read any async err (i.e. from flushes)
9295 int err = f->take_async_err();
9296 if (err != 0) {
9297 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9298 << cpp_strerror(err) << dendl;
9299 } else {
9300 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9301 }
9302
9303 _put_fh(f);
9304
9305 return err;
9306 }
9307
9308 void Client::_put_fh(Fh *f)
9309 {
9310 int left = f->put();
9311 if (!left) {
9312 delete f;
9313 }
9314 }
9315
9316 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9317 const UserPerm& perms)
9318 {
9319 if (in->snapid != CEPH_NOSNAP &&
9320 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9321 return -CEPHFS_EROFS;
9322 }
9323
9324 // use normalized flags to generate cmode
9325 int cflags = ceph_flags_sys2wire(flags);
9326 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9327 cflags |= CEPH_O_LAZY;
9328
9329 int cmode = ceph_flags_to_mode(cflags);
9330 int want = ceph_caps_for_mode(cmode);
9331 int result = 0;
9332
9333 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9334
9335 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9336 // update wanted?
9337 check_caps(in, CHECK_CAPS_NODELAY);
9338 } else {
9339
9340 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9341 filepath path;
9342 in->make_nosnap_relative_path(path);
9343 req->set_filepath(path);
9344 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9345 req->head.args.open.mode = mode;
9346 req->head.args.open.pool = -1;
9347 if (cct->_conf->client_debug_getattr_caps)
9348 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9349 else
9350 req->head.args.open.mask = 0;
9351 req->head.args.open.old_size = in->size; // for O_TRUNC
9352 req->set_inode(in);
9353 result = make_request(req, perms);
9354
9355 /*
9356 * NFS expects that delegations will be broken on a conflicting open,
9357 * not just when there is actual conflicting access to the file. SMB leases
9358 * and oplocks also have similar semantics.
9359 *
9360 * Ensure that clients that have delegations enabled will wait on minimal
9361 * caps during open, just to ensure that other clients holding delegations
9362 * return theirs first.
9363 */
9364 if (deleg_timeout && result == 0) {
9365 int need = 0, have;
9366
9367 if (cmode & CEPH_FILE_MODE_WR)
9368 need |= CEPH_CAP_FILE_WR;
9369 if (cmode & CEPH_FILE_MODE_RD)
9370 need |= CEPH_CAP_FILE_RD;
9371
9372 Fh fh(in, flags, cmode, fd_gen, perms);
9373 result = get_caps(&fh, need, want, &have, -1);
9374 if (result < 0) {
9375 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9376 " . Denying open: " <<
9377 cpp_strerror(result) << dendl;
9378 } else {
9379 put_cap_ref(in, need);
9380 }
9381 }
9382 }
9383
9384 // success?
9385 if (result >= 0) {
9386 if (fhp)
9387 *fhp = _create_fh(in, flags, cmode, perms);
9388 } else {
9389 in->put_open_ref(cmode);
9390 }
9391
9392 trim_cache();
9393
9394 return result;
9395 }
9396
9397 int Client::_renew_caps(Inode *in)
9398 {
9399 int wanted = in->caps_file_wanted();
9400 if (in->is_any_caps() &&
9401 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9402 check_caps(in, CHECK_CAPS_NODELAY);
9403 return 0;
9404 }
9405
9406 int flags = 0;
9407 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9408 flags = O_RDWR;
9409 else if (wanted & CEPH_CAP_FILE_RD)
9410 flags = O_RDONLY;
9411 else if (wanted & CEPH_CAP_FILE_WR)
9412 flags = O_WRONLY;
9413
9414 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9415 filepath path;
9416 in->make_nosnap_relative_path(path);
9417 req->set_filepath(path);
9418 req->head.args.open.flags = flags;
9419 req->head.args.open.pool = -1;
9420 if (cct->_conf->client_debug_getattr_caps)
9421 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9422 else
9423 req->head.args.open.mask = 0;
9424 req->set_inode(in);
9425
9426 // duplicate in case Cap goes away; not sure if that race is a concern?
9427 const UserPerm *pperm = in->get_best_perms();
9428 UserPerm perms;
9429 if (pperm != NULL)
9430 perms = *pperm;
9431 int ret = make_request(req, perms);
9432 return ret;
9433 }
9434
9435 int Client::close(int fd)
9436 {
9437 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9438 if (!mref_reader.is_state_satisfied())
9439 return -CEPHFS_ENOTCONN;
9440
9441 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9442 tout(cct) << "close" << std::endl;
9443 tout(cct) << fd << std::endl;
9444
9445 std::scoped_lock lock(client_lock);
9446 Fh *fh = get_filehandle(fd);
9447 if (!fh)
9448 return -CEPHFS_EBADF;
9449 int err = _release_fh(fh);
9450 fd_map.erase(fd);
9451 put_fd(fd);
9452 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9453 return err;
9454 }
9455
9456
9457 // ------------
9458 // read, write
9459
9460 loff_t Client::lseek(int fd, loff_t offset, int whence)
9461 {
9462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9463 if (!mref_reader.is_state_satisfied())
9464 return -CEPHFS_ENOTCONN;
9465
9466 tout(cct) << "lseek" << std::endl;
9467 tout(cct) << fd << std::endl;
9468 tout(cct) << offset << std::endl;
9469 tout(cct) << whence << std::endl;
9470
9471 std::scoped_lock lock(client_lock);
9472 Fh *f = get_filehandle(fd);
9473 if (!f)
9474 return -CEPHFS_EBADF;
9475 #if defined(__linux__) && defined(O_PATH)
9476 if (f->flags & O_PATH)
9477 return -CEPHFS_EBADF;
9478 #endif
9479 return _lseek(f, offset, whence);
9480 }
9481
9482 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9483 {
9484 Inode *in = f->inode.get();
9485 bool whence_check = false;
9486 loff_t pos = -1;
9487
9488 switch (whence) {
9489 case SEEK_END:
9490 whence_check = true;
9491 break;
9492
9493 #ifdef SEEK_DATA
9494 case SEEK_DATA:
9495 whence_check = true;
9496 break;
9497 #endif
9498
9499 #ifdef SEEK_HOLE
9500 case SEEK_HOLE:
9501 whence_check = true;
9502 break;
9503 #endif
9504 }
9505
9506 if (whence_check) {
9507 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9508 if (r < 0)
9509 return r;
9510 }
9511
9512 switch (whence) {
9513 case SEEK_SET:
9514 pos = offset;
9515 break;
9516
9517 case SEEK_CUR:
9518 pos = f->pos + offset;
9519 break;
9520
9521 case SEEK_END:
9522 pos = in->size + offset;
9523 break;
9524
9525 #ifdef SEEK_DATA
9526 case SEEK_DATA:
9527 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9528 return -CEPHFS_ENXIO;
9529 pos = offset;
9530 break;
9531 #endif
9532
9533 #ifdef SEEK_HOLE
9534 case SEEK_HOLE:
9535 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9536 return -CEPHFS_ENXIO;
9537 pos = in->size;
9538 break;
9539 #endif
9540
9541 default:
9542 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9543 return -CEPHFS_EINVAL;
9544 }
9545
9546 if (pos < 0) {
9547 return -CEPHFS_EINVAL;
9548 } else {
9549 f->pos = pos;
9550 }
9551
9552 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9553 return f->pos;
9554 }
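// Behavioural sketch for the SEEK_DATA/SEEK_HOLE cases above: this client
// does not track holes, so the whole file is treated as data. Assuming a
// hypothetical fd open on a 100-byte file:
//
//   client->lseek(fd, 10, SEEK_DATA);   // == 10   (data starts at offset)
//   client->lseek(fd, 10, SEEK_HOLE);   // == 100  (the "hole" begins at EOF)
//   client->lseek(fd, 200, SEEK_DATA);  // == -CEPHFS_ENXIO (beyond EOF)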
9555
9556
9557 void Client::lock_fh_pos(Fh *f)
9558 {
9559 ldout(cct, 10) << __func__ << " " << f << dendl;
9560
9561 if (f->pos_locked || !f->pos_waiters.empty()) {
9562 ceph::condition_variable cond;
9563 f->pos_waiters.push_back(&cond);
9564 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9565 std::unique_lock l{client_lock, std::adopt_lock};
9566 cond.wait(l, [f, me=&cond] {
9567 return !f->pos_locked && f->pos_waiters.front() == me;
9568 });
9569 l.release();
9570 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9571 ceph_assert(f->pos_waiters.front() == &cond);
9572 f->pos_waiters.pop_front();
9573 }
9574
9575 f->pos_locked = true;
9576 }
9577
9578 void Client::unlock_fh_pos(Fh *f)
9579 {
9580 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9581
9582 ldout(cct, 10) << __func__ << " " << f << dendl;
9583 f->pos_locked = false;
9584 if (!f->pos_waiters.empty()) {
9585 // only wake up the oldest waiter
9586 auto cond = f->pos_waiters.front();
9587 cond->notify_one();
9588 }
9589 }
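// The pair above is a FIFO ticket lock over Fh::pos: a blocked caller queues
// its own condition variable on f->pos_waiters and proceeds only when the
// lock is free and it sits at the head of the queue; unlock_fh_pos wakes
// only that head waiter. Minimal usage sketch (the pattern _read/_write use
// when called with offset < 0):
//
//   lock_fh_pos(f);        // serialize users of the shared file position
//   loff_t off = f->pos;   // ... read or update f->pos ...
//   unlock_fh_pos(f);      // hand off to the oldest waiter, if any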
9590
9591 int Client::uninline_data(Inode *in, Context *onfinish)
9592 {
9593 if (!in->inline_data.length()) {
9594 onfinish->complete(0);
9595 return 0;
9596 }
9597
9598 char oid_buf[32];
9599 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9600 object_t oid = oid_buf;
9601
9602 ObjectOperation create_ops;
9603 create_ops.create(false);
9604
9605 objecter->mutate(oid,
9606 OSDMap::file_to_object_locator(in->layout),
9607 create_ops,
9608 in->snaprealm->get_snap_context(),
9609 ceph::real_clock::now(),
9610 0,
9611 NULL);
9612
9613 bufferlist inline_version_bl;
9614 encode(in->inline_version, inline_version_bl);
9615
9616 ObjectOperation uninline_ops;
9617 uninline_ops.cmpxattr("inline_version",
9618 CEPH_OSD_CMPXATTR_OP_GT,
9619 CEPH_OSD_CMPXATTR_MODE_U64,
9620 inline_version_bl);
9621 bufferlist inline_data = in->inline_data;
9622 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9623 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9624
9625 objecter->mutate(oid,
9626 OSDMap::file_to_object_locator(in->layout),
9627 uninline_ops,
9628 in->snaprealm->get_snap_context(),
9629 ceph::real_clock::now(),
9630 0,
9631 onfinish);
9632
9633 return 0;
9634 }
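// uninline_data migrates inline file data out to the backing object in two
// mutations: first an idempotent create of object "<ino>.00000000", then a
// write fenced by cmpxattr("inline_version", GT) so that only one racing
// uninliner can land its copy; the losers fail the guard, and callers treat
// -CEPHFS_ECANCELED from onfinish as success (see the onuninline handling
// in _read and _write). The fenced mutation, in the order built above:
//
//   ObjectOperation op;
//   op.cmpxattr("inline_version", CEPH_OSD_CMPXATTR_OP_GT,
//               CEPH_OSD_CMPXATTR_MODE_U64, inline_version_bl); // fence
//   op.write(0, inline_data, truncate_size, truncate_seq);      // payload
//   op.setxattr("inline_version", stringify(inline_version));   // record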
9635
9636 // ------------
9637 // blocking osd interface
9639
9640 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9641 {
9642 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9643 if (!mref_reader.is_state_satisfied())
9644 return -CEPHFS_ENOTCONN;
9645
9646 tout(cct) << "read" << std::endl;
9647 tout(cct) << fd << std::endl;
9648 tout(cct) << size << std::endl;
9649 tout(cct) << offset << std::endl;
9650
9651 std::unique_lock lock(client_lock);
9652 Fh *f = get_filehandle(fd);
9653 if (!f)
9654 return -CEPHFS_EBADF;
9655 #if defined(__linux__) && defined(O_PATH)
9656 if (f->flags & O_PATH)
9657 return -CEPHFS_EBADF;
9658 #endif
9659 bufferlist bl;
9660 /* We can't return more bytes read than INT_MAX, clamp size to that */
9661 size = std::min(size, (loff_t)INT_MAX);
9662 int r = _read(f, offset, size, &bl);
9663 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9664 if (r >= 0) {
9665 lock.unlock();
9666 bl.begin().copy(bl.length(), buf);
9667 r = bl.length();
9668 }
9669 return r;
9670 }
9671
9672 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9673 {
9674 if (iovcnt < 0)
9675 return -CEPHFS_EINVAL;
9676 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9677 }
9678
9679 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9680 {
9681 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9682
9683 int want, have = 0;
9684 bool movepos = false;
9685 std::unique_ptr<C_SaferCond> onuninline;
9686 int64_t rc = 0;
9687 const auto& conf = cct->_conf;
9688 Inode *in = f->inode.get();
9689 utime_t lat;
9690 utime_t start = ceph_clock_now();
9691
9692 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9693 return -CEPHFS_EBADF;
9694 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9695
9696 if (offset < 0) {
9697 lock_fh_pos(f);
9698 offset = f->pos;
9699 movepos = true;
9700 }
9701 loff_t start_pos = offset;
9702
9703 if (in->inline_version == 0) {
9704 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9705 if (r < 0) {
9706 rc = r;
9707 goto done;
9708 }
9709 ceph_assert(in->inline_version > 0);
9710 }
9711
9712 retry:
9713 if (f->mode & CEPH_FILE_MODE_LAZY)
9714 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9715 else
9716 want = CEPH_CAP_FILE_CACHE;
9717 {
9718 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
9719 if (r < 0) {
9720 rc = r;
9721 goto done;
9722 }
9723 }
9724 if (f->flags & O_DIRECT)
9725 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9726
9727 if (in->inline_version < CEPH_INLINE_NONE) {
9728 if (!(have & CEPH_CAP_FILE_CACHE)) {
9729 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9730 uninline_data(in, onuninline.get());
9731 } else {
9732 uint32_t len = in->inline_data.length();
9733 uint64_t endoff = offset + size;
9734 if (endoff > in->size)
9735 endoff = in->size;
9736
9737 if (offset < len) {
9738 if (endoff <= len) {
9739 bl->substr_of(in->inline_data, offset, endoff - offset);
9740 } else {
9741 bl->substr_of(in->inline_data, offset, len - offset);
9742 bl->append_zero(endoff - len);
9743 }
9744 rc = endoff - offset;
9745 } else if ((uint64_t)offset < endoff) {
9746 bl->append_zero(endoff - offset);
9747 rc = endoff - offset;
9748 } else {
9749 rc = 0;
9750 }
9751 goto success;
9752 }
9753 }
9754
9755 if (!conf->client_debug_force_sync_read &&
9756 conf->client_oc &&
9757 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9758
9759 if (f->flags & O_RSYNC) {
9760 _flush_range(in, offset, size);
9761 }
9762 rc = _read_async(f, offset, size, bl);
9763 if (rc < 0)
9764 goto done;
9765 } else {
9766 if (f->flags & O_DIRECT)
9767 _flush_range(in, offset, size);
9768
9769 bool checkeof = false;
9770 rc = _read_sync(f, offset, size, bl, &checkeof);
9771 if (rc < 0)
9772 goto done;
9773 if (checkeof) {
9774 offset += rc;
9775 size -= rc;
9776
9777 put_cap_ref(in, CEPH_CAP_FILE_RD);
9778 have = 0;
9779 // reverify size
9780 {
9781 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9782 if (r < 0) {
9783 rc = r;
9784 goto done;
9785 }
9786 }
9787
9788 // eof? short read.
9789 if ((uint64_t)offset < in->size)
9790 goto retry;
9791 }
9792 }
9793
9794 success:
9795 ceph_assert(rc >= 0);
9796 if (movepos) {
9797 // adjust fd pos
9798 f->pos = start_pos + rc;
9799 }
9800
9801 lat = ceph_clock_now();
9802 lat -= start;
9803 logger->tinc(l_c_read, lat);
9804
9805 done:
9806 // done!
9807
9808 if (onuninline) {
9809 client_lock.unlock();
9810 int ret = onuninline->wait();
9811 client_lock.lock();
9812 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
9813 in->inline_data.clear();
9814 in->inline_version = CEPH_INLINE_NONE;
9815 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9816 check_caps(in, 0);
9817 } else
9818 rc = ret;
9819 }
9820 if (have) {
9821 put_cap_ref(in, CEPH_CAP_FILE_RD);
9822 }
9823 if (movepos) {
9824 unlock_fh_pos(f);
9825 }
9826 return rc;
9827 }
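// Path selection in _read above: with client_oc enabled and FILE_CACHE (or
// LAZYIO) caps held, reads go through the object cacher via _read_async
// (with readahead); otherwise they go synchronously to the OSDs via
// _read_sync. On a short sync read past the locally known size, the cap ref
// is dropped, the size is re-fetched with _getattr, and the read retries, so
// reads racing with remote appends see the new bytes instead of a spurious
// short read. Hypothetical internal caller:
//
//   bufferlist bl;
//   int64_t n = _read(f, -1 /* use and advance f->pos */, 4096, &bl);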
9828
9829 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9830 client(c), f(f) {
9831 f->get();
9832 f->readahead.inc_pending();
9833 }
9834
9835 Client::C_Readahead::~C_Readahead() {
9836 f->readahead.dec_pending();
9837 client->_put_fh(f);
9838 }
9839
9840 void Client::C_Readahead::finish(int r) {
9841 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9842 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9843 }
9844
9845 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9846 {
9847 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9848
9849 const auto& conf = cct->_conf;
9850 Inode *in = f->inode.get();
9851
9852 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9853
9854 // trim read based on file size?
9855 if (off >= in->size)
9856 return 0;
9857 if (len == 0)
9858 return 0;
9859 if (off + len > in->size) {
9860 len = in->size - off;
9861 }
9862
9863 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9864 << " max_bytes=" << f->readahead.get_max_readahead_size()
9865 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9866
9867 // read (and possibly block)
9868 int r = 0;
9869 C_SaferCond onfinish("Client::_read_async flock");
9870 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9871 off, len, bl, 0, &onfinish);
9872 if (r == 0) {
9873 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9874 client_lock.unlock();
9875 r = onfinish.wait();
9876 client_lock.lock();
9877 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9878 }
9879
9880 if (f->readahead.get_min_readahead_size() > 0) {
9881 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9882 if (readahead_extent.second > 0) {
9883 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9884 << " (caller wants " << off << "~" << len << ")" << dendl;
9885 Context *onfinish2 = new C_Readahead(this, f);
9886 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9887 readahead_extent.first, readahead_extent.second,
9888 NULL, 0, onfinish2);
9889 if (r2 == 0) {
9890 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9891 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9892 } else {
9893 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9894 delete onfinish2;
9895 }
9896 }
9897 }
9898
9899 return r;
9900 }
9901
9902 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9903 bool *checkeof)
9904 {
9905 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9906
9907 Inode *in = f->inode.get();
9908 uint64_t pos = off;
9909 int left = len;
9910 int read = 0;
9911
9912 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9913
9914 // Returns 0 on success, 1 to continue the read loop, and < 0 on error.
9915 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
9916 int r = onfinish.wait();
9917
9918 // if we get ENOENT from OSD, assume 0 bytes returned
9919 if (r == -CEPHFS_ENOENT)
9920 r = 0;
9921 if (r < 0)
9922 return r;
9923
9924 if (tbl.length()) {
9925 r = tbl.length();
9926
9927 read += r;
9928 pos += r;
9929 left -= r;
9930 bl->claim_append(tbl);
9931 }
9932 // short read?
9933 if (r >= 0 && r < wanted) {
9934 if (pos < in->size) {
9935 // zero up to known EOF
9936 int64_t some = in->size - pos;
9937 if (some > left)
9938 some = left;
9939 auto z = buffer::ptr_node::create(some);
9940 z->zero();
9941 bl->push_back(std::move(z));
9942 read += some;
9943 pos += some;
9944 left -= some;
9945 if (left == 0)
9946 return 0;
9947 }
9948
9949 *checkeof = true;
9950 return 0;
9951 }
9952 return 1;
9953 };
9954
9955 while (left > 0) {
9956 C_SaferCond onfinish("Client::_read_sync flock");
9957 bufferlist tbl;
9958
9959 int wanted = left;
9960 filer->read_trunc(in->ino, &in->layout, in->snapid,
9961 pos, left, &tbl, 0,
9962 in->truncate_size, in->truncate_seq,
9963 &onfinish);
9964 client_lock.unlock();
9965 int r = wait_and_copy(onfinish, tbl, wanted);
9966 client_lock.lock();
9967 if (!r)
9968 return read;
9969 if (r < 0)
9970 return r;
9971 }
9972 return read;
9973 }
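// Short-read arithmetic for the helper above (a sketch): a request of 8 KiB
// at pos 0 that gets only 2 KiB back from the OSD, while in->size says
// 6 KiB, zero-fills the 4 KiB up to the known EOF and returns 6 KiB read;
// because 2 KiB of the request still extends past that EOF, *checkeof is
// set so the caller (_read) can re-verify the size with the MDS and retry.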
9974
9975 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9976 {
9977 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9978 if (!mref_reader.is_state_satisfied())
9979 return -CEPHFS_ENOTCONN;
9980
9981 tout(cct) << "write" << std::endl;
9982 tout(cct) << fd << std::endl;
9983 tout(cct) << size << std::endl;
9984 tout(cct) << offset << std::endl;
9985
9986 std::scoped_lock lock(client_lock);
9987 Fh *fh = get_filehandle(fd);
9988 if (!fh)
9989 return -CEPHFS_EBADF;
9990 #if defined(__linux__) && defined(O_PATH)
9991 if (fh->flags & O_PATH)
9992 return -CEPHFS_EBADF;
9993 #endif
9994 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9995 size = std::min(size, (loff_t)INT_MAX);
9996 int r = _write(fh, offset, size, buf, NULL, false);
9997 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9998 return r;
9999 }
10000
10001 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10002 {
10003 if (iovcnt < 0)
10004 return -CEPHFS_EINVAL;
10005 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10006 }
10007
10008 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10009 unsigned iovcnt, int64_t offset, bool write,
10010 bool clamp_to_int, std::unique_lock<ceph::mutex> &cl)
10011 {
10012 #if defined(__linux__) && defined(O_PATH)
10013 if (fh->flags & O_PATH)
10014 return -CEPHFS_EBADF;
10015 #endif
10016 loff_t totallen = 0;
10017 for (unsigned i = 0; i < iovcnt; i++) {
10018 totallen += iov[i].iov_len;
10019 }
10020
10021 /*
10022 * Some of the API functions take 64-bit size values, but only return
10023 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10024 * we don't do I/Os larger than the values we can return.
10025 */
10026 if (clamp_to_int) {
10027 totallen = std::min(totallen, (loff_t)INT_MAX);
10028 }
10029 if (write) {
10030 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10031 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10032 return w;
10033 } else {
10034 bufferlist bl;
10035 int64_t r = _read(fh, offset, totallen, &bl);
10036 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10037 if (r <= 0)
10038 return r;
10039
10040 cl.unlock();
10041 auto iter = bl.cbegin();
10042 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10043 /*
10044 * This piece of code aims to handle the case that bufferlist
10045 * does not have enough data to fill in the iov
10046 */
10047 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10048 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10049 resid -= round_size;
10050 /* iter is self-updating */
10051 }
10052 cl.lock();
10053 return r;
10054 }
10055 }
10056
10057 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10058 {
10059 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10060 if (!mref_reader.is_state_satisfied())
10061 return -CEPHFS_ENOTCONN;
10062
10063 tout(cct) << fd << std::endl;
10064 tout(cct) << offset << std::endl;
10065
10066 std::unique_lock cl(client_lock);
10067 Fh *fh = get_filehandle(fd);
10068 if (!fh)
10069 return -CEPHFS_EBADF;
10070 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true, cl);
10071 }
10072
10073 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10074 const struct iovec *iov, int iovcnt)
10075 {
10076 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10077
10078 uint64_t fpos = 0;
10079
10080 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
10081 return -CEPHFS_EFBIG;
10082
10083 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10084 Inode *in = f->inode.get();
10085
10086 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10087 return -CEPHFS_ENOSPC;
10088 }
10089
10090 ceph_assert(in->snapid == CEPH_NOSNAP);
10091
10092 // was Fh opened as writeable?
10093 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10094 return -CEPHFS_EBADF;
10095
10096 // use/adjust fd pos?
10097 if (offset < 0) {
10098 lock_fh_pos(f);
10099 /*
10100 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10101 * change out from under us.
10102 */
10103 if (f->flags & O_APPEND) {
10104 auto r = _lseek(f, 0, SEEK_END);
10105 if (r < 0) {
10106 unlock_fh_pos(f);
10107 return r;
10108 }
10109 }
10110 offset = f->pos;
10111 fpos = offset+size;
10112 unlock_fh_pos(f);
10113 }
10114
10115 // check quota
10116 uint64_t endoff = offset + size;
10117 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10118 f->actor_perms)) {
10119 return -CEPHFS_EDQUOT;
10120 }
10121
10122 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10123
10124 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10125
10126 // time it.
10127 utime_t start = ceph_clock_now();
10128
10129 if (in->inline_version == 0) {
10130 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10131 if (r < 0)
10132 return r;
10133 ceph_assert(in->inline_version > 0);
10134 }
10135
10136 // copy into a fresh buffer (since our write may be resubmitted, or complete asynchronously)
10137 bufferlist bl;
10138 if (buf) {
10139 if (size > 0)
10140 bl.append(buf, size);
10141 } else if (iov){
10142 for (int i = 0; i < iovcnt; i++) {
10143 if (iov[i].iov_len > 0) {
10144 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10145 }
10146 }
10147 }
10148
10149 utime_t lat;
10150 uint64_t totalwritten;
10151 int want, have;
10152 if (f->mode & CEPH_FILE_MODE_LAZY)
10153 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10154 else
10155 want = CEPH_CAP_FILE_BUFFER;
10156 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10157 if (r < 0)
10158 return r;
10159
10160 /* clear the setuid/setgid bits, if any */
10161 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10162 struct ceph_statx stx = { 0 };
10163
10164 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10165 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10166 if (r < 0)
10167 return r;
10168 } else {
10169 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10170 }
10171
10172 if (f->flags & O_DIRECT)
10173 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10174
10175 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10176
10177 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10178
10179 if (in->inline_version < CEPH_INLINE_NONE) {
10180 if (endoff > cct->_conf->client_max_inline_size ||
10181 endoff > CEPH_INLINE_MAX_SIZE ||
10182 !(have & CEPH_CAP_FILE_BUFFER)) {
10183 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10184 uninline_data(in, onuninline.get());
10185 } else {
10186 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10187
10188 uint32_t len = in->inline_data.length();
10189
10190 if (endoff < len)
10191 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10192
10193 if (offset < len)
10194 in->inline_data.splice(offset, len - offset);
10195 else if (offset > len)
10196 in->inline_data.append_zero(offset - len);
10197
10198 in->inline_data.append(bl);
10199 in->inline_version++;
10200
10201 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10202
10203 goto success;
10204 }
10205 }
10206
10207 if (cct->_conf->client_oc &&
10208 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10209 // do buffered write
10210 if (!in->oset.dirty_or_tx)
10211 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10212
10213 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10214
10215 // async, caching, non-blocking.
10216 r = objectcacher->file_write(&in->oset, &in->layout,
10217 in->snaprealm->get_snap_context(),
10218 offset, size, bl, ceph::real_clock::now(),
10219 0);
10220 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10221
10222 if (r < 0)
10223 goto done;
10224
10225 // flush cached write if O_SYNC is set on file fh
10226 // O_DSYNC == O_SYNC on linux < 2.6.33
10227 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10228 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10229 _flush_range(in, offset, size);
10230 }
10231 } else {
10232 if (f->flags & O_DIRECT)
10233 _flush_range(in, offset, size);
10234
10235 // simple, non-atomic sync write
10236 C_SaferCond onfinish("Client::_write flock");
10237 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10238
10239 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10240 offset, size, bl, ceph::real_clock::now(), 0,
10241 in->truncate_size, in->truncate_seq,
10242 &onfinish);
10243 client_lock.unlock();
10244 r = onfinish.wait();
10245 client_lock.lock();
10246 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10247 if (r < 0)
10248 goto done;
10249 }
10250
10251 // if we get here, write was successful, update client metadata
10252 success:
10253 // time
10254 lat = ceph_clock_now();
10255 lat -= start;
10256 logger->tinc(l_c_wrlat, lat);
10257
10258 if (fpos) {
10259 lock_fh_pos(f);
10260 f->pos = fpos;
10261 unlock_fh_pos(f);
10262 }
10263 totalwritten = size;
10264 r = (int64_t)totalwritten;
10265
10266 // extend file?
10267 if (totalwritten + offset > in->size) {
10268 in->size = totalwritten + offset;
10269 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10270
10271 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10272 check_caps(in, CHECK_CAPS_NODELAY);
10273 } else if (is_max_size_approaching(in)) {
10274 check_caps(in, 0);
10275 }
10276
10277 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10278 } else {
10279 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10280 }
10281
10282 // mtime
10283 in->mtime = in->ctime = ceph_clock_now();
10284 in->change_attr++;
10285 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10286
10287 done:
10288
10289 if (nullptr != onuninline) {
10290 client_lock.unlock();
10291 int uninline_ret = onuninline->wait();
10292 client_lock.lock();
10293
10294 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10295 in->inline_data.clear();
10296 in->inline_version = CEPH_INLINE_NONE;
10297 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10298 check_caps(in, 0);
10299 } else
10300 r = uninline_ret;
10301 }
10302
10303 put_cap_ref(in, CEPH_CAP_FILE_WR);
10304 return r;
10305 }
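// Path selection in _write above mirrors the read side: with client_oc and
// FILE_BUFFER (or LAZYIO) caps the data is buffered in the object cacher and
// flushed immediately only for O_SYNC/O_DSYNC handles; otherwise it is a
// synchronous Filer::write_trunc to the OSDs. Either way size/mtime/ctime
// are updated locally and CEPH_CAP_FILE_WR is marked dirty rather than
// round-tripping to the MDS. Hypothetical internal caller:
//
//   int64_t n = _write(fh, -1 /* use fh->pos; seeks to EOF first under
//                                O_APPEND */, len, buf, nullptr, 0);
//   if (n < 0) { /* e.g. -CEPHFS_EDQUOT when a quota would be exceeded */ }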
10306
10307 int Client::_flush(Fh *f)
10308 {
10309 Inode *in = f->inode.get();
10310 int err = f->take_async_err();
10311 if (err != 0) {
10312 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10313 << cpp_strerror(err) << dendl;
10314 } else {
10315 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10316 }
10317
10318 return err;
10319 }
10320
10321 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10322 {
10323 struct ceph_statx stx;
10324 stx.stx_size = length;
10325 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10326 }
10327
10328 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10329 {
10330 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10331 if (!mref_reader.is_state_satisfied())
10332 return -CEPHFS_ENOTCONN;
10333
10334 tout(cct) << __func__ << std::endl;
10335 tout(cct) << fd << std::endl;
10336 tout(cct) << length << std::endl;
10337
10338 std::scoped_lock lock(client_lock);
10339 Fh *f = get_filehandle(fd);
10340 if (!f)
10341 return -CEPHFS_EBADF;
10342 #if defined(__linux__) && defined(O_PATH)
10343 if (f->flags & O_PATH)
10344 return -CEPHFS_EBADF;
10345 #endif
10346 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10347 return -CEPHFS_EBADF;
10348 struct stat attr;
10349 attr.st_size = length;
10350 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10351 }
10352
10353 int Client::fsync(int fd, bool syncdataonly)
10354 {
10355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10356 if (!mref_reader.is_state_satisfied())
10357 return -CEPHFS_ENOTCONN;
10358
10359 tout(cct) << "fsync" << std::endl;
10360 tout(cct) << fd << std::endl;
10361 tout(cct) << syncdataonly << std::endl;
10362
10363 std::scoped_lock lock(client_lock);
10364 Fh *f = get_filehandle(fd);
10365 if (!f)
10366 return -CEPHFS_EBADF;
10367 #if defined(__linux__) && defined(O_PATH)
10368 if (f->flags & O_PATH)
10369 return -CEPHFS_EBADF;
10370 #endif
10371 int r = _fsync(f, syncdataonly);
10372 if (r == 0) {
10373 // The IOs in this fsync were okay, but maybe something happened
10374 // in the background that we should be reporting?
10375 r = f->take_async_err();
10376 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10377 << ") = 0, async_err = " << r << dendl;
10378 } else {
10379 // Assume that an error we encountered during fsync, even reported
10380 // synchronously, would also have applied the error to the Fh, and we
10381 // should clear it here to avoid returning the same error again on next
10382 // call.
10383 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10384 << r << dendl;
10385 f->take_async_err();
10386 }
10387 return r;
10388 }
10389
10390 int Client::_fsync(Inode *in, bool syncdataonly)
10391 {
10392 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10393
10394 int r = 0;
10395 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10396 ceph_tid_t flush_tid = 0;
10397 InodeRef tmp_ref;
10398 utime_t lat;
10399 utime_t start = ceph_clock_now();
10400
10401 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10402
10403 if (cct->_conf->client_oc) {
10404 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10405 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10406 _flush(in, object_cacher_completion.get());
10407 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10408 }
10409
10410 if (!syncdataonly && in->dirty_caps) {
10411 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10412 if (in->flushing_caps)
10413 flush_tid = last_flush_tid;
10414 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10415
10416 if (!syncdataonly && !in->unsafe_ops.empty()) {
10417 flush_mdlog_sync();
10418
10419 MetaRequest *req = in->unsafe_ops.back();
10420 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10421
10422 req->get();
10423 wait_on_list(req->waitfor_safe);
10424 put_request(req);
10425 }
10426
10427 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10428 client_lock.unlock();
10429 ldout(cct, 15) << "waiting on data to flush" << dendl;
10430 r = object_cacher_completion->wait();
10431 client_lock.lock();
10432 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10433 } else {
10434 // FIXME: this can starve
10435 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10436 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10437 << " uncommitted, waiting" << dendl;
10438 wait_on_list(in->waitfor_commit);
10439 }
10440 }
10441
10442 if (!r) {
10443 if (flush_tid > 0)
10444 wait_sync_caps(in, flush_tid);
10445
10446 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10447 } else {
10448 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10449 << cpp_strerror(-r) << dendl;
10450 }
10451
10452 lat = ceph_clock_now();
10453 lat -= start;
10454 logger->tinc(l_c_fsync, lat);
10455
10456 return r;
10457 }
10458
10459 int Client::_fsync(Fh *f, bool syncdataonly)
10460 {
10461 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10462 return _fsync(f->inode.get(), syncdataonly);
10463 }
10464
10465 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10466 {
10467 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10468 if (!mref_reader.is_state_satisfied())
10469 return -CEPHFS_ENOTCONN;
10470
10471 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10472 tout(cct) << fd << std::endl;
10473
10474 std::scoped_lock lock(client_lock);
10475 Fh *f = get_filehandle(fd);
10476 if (!f)
10477 return -CEPHFS_EBADF;
10478 int r = _getattr(f->inode, mask, perms);
10479 if (r < 0)
10480 return r;
10481 fill_stat(f->inode, stbuf, NULL);
10482 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10483 return r;
10484 }
10485
10486 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10487 unsigned int want, unsigned int flags)
10488 {
10489 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10490 if (!mref_reader.is_state_satisfied())
10491 return -CEPHFS_ENOTCONN;
10492
10493 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10494 tout(cct) << fd << std::endl;
10495
10496 std::scoped_lock lock(client_lock);
10497 Fh *f = get_filehandle(fd);
10498 if (!f)
10499 return -CEPHFS_EBADF;
10500
10501 unsigned mask = statx_to_mask(flags, want);
10502
10503 int r = 0;
10504 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10505 r = _getattr(f->inode, mask, perms);
10506 if (r < 0) {
10507 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10508 return r;
10509 }
10510 }
10511
10512 fill_statx(f->inode, mask, stx);
10513 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10514 return r;
10515 }
10516
10517 // not written yet, but I want to link!
10518
10519 int Client::chdir(const char *relpath, std::string &new_cwd,
10520 const UserPerm& perms)
10521 {
10522 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10523 if (!mref_reader.is_state_satisfied())
10524 return -CEPHFS_ENOTCONN;
10525
10526 tout(cct) << "chdir" << std::endl;
10527 tout(cct) << relpath << std::endl;
10528
10529 filepath path(relpath);
10530 InodeRef in;
10531
10532 std::scoped_lock lock(client_lock);
10533 int r = path_walk(path, &in, perms);
10534 if (r < 0)
10535 return r;
10536
10537 if (!(in.get()->is_dir()))
10538 return -CEPHFS_ENOTDIR;
10539
10540 if (cwd != in)
10541 cwd.swap(in);
10542 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10543
10544 _getcwd(new_cwd, perms);
10545 return 0;
10546 }
10547
10548 void Client::_getcwd(string& dir, const UserPerm& perms)
10549 {
10550 filepath path;
10551 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10552
10553 Inode *in = cwd.get();
10554 while (in != root) {
10555 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10556
10557 // The cwd or an ancestor is unlinked
10558 if (in->dentries.empty()) {
10559 return;
10560 }
10561
10562 Dentry *dn = in->get_first_parent();
10563
10564
10565 if (!dn) {
10566 // look it up
10567 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10568 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10569 filepath path(in->ino);
10570 req->set_filepath(path);
10571 req->set_inode(in);
10572 int res = make_request(req, perms);
10573 if (res < 0)
10574 break;
10575
10576 // start over
10577 path = filepath();
10578 in = cwd.get();
10579 continue;
10580 }
10581 path.push_front_dentry(dn->name);
10582 in = dn->dir->parent_inode;
10583 }
10584 dir = "/";
10585 dir += path.get_path();
10586 }
10587
10588 void Client::getcwd(string& dir, const UserPerm& perms)
10589 {
10590 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10591 if (!mref_reader.is_state_satisfied())
10592 return;
10593
10594 std::scoped_lock l(client_lock);
10595
10596 _getcwd(dir, perms);
10597 }
10598
10599 int Client::statfs(const char *path, struct statvfs *stbuf,
10600 const UserPerm& perms)
10601 {
10602 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10603 if (!mref_reader.is_state_satisfied())
10604 return -CEPHFS_ENOTCONN;
10605
10606 tout(cct) << __func__ << std::endl;
10607 unsigned long int total_files_on_fs;
10608
10609 ceph_statfs stats;
10610 C_SaferCond cond;
10611
10612 std::unique_lock lock(client_lock);
10613 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10614 if (data_pools.size() == 1) {
10615 objecter->get_fs_stats(stats, data_pools[0], &cond);
10616 } else {
10617 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10618 }
10619
10620 lock.unlock();
10621 int rval = cond.wait();
10622 lock.lock();
10623
10624 ceph_assert(root);
10625 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10626
10627 if (rval < 0) {
10628 ldout(cct, 1) << "underlying call to statfs returned error: "
10629 << cpp_strerror(rval)
10630 << dendl;
10631 return rval;
10632 }
10633
10634 memset(stbuf, 0, sizeof(*stbuf));
10635
10636 /*
10637 * we're going to set a block size of 4MB so we can represent larger
10638 * FSes without overflowing. Additionally convert the space
10639 * measurements from KB to bytes while making them in terms of
10640 * blocks. We use 4MB only because it is big enough, and because it
10641 * actually *is* the (ceph) default block size.
10642 */
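// Worked conversion: stats.kb is reported in KiB, and one 4 MiB block is
// 2^22 bytes == 2^12 KiB, so ">> (CEPH_BLOCK_SHIFT - 10)" == ">> 12" turns
// KiB directly into blocks. E.g. stats.kb = 8388608 KiB (8 GiB) >> 12
// == 2048 four-megabyte blocks.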
10643 const int CEPH_BLOCK_SHIFT = 22;
10644 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10645 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10646 stbuf->f_files = total_files_on_fs;
10647 stbuf->f_ffree = -1;
10648 stbuf->f_favail = -1;
10649 stbuf->f_fsid = -1; // ??
10650 stbuf->f_flag = 0; // ??
10651 stbuf->f_namemax = NAME_MAX;
10652
10653 // Usually quota_root will == root_ancestor, but if the mount root has no
10654 // quota but we can see a parent of it that does have a quota, we'll
10655 // respect that one instead.
10656 ceph_assert(root != nullptr);
10657 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10658
10659 // get_quota_root should always give us something
10660 // because client quotas are always enabled
10661 ceph_assert(quota_root != nullptr);
10662
10663 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10664
10665 // Skip the getattr if any sessions are stale, as we don't want to
10666 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10667 // is unhealthy.
10668 if (!_any_stale_sessions()) {
10669 int r = _getattr(quota_root, 0, perms, true);
10670 if (r != 0) {
10671 // Ignore return value: error getting latest inode metadata is not a good
10672 // reason to break "df".
10673 lderr(cct) << "Error in getattr on quota root 0x"
10674 << std::hex << quota_root->ino << std::dec
10675 << " statfs result may be outdated" << dendl;
10676 }
10677 }
10678
10679 // Special case: if there is a size quota set on the Inode acting
10680 // as the root for this client mount, then report the quota status
10681 // as the filesystem statistics.
10682 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10683 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10684 // It is possible for a quota to be exceeded: arithmetic here must
10685 // handle case where used > total.
10686 const fsblkcnt_t free = total > used ? total - used : 0;
10687
10688 stbuf->f_blocks = total;
10689 stbuf->f_bfree = free;
10690 stbuf->f_bavail = free;
10691 } else {
10692 // General case: report the cluster statistics returned from RADOS. Because
10693 // multiple pools may be used within one filesystem namespace via
10694 // layouts, this is the most correct thing we can do.
10695 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10696 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10697 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10698 }
10699
10700 return rval;
10701 }
10702
10703 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10704 struct flock *fl, uint64_t owner, bool removing)
10705 {
10706 ldout(cct, 10) << __func__ << " ino " << in->ino
10707 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10708 << " type " << fl->l_type << " owner " << owner
10709 << " " << fl->l_start << "~" << fl->l_len << dendl;
10710
10711 if (in->flags & I_ERROR_FILELOCK)
10712 return -CEPHFS_EIO;
10713
10714 int lock_cmd;
10715 if (F_RDLCK == fl->l_type)
10716 lock_cmd = CEPH_LOCK_SHARED;
10717 else if (F_WRLCK == fl->l_type)
10718 lock_cmd = CEPH_LOCK_EXCL;
10719 else if (F_UNLCK == fl->l_type)
10720 lock_cmd = CEPH_LOCK_UNLOCK;
10721 else
10722 return -CEPHFS_EIO;
10723
10724 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10725 sleep = 0;
10726
10727 /*
10728 * Set the most significant bit, so that the MDS knows the 'owner'
10729 * alone is sufficient to identify the owner of the lock. (old code
10730 * uses both 'owner' and 'pid')
10731 */
10732 owner |= (1ULL << 63);
10733
10734 MetaRequest *req = new MetaRequest(op);
10735 filepath path;
10736 in->make_nosnap_relative_path(path);
10737 req->set_filepath(path);
10738 req->set_inode(in);
10739
10740 req->head.args.filelock_change.rule = lock_type;
10741 req->head.args.filelock_change.type = lock_cmd;
10742 req->head.args.filelock_change.owner = owner;
10743 req->head.args.filelock_change.pid = fl->l_pid;
10744 req->head.args.filelock_change.start = fl->l_start;
10745 req->head.args.filelock_change.length = fl->l_len;
10746 req->head.args.filelock_change.wait = sleep;
10747
10748 int ret;
10749 bufferlist bl;
10750
10751 if (sleep && switch_interrupt_cb) {
10752 // enable interrupt
10753 switch_interrupt_cb(callback_handle, req->get());
10754 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10755 // disable interrupt
10756 switch_interrupt_cb(callback_handle, NULL);
10757 if (ret == 0 && req->aborted()) {
10758 // effect of this lock request has been revoked by the 'lock intr' request
10759 ret = req->get_abort_code();
10760 }
10761 put_request(req);
10762 } else {
10763 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10764 }
10765
10766 if (ret == 0) {
10767 if (op == CEPH_MDS_OP_GETFILELOCK) {
10768 ceph_filelock filelock;
10769 auto p = bl.cbegin();
10770 decode(filelock, p);
10771
10772 if (CEPH_LOCK_SHARED == filelock.type)
10773 fl->l_type = F_RDLCK;
10774 else if (CEPH_LOCK_EXCL == filelock.type)
10775 fl->l_type = F_WRLCK;
10776 else
10777 fl->l_type = F_UNLCK;
10778
10779 fl->l_whence = SEEK_SET;
10780 fl->l_start = filelock.start;
10781 fl->l_len = filelock.length;
10782 fl->l_pid = filelock.pid;
10783 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10784 ceph_lock_state_t *lock_state;
10785 if (lock_type == CEPH_LOCK_FCNTL) {
10786 if (!in->fcntl_locks)
10787 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10788 lock_state = in->fcntl_locks.get();
10789 } else if (lock_type == CEPH_LOCK_FLOCK) {
10790 if (!in->flock_locks)
10791 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10792 lock_state = in->flock_locks.get();
10793 } else {
10794 ceph_abort();
10795 return -CEPHFS_EINVAL;
10796 }
10797 _update_lock_state(fl, owner, lock_state);
10798
10799 if (!removing) {
10800 if (lock_type == CEPH_LOCK_FCNTL) {
10801 if (!fh->fcntl_locks)
10802 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10803 lock_state = fh->fcntl_locks.get();
10804 } else {
10805 if (!fh->flock_locks)
10806 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10807 lock_state = fh->flock_locks.get();
10808 }
10809 _update_lock_state(fl, owner, lock_state);
10810 }
10811 } else
10812 ceph_abort();
10813 }
10814 return ret;
10815 }
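// Hypothetical probe using the helper above, equivalent to fcntl(F_GETLK):
// ask whether a whole-file write lock could be taken (`fh` and `owner` are
// assumed to come from the caller, as in _getlk):
//
//   struct flock fl;
//   memset(&fl, 0, sizeof(fl));
//   fl.l_type = F_WRLCK;    // "could I take a write lock..."
//   fl.l_whence = SEEK_SET;
//   fl.l_start = 0;
//   fl.l_len = 0;           // "...over the whole file?"
//   int r = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK,
//                        0, &fl, owner);
//   // On success fl.l_type is F_UNLCK if nothing conflicts; otherwise the
//   // conflicting lock's type, range and pid are filled in.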
10816
10817 int Client::_interrupt_filelock(MetaRequest *req)
10818 {
10819 // Set abort code, but do not kick. The abort code prevents the request
10820 // from being re-sent.
10821 req->abort(-CEPHFS_EINTR);
10822 if (req->mds < 0)
10823 return 0; // haven't sent the request
10824
10825 Inode *in = req->inode();
10826
10827 int lock_type;
10828 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10829 lock_type = CEPH_LOCK_FLOCK_INTR;
10830 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10831 lock_type = CEPH_LOCK_FCNTL_INTR;
10832 else {
10833 ceph_abort();
10834 return -CEPHFS_EINVAL;
10835 }
10836
10837 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10838 filepath path;
10839 in->make_nosnap_relative_path(path);
10840 intr_req->set_filepath(path);
10841 intr_req->set_inode(in);
10842 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10843 intr_req->head.args.filelock_change.rule = lock_type;
10844 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10845
10846 UserPerm perms(req->get_uid(), req->get_gid());
10847 return make_request(intr_req, perms, NULL, NULL, -1);
10848 }
10849
10850 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10851 {
10852 if (!in->fcntl_locks && !in->flock_locks)
10853 return;
10854
10855 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10856 encode(nr_fcntl_locks, bl);
10857 if (nr_fcntl_locks) {
10858 auto &lock_state = in->fcntl_locks;
10859 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10860 p != lock_state->held_locks.end();
10861 ++p)
10862 encode(p->second, bl);
10863 }
10864
10865 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10866 encode(nr_flock_locks, bl);
10867 if (nr_flock_locks) {
10868 auto &lock_state = in->flock_locks;
10869 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10870 p != lock_state->held_locks.end();
10871 ++p)
10872 encode(p->second, bl);
10873 }
10874
10875 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10876 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10877 }
10878
10879 void Client::_release_filelocks(Fh *fh)
10880 {
10881 if (!fh->fcntl_locks && !fh->flock_locks)
10882 return;
10883
10884 Inode *in = fh->inode.get();
10885 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10886
10887 list<ceph_filelock> activated_locks;
10888
10889 list<pair<int, ceph_filelock> > to_release;
10890
10891 if (fh->fcntl_locks) {
10892 auto &lock_state = fh->fcntl_locks;
10893 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10894 auto q = p++;
10895 if (in->flags & I_ERROR_FILELOCK) {
10896 lock_state->remove_lock(q->second, activated_locks);
10897 } else {
10898 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
10899 }
10900 }
10901 lock_state.reset();
10902 }
10903 if (fh->flock_locks) {
10904 auto &lock_state = fh->flock_locks;
10905 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10906 auto q = p++;
10907 if (in->flags & I_ERROR_FILELOCK) {
10908 lock_state->remove_lock(q->second, activated_locks);
10909 } else {
10910 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
10911 }
10912 }
10913 lock_state.reset();
10914 }
10915
10916 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
10917 in->flags &= ~I_ERROR_FILELOCK;
10918
10919 if (to_release.empty())
10920 return;
10921
10922 struct flock fl;
10923 memset(&fl, 0, sizeof(fl));
10924 fl.l_whence = SEEK_SET;
10925 fl.l_type = F_UNLCK;
10926
10927 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10928 p != to_release.end();
10929 ++p) {
10930 fl.l_start = p->second.start;
10931 fl.l_len = p->second.length;
10932 fl.l_pid = p->second.pid;
10933 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10934 p->second.owner, true);
10935 }
10936 }
10937
10938 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10939 ceph_lock_state_t *lock_state)
10940 {
10941 int lock_cmd;
10942 if (F_RDLCK == fl->l_type)
10943 lock_cmd = CEPH_LOCK_SHARED;
10944 else if (F_WRLCK == fl->l_type)
10945 lock_cmd = CEPH_LOCK_EXCL;
10946 else
10947 lock_cmd = CEPH_LOCK_UNLOCK;
10948
10949 ceph_filelock filelock;
10950 filelock.start = fl->l_start;
10951 filelock.length = fl->l_len;
10952 filelock.client = 0;
10953 // see comment in _do_filelock()
10954 filelock.owner = owner | (1ULL << 63);
10955 filelock.pid = fl->l_pid;
10956 filelock.type = lock_cmd;
10957
10958 if (filelock.type == CEPH_LOCK_UNLOCK) {
10959 list<ceph_filelock> activated_locks;
10960 lock_state->remove_lock(filelock, activated_locks);
10961 } else {
10962 bool r = lock_state->add_lock(filelock, false, false, NULL);
10963 ceph_assert(r);
10964 }
10965 }
10966
10967 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10968 {
10969 Inode *in = fh->inode.get();
10970 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10971 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10972 return ret;
10973 }
10974
10975 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10976 {
10977 Inode *in = fh->inode.get();
10978 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10979 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10980 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10981 return ret;
10982 }
10983
10984 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10985 {
10986 Inode *in = fh->inode.get();
10987 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10988
10989 int sleep = !(cmd & LOCK_NB);
10990 cmd &= ~LOCK_NB;
10991
10992 int type;
10993 switch (cmd) {
10994 case LOCK_SH:
10995 type = F_RDLCK;
10996 break;
10997 case LOCK_EX:
10998 type = F_WRLCK;
10999 break;
11000 case LOCK_UN:
11001 type = F_UNLCK;
11002 break;
11003 default:
11004 return -CEPHFS_EINVAL;
11005 }
11006
11007 struct flock fl;
11008 memset(&fl, 0, sizeof(fl));
11009 fl.l_type = type;
11010 fl.l_whence = SEEK_SET;
11011
11012 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11013 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11014 return ret;
11015 }
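// Mapping sketch for _flock above: flock(fd, LOCK_EX | LOCK_NB) arrives as
// cmd = LOCK_EX|LOCK_NB, so sleep == 0 and the request is a whole-file
// F_WRLCK (l_start = l_len = 0, SEEK_SET) issued as CEPH_MDS_OP_SETFILELOCK
// under the CEPH_LOCK_FLOCK rule; without LOCK_NB, sleep == 1 lets the MDS
// queue the waiter, and the interrupt path (_interrupt_filelock) can cancel
// it.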
11016
11017 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11018 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11019 if (!mref_reader.is_state_satisfied()) {
11020 return -CEPHFS_ENOTCONN;
11021 }
11022
11023 std::unique_lock locker(client_lock);
11024 InodeRef in;
11025 int r = Client::path_walk(path, &in, perms, true);
11026 if (r < 0) {
11027 return r;
11028 }
11029
11030 if (in->snapid == CEPH_NOSNAP) {
11031 return -CEPHFS_EINVAL;
11032 }
11033
11034 snap_info->id = in->snapid;
11035 snap_info->metadata = in->snap_metadata;
11036 return 0;
11037 }
11038
11039 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11040 {
11041 /* Since the only thing this does is wrap a call to statfs, and
11042 statfs takes a lock, it doesn't seem we have a need to split it
11043 out. */
11044 return statfs(0, stbuf, perms);
11045 }
11046
11047 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11048 {
11049 if (!args)
11050 return;
11051 std::scoped_lock l(client_lock);
11052 ldout(cct, 10) << __func__ << " cb " << args->handle
11053 << " invalidate_ino_cb " << args->ino_cb
11054 << " invalidate_dentry_cb " << args->dentry_cb
11055 << " switch_interrupt_cb " << args->switch_intr_cb
11056 << " remount_cb " << args->remount_cb
11057 << dendl;
11058 callback_handle = args->handle;
11059 if (args->ino_cb) {
11060 ino_invalidate_cb = args->ino_cb;
11061 async_ino_invalidator.start();
11062 }
11063 if (args->dentry_cb) {
11064 dentry_invalidate_cb = args->dentry_cb;
11065 async_dentry_invalidator.start();
11066 }
11067 if (args->switch_intr_cb) {
11068 switch_interrupt_cb = args->switch_intr_cb;
11069 interrupt_finisher.start();
11070 }
11071 if (args->remount_cb) {
11072 remount_cb = args->remount_cb;
11073 remount_finisher.start();
11074 }
11075 if (args->ino_release_cb) {
11076 ino_release_cb = args->ino_release_cb;
11077 async_ino_releasor.start();
11078 }
11079 if (args->umask_cb)
11080 umask_cb = args->umask_cb;
11081 }
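// Registration sketch: callers (e.g. the FUSE glue) zero the struct and fill
// in only the callbacks they support; each non-null callback above also
// starts the matching finisher thread. Hypothetical minimal setup:
//
//   struct ceph_client_callback_args args = {};
//   args.handle = my_state;           // opaque pointer handed back to cbs
//   args.ino_cb = my_ino_invalidate;  // unset callbacks are simply skipped
//   client->ll_register_callbacks(&args);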
11082
11083 int Client::test_dentry_handling(bool can_invalidate)
11084 {
11085 int r = 0;
11086
11087 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11088 if (!iref_reader.is_state_satisfied())
11089 return -CEPHFS_ENOTCONN;
11090
11091 can_invalidate_dentries = can_invalidate;
11092
11093 if (can_invalidate_dentries) {
11094 ceph_assert(dentry_invalidate_cb);
11095 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11096 r = 0;
11097 } else {
11098 ceph_assert(remount_cb);
11099 ldout(cct, 1) << "using remount_cb" << dendl;
11100 r = _do_remount(false);
11101 }
11102
11103 return r;
11104 }
11105
11106 int Client::_sync_fs()
11107 {
11108 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11109
11110 ldout(cct, 10) << __func__ << dendl;
11111
11112 // flush file data
11113 std::unique_ptr<C_SaferCond> cond = nullptr;
11114 if (cct->_conf->client_oc) {
11115 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11116 objectcacher->flush_all(cond.get());
11117 }
11118
11119 // flush caps
11120 flush_caps_sync();
11121 ceph_tid_t flush_tid = last_flush_tid;
11122
11123 // wait for unsafe mds requests
11124 wait_unsafe_requests();
11125
11126 wait_sync_caps(flush_tid);
11127
11128 if (nullptr != cond) {
11129 client_lock.unlock();
11130 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11131 cond->wait();
11132 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11133 client_lock.lock();
11134 }
11135
11136 return 0;
11137 }
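// Note on the ordering above: the objectcacher flush is kicked off first and
// waited on last, and client_lock is dropped for that wait, so work that
// needs the lock (e.g. handling cap flush acks from the MDS) can make
// progress while we block on the data flush.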
11138
11139 int Client::sync_fs()
11140 {
11141 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11142 if (!mref_reader.is_state_satisfied())
11143 return -CEPHFS_ENOTCONN;
11144
11145 std::scoped_lock l(client_lock);
11146
11147 return _sync_fs();
11148 }
11149
11150 int64_t Client::drop_caches()
11151 {
11152 std::scoped_lock l(client_lock);
11153 return objectcacher->release_all();
11154 }
11155
11156 int Client::_lazyio(Fh *fh, int enable)
11157 {
11158 Inode *in = fh->inode.get();
11159 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11160
11161 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11162 return 0;
11163
11164 int orig_mode = fh->mode;
11165 if (enable) {
11166 fh->mode |= CEPH_FILE_MODE_LAZY;
11167 in->get_open_ref(fh->mode);
11168 in->put_open_ref(orig_mode);
11169 check_caps(in, CHECK_CAPS_NODELAY);
11170 } else {
11171 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11172 in->get_open_ref(fh->mode);
11173 in->put_open_ref(orig_mode);
11174 check_caps(in, 0);
11175 }
11176
11177 return 0;
11178 }
11179
11180 int Client::lazyio(int fd, int enable)
11181 {
11182 std::scoped_lock l(client_lock);
11183 Fh *f = get_filehandle(fd);
11184 if (!f)
11185 return -CEPHFS_EBADF;
11186
11187 return _lazyio(f, enable);
11188 }
11189
11190 int Client::ll_lazyio(Fh *fh, int enable)
11191 {
11192 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11193 tout(cct) << __func__ << std::endl;
11194
11195 std::scoped_lock lock(client_lock);
11196 return _lazyio(fh, enable);
11197 }
11198
11199 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11200 {
11201 std::scoped_lock l(client_lock);
11202 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11203 << ", " << offset << ", " << count << ")" << dendl;
11204
11205 Fh *f = get_filehandle(fd);
11206 if (!f)
11207 return -CEPHFS_EBADF;
11208
11209 // for now, just fsync the whole file rather than only [offset, offset+count)
11210 _fsync(f, true);
11211
11212 return 0;
11213 }
11214
11215 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11216 {
11217 std::scoped_lock l(client_lock);
11218 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11219 << ", " << offset << ", " << count << ")" << dendl;
11220
11221 Fh *f = get_filehandle(fd);
11222 if (!f)
11223 return -CEPHFS_EBADF;
11224 Inode *in = f->inode.get();
11225
11226 _fsync(f, true);
11227 if (_release(in)) {
11228 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11229 if (r < 0)
11230 return r;
11231 }
11232 return 0;
11233 }
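// Lazy I/O usage sketch: once CEPH_FILE_MODE_LAZY is enabled the application
// manages coherency itself. Via the libcephfs wrappers (assuming a mounted
// cmount and an open fd), a writer publishes and a reader revalidates
// roughly like this:
//
//   ceph_lazyio(cmount, fd, 1);                // opt in to lazy I/O
//   ...buffered writes...
//   ceph_lazyio_propagate(cmount, fd, 0, 0);   // writer: flush to OSDs
//   ceph_lazyio_synchronize(cmount, fd, 0, 0); // reader: drop stale cache
//
// As the comment in lazyio_propagate() notes, the offset/count range is
// currently ignored and the whole file is fsynced.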
11234
11235
11236 // =============================
11237 // snaps
11238
11239 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11240 mode_t mode, const std::map<std::string, std::string> &metadata)
11241 {
11242 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11243 if (!mref_reader.is_state_satisfied())
11244 return -CEPHFS_ENOTCONN;
11245
11246 std::scoped_lock l(client_lock);
11247
11248 filepath path(relpath);
11249 InodeRef in;
11250 int r = path_walk(path, &in, perm);
11251 if (r < 0)
11252 return r;
11253 if (cct->_conf->client_permissions) {
11254 r = may_create(in.get(), perm);
11255 if (r < 0)
11256 return r;
11257 }
11258 Inode *snapdir = open_snapdir(in.get());
11259 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11260 }
11261
11262 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11263 {
11264 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11265 if (!mref_reader.is_state_satisfied())
11266 return -CEPHFS_ENOTCONN;
11267
11268 std::scoped_lock l(client_lock);
11269
11270 filepath path(relpath);
11271 InodeRef in;
11272 int r = path_walk(path, &in, perms);
11273 if (r < 0)
11274 return r;
11275 Inode *snapdir = open_snapdir(in.get());
11276 if (cct->_conf->client_permissions) {
11277 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11278 if (r < 0)
11279 return r;
11280 }
11281 return _rmdir(snapdir, name, perms);
11282 }
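// Snapshot usage sketch (assuming a mounted client with snapshot permission):
//
//   std::map<std::string, std::string> md = {{"owner", "backup-job"}};
//   client->mksnap("/data/dir", "before-upgrade", perms, 0755, md);
//   ...
//   client->rmsnap("/data/dir", "before-upgrade", perms, true);
//
// Both resolve the directory, open its snapdir via open_snapdir(), and then
// reuse _mkdir()/_rmdir(), which become MKSNAP/RMSNAP MDS ops for snapdir
// parents (see the CEPH_MDS_OP_MKSNAP branch in _mkdir() below).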
11283
11284 // =============================
11285 // expose caps
11286
11287 int Client::get_caps_issued(int fd)
11288 {
11289 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11290 if (!mref_reader.is_state_satisfied())
11291 return -CEPHFS_ENOTCONN;
11292
11293 std::scoped_lock lock(client_lock);
11294
11295 Fh *f = get_filehandle(fd);
11296 if (!f)
11297 return -CEPHFS_EBADF;
11298
11299 return f->inode->caps_issued();
11300 }
11301
11302 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11303 {
11304 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11305 if (!mref_reader.is_state_satisfied())
11306 return -CEPHFS_ENOTCONN;
11307
11308 std::scoped_lock lock(client_lock);
11309
11310 filepath p(path);
11311 InodeRef in;
11312 int r = path_walk(p, &in, perms, true);
11313 if (r < 0)
11314 return r;
11315 return in->caps_issued();
11316 }
11317
11318 // =========================================
11319 // low level
11320
11321 Inode *Client::open_snapdir(Inode *diri)
11322 {
11323 Inode *in;
11324 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11325 if (!inode_map.count(vino)) {
11326 in = new Inode(this, vino, &diri->layout);
11327
11328 in->ino = diri->ino;
11329 in->snapid = CEPH_SNAPDIR;
11330 in->mode = diri->mode;
11331 in->uid = diri->uid;
11332 in->gid = diri->gid;
11333 in->nlink = 1;
11334 in->mtime = diri->mtime;
11335 in->ctime = diri->ctime;
11336 in->btime = diri->btime;
11337 in->atime = diri->atime;
11338 in->size = diri->size;
11339 in->change_attr = diri->change_attr;
11340
11341 in->dirfragtree.clear();
11342 in->snapdir_parent = diri;
11343 diri->flags |= I_SNAPDIR_OPEN;
11344 inode_map[vino] = in;
11345 if (use_faked_inos())
11346 _assign_faked_ino(in);
11347 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11348 } else {
11349 in = inode_map[vino];
11350 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11351 }
11352 return in;
11353 }
11354
11355 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11356 Inode **out, const UserPerm& perms)
11357 {
11358 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11359 if (!mref_reader.is_state_satisfied())
11360 return -CEPHFS_ENOTCONN;
11361
11362 vinodeno_t vparent = _get_vino(parent);
11363 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11364 tout(cct) << __func__ << std::endl;
11365 tout(cct) << name << std::endl;
11366
11367 std::scoped_lock lock(client_lock);
11368
11369 int r = 0;
11370 if (!fuse_default_permissions) {
11371 if (strcmp(name, ".") && strcmp(name, "..")) {
11372 r = may_lookup(parent, perms);
11373 if (r < 0)
11374 return r;
11375 }
11376 }
11377
11378 string dname(name);
11379 InodeRef in;
11380
11381 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11382 if (r < 0) {
11383 attr->st_ino = 0;
11384 goto out;
11385 }
11386
11387 ceph_assert(in);
11388 fill_stat(in, attr);
11389 _ll_get(in.get());
11390
11391 out:
11392 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11393 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11394 tout(cct) << attr->st_ino << std::endl;
11395 *out = in.get();
11396 return r;
11397 }
11398
11399 int Client::ll_lookup_vino(
11400 vinodeno_t vino,
11401 const UserPerm& perms,
11402 Inode **inode)
11403 {
11404 ceph_assert(inode != NULL);
11405 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11406 if (!mref_reader.is_state_satisfied())
11407 return -CEPHFS_ENOTCONN;
11408
11409 std::scoped_lock lock(client_lock);
11410 ldout(cct, 3) << __func__ << " " << vino << dendl;
11411
11412 // Check the cache first
11413 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11414 if (p != inode_map.end()) {
11415 *inode = p->second;
11416 _ll_get(*inode);
11417 return 0;
11418 }
11419
11420 uint64_t snapid = vino.snapid;
11421
11422 // for snapdir, find the non-snapped dir inode
11423 if (snapid == CEPH_SNAPDIR)
11424 vino.snapid = CEPH_NOSNAP;
11425
11426 int r = _lookup_vino(vino, perms, inode);
11427 if (r)
11428 return r;
11429 ceph_assert(*inode != NULL);
11430
11431 if (snapid == CEPH_SNAPDIR) {
11432 Inode *tmp = *inode;
11433
11434 // open the snapdir and put the inode ref
11435 *inode = open_snapdir(tmp);
11436 _ll_forget(tmp, 1);
11437 _ll_get(*inode);
11438 }
11439 return 0;
11440 }
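// Example: a lookup of vinodeno_t{ino, CEPH_SNAPDIR} first resolves the live
// (CEPH_NOSNAP) inode, then swaps in the synthetic snapdir inode from
// open_snapdir(); the ll reference is moved from the live inode to the
// snapdir inode via _ll_forget()/_ll_get().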
11441
11442 int Client::ll_lookup_inode(
11443 struct inodeno_t ino,
11444 const UserPerm& perms,
11445 Inode **inode)
11446 {
11447 vinodeno_t vino(ino, CEPH_NOSNAP);
11448 return ll_lookup_vino(vino, perms, inode);
11449 }
11450
11451 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11452 struct ceph_statx *stx, unsigned want, unsigned flags,
11453 const UserPerm& perms)
11454 {
11455 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11456 if (!mref_reader.is_state_satisfied())
11457 return -CEPHFS_ENOTCONN;
11458
11459 vinodeno_t vparent = _get_vino(parent);
11460 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11461 tout(cct) << "ll_lookupx" << std::endl;
11462 tout(cct) << name << std::endl;
11463
11464 std::scoped_lock lock(client_lock);
11465
11466 int r = 0;
11467 if (!fuse_default_permissions) {
11468 r = may_lookup(parent, perms);
11469 if (r < 0)
11470 return r;
11471 }
11472
11473 string dname(name);
11474 InodeRef in;
11475
11476 unsigned mask = statx_to_mask(flags, want);
11477 r = _lookup(parent, dname, mask, &in, perms);
11478 if (r < 0) {
11479 stx->stx_ino = 0;
11480 stx->stx_mask = 0;
11481 } else {
11482 ceph_assert(in);
11483 fill_statx(in, mask, stx);
11484 _ll_get(in.get());
11485 }
11486
11487 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11488 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11489 tout(cct) << stx->stx_ino << std::endl;
11490 *out = in.get();
11491 return r;
11492 }
11493
11494 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11495 unsigned int want, unsigned int flags, const UserPerm& perms)
11496 {
11497 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11498 if (!mref_reader.is_state_satisfied())
11499 return -CEPHFS_ENOTCONN;
11500
11501 filepath fp(name, 0);
11502 InodeRef in;
11503 int rc;
11504 unsigned mask = statx_to_mask(flags, want);
11505
11506 ldout(cct, 3) << __func__ << " " << name << dendl;
11507 tout(cct) << __func__ << std::endl;
11508 tout(cct) << name << std::endl;
11509
11510 std::scoped_lock lock(client_lock);
11511 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11512 if (rc < 0) {
11513 /* zero out mask, just in case... */
11514 stx->stx_mask = 0;
11515 stx->stx_ino = 0;
11516 *out = NULL;
11517 return rc;
11518 } else {
11519 ceph_assert(in);
11520 fill_statx(in, mask, stx);
11521 _ll_get(in.get());
11522 *out = in.get();
11523 return 0;
11524 }
11525 }
11526
11527 void Client::_ll_get(Inode *in)
11528 {
11529 if (in->ll_ref == 0) {
11530 in->get();
11531 if (in->is_dir() && !in->dentries.empty()) {
11532 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11533 in->get_first_parent()->get(); // pin dentry
11534 }
11535 if (in->snapid != CEPH_NOSNAP)
11536 ll_snap_ref[in->snapid]++;
11537 }
11538 in->ll_get();
11539 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
11540 }
11541
11542 int Client::_ll_put(Inode *in, uint64_t num)
11543 {
11544 in->ll_put(num);
11545 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
11546 if (in->ll_ref == 0) {
11547 if (in->is_dir() && !in->dentries.empty()) {
11548 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11549 in->get_first_parent()->put(); // unpin dentry
11550 }
11551 if (in->snapid != CEPH_NOSNAP) {
11552 auto p = ll_snap_ref.find(in->snapid);
11553 ceph_assert(p != ll_snap_ref.end());
11554 ceph_assert(p->second > 0);
11555 if (--p->second == 0)
11556 ll_snap_ref.erase(p);
11557 }
11558 put_inode(in);
11559 return 0;
11560 } else {
11561 return in->ll_ref;
11562 }
11563 }
11564
11565 void Client::_ll_drop_pins()
11566 {
11567 ldout(cct, 10) << __func__ << dendl;
11568 std::set<InodeRef> to_be_put; // destructed item by item on scope exit, releasing the held refs
11569 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11570 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11571 it != inode_map.end();
11572 it = next) {
11573 Inode *in = it->second;
11574 next = it;
11575 ++next;
11576 if (in->ll_ref){
11577 to_be_put.insert(in);
11578 _ll_put(in, in->ll_ref);
11579 }
11580 }
11581 }
11582
11583 bool Client::_ll_forget(Inode *in, uint64_t count)
11584 {
11585 inodeno_t ino = in->ino;
11586
11587 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11588 tout(cct) << __func__ << std::endl;
11589 tout(cct) << ino.val << std::endl;
11590 tout(cct) << count << std::endl;
11591
11592 // Ignore forget if we're no longer mounted
11593 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11594 if (!mref_reader.is_state_satisfied())
11595 return true;
11596
11597 if (ino == 1) return true; // ignore forget on root.
11598
11599 bool last = false;
11600 if (in->ll_ref < count) {
11601 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11602 << ", which only has ll_ref=" << in->ll_ref << dendl;
11603 _ll_put(in, in->ll_ref);
11604 last = true;
11605 } else {
11606 if (_ll_put(in, count) == 0)
11607 last = true;
11608 }
11609
11610 return last;
11611 }
11612
11613 bool Client::ll_forget(Inode *in, uint64_t count)
11614 {
11615 std::scoped_lock lock(client_lock);
11616 return _ll_forget(in, count);
11617 }
11618
11619 bool Client::ll_put(Inode *in)
11620 {
11621 /* ll_forget already takes the lock */
11622 return ll_forget(in, 1);
11623 }
11624
11625 int Client::ll_get_snap_ref(snapid_t snap)
11626 {
11627 std::scoped_lock lock(client_lock);
11628 auto p = ll_snap_ref.find(snap);
11629 if (p != ll_snap_ref.end())
11630 return p->second;
11631 return 0;
11632 }
11633
11634 snapid_t Client::ll_get_snapid(Inode *in)
11635 {
11636 std::scoped_lock lock(client_lock);
11637 return in->snapid;
11638 }
11639
11640 Inode *Client::ll_get_inode(ino_t ino)
11641 {
11642 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11643 if (!mref_reader.is_state_satisfied())
11644 return NULL;
11645
11646 std::scoped_lock lock(client_lock);
11647
11648 vinodeno_t vino = _map_faked_ino(ino);
11649 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11650 if (p == inode_map.end())
11651 return NULL;
11652 Inode *in = p->second;
11653 _ll_get(in);
11654 return in;
11655 }
11656
11657 Inode *Client::ll_get_inode(vinodeno_t vino)
11658 {
11659 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11660 if (!mref_reader.is_state_satisfied())
11661 return NULL;
11662
11663 std::scoped_lock lock(client_lock);
11664
11665 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11666 if (p == inode_map.end())
11667 return NULL;
11668 Inode *in = p->second;
11669 _ll_get(in);
11670 return in;
11671 }
11672
11673 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11674 {
11675 vinodeno_t vino = _get_vino(in);
11676
11677 ldout(cct, 8) << __func__ << " " << vino << dendl;
11678 tout(cct) << __func__ << std::endl;
11679 tout(cct) << vino.ino.val << std::endl;
11680
11681 if (vino.snapid < CEPH_NOSNAP)
11682 return 0;
11683 else
11684 return _getattr(in, caps, perms);
11685 }
11686
11687 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11688 {
11689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11690 if (!mref_reader.is_state_satisfied())
11691 return -CEPHFS_ENOTCONN;
11692
11693 std::scoped_lock lock(client_lock);
11694
11695 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11696
11697 if (res == 0)
11698 fill_stat(in, attr);
11699 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11700 return res;
11701 }
11702
11703 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11704 unsigned int flags, const UserPerm& perms)
11705 {
11706 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11707 if (!mref_reader.is_state_satisfied())
11708 return -CEPHFS_ENOTCONN;
11709
11710 std::scoped_lock lock(client_lock);
11711
11712 int res = 0;
11713 unsigned mask = statx_to_mask(flags, want);
11714
11715 if (mask && !in->caps_issued_mask(mask, true))
11716 res = _ll_getattr(in, mask, perms);
11717
11718 if (res == 0)
11719 fill_statx(in, mask, stx);
11720 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11721 return res;
11722 }
11723
11724 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11725 const UserPerm& perms, InodeRef *inp)
11726 {
11727 vinodeno_t vino = _get_vino(in);
11728
11729 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11730 << dendl;
11731 tout(cct) << __func__ << std::endl;
11732 tout(cct) << vino.ino.val << std::endl;
11733 tout(cct) << stx->stx_mode << std::endl;
11734 tout(cct) << stx->stx_uid << std::endl;
11735 tout(cct) << stx->stx_gid << std::endl;
11736 tout(cct) << stx->stx_size << std::endl;
11737 tout(cct) << stx->stx_mtime << std::endl;
11738 tout(cct) << stx->stx_atime << std::endl;
11739 tout(cct) << stx->stx_btime << std::endl;
11740 tout(cct) << mask << std::endl;
11741
11742 if (!fuse_default_permissions) {
11743 int res = may_setattr(in, stx, mask, perms);
11744 if (res < 0)
11745 return res;
11746 }
11747
11748 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11749
11750 return __setattrx(in, stx, mask, perms, inp);
11751 }
11752
11753 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11754 const UserPerm& perms)
11755 {
11756 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11757 if (!mref_reader.is_state_satisfied())
11758 return -CEPHFS_ENOTCONN;
11759
11760 std::scoped_lock lock(client_lock);
11761
11762 InodeRef target(in);
11763 int res = _ll_setattrx(in, stx, mask, perms, &target);
11764 if (res == 0) {
11765 ceph_assert(in == target.get());
11766 fill_statx(in, in->caps_issued(), stx);
11767 }
11768
11769 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11770 return res;
11771 }
11772
11773 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11774 const UserPerm& perms)
11775 {
11776 struct ceph_statx stx;
11777 stat_to_statx(attr, &stx);
11778
11779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11780 if (!mref_reader.is_state_satisfied())
11781 return -CEPHFS_ENOTCONN;
11782
11783 std::scoped_lock lock(client_lock);
11784
11785 InodeRef target(in);
11786 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11787 if (res == 0) {
11788 ceph_assert(in == target.get());
11789 fill_stat(in, attr);
11790 }
11791
11792 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11793 return res;
11794 }
11795
11796
11797 // ----------
11798 // xattrs
11799
11800 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11801 const UserPerm& perms)
11802 {
11803 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11804 if (!mref_reader.is_state_satisfied())
11805 return -CEPHFS_ENOTCONN;
11806
11807 std::scoped_lock lock(client_lock);
11808
11809 InodeRef in;
11810 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11811 if (r < 0)
11812 return r;
11813 return _getxattr(in, name, value, size, perms);
11814 }
11815
11816 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11817 const UserPerm& perms)
11818 {
11819 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11820 if (!mref_reader.is_state_satisfied())
11821 return -CEPHFS_ENOTCONN;
11822
11823 std::scoped_lock lock(client_lock);
11824
11825 InodeRef in;
11826 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11827 if (r < 0)
11828 return r;
11829 return _getxattr(in, name, value, size, perms);
11830 }
11831
11832 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11833 const UserPerm& perms)
11834 {
11835 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11836 if (!mref_reader.is_state_satisfied())
11837 return -CEPHFS_ENOTCONN;
11838
11839 std::scoped_lock lock(client_lock);
11840
11841 Fh *f = get_filehandle(fd);
11842 if (!f)
11843 return -CEPHFS_EBADF;
11844 return _getxattr(f->inode, name, value, size, perms);
11845 }
11846
11847 int Client::listxattr(const char *path, char *list, size_t size,
11848 const UserPerm& perms)
11849 {
11850 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11851 if (!mref_reader.is_state_satisfied())
11852 return -CEPHFS_ENOTCONN;
11853
11854 std::scoped_lock lock(client_lock);
11855
11856 InodeRef in;
11857 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11858 if (r < 0)
11859 return r;
11860 return Client::_listxattr(in.get(), list, size, perms);
11861 }
11862
11863 int Client::llistxattr(const char *path, char *list, size_t size,
11864 const UserPerm& perms)
11865 {
11866 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11867 if (!mref_reader.is_state_satisfied())
11868 return -CEPHFS_ENOTCONN;
11869
11870 std::scoped_lock lock(client_lock);
11871
11872 InodeRef in;
11873 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11874 if (r < 0)
11875 return r;
11876 return Client::_listxattr(in.get(), list, size, perms);
11877 }
11878
11879 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11880 {
11881 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11882 if (!mref_reader.is_state_satisfied())
11883 return -CEPHFS_ENOTCONN;
11884
11885 std::scoped_lock lock(client_lock);
11886
11887 Fh *f = get_filehandle(fd);
11888 if (!f)
11889 return -CEPHFS_EBADF;
11890 return Client::_listxattr(f->inode.get(), list, size, perms);
11891 }
11892
11893 int Client::removexattr(const char *path, const char *name,
11894 const UserPerm& perms)
11895 {
11896 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11897 if (!mref_reader.is_state_satisfied())
11898 return -CEPHFS_ENOTCONN;
11899
11900 std::scoped_lock lock(client_lock);
11901
11902 InodeRef in;
11903 int r = Client::path_walk(path, &in, perms, true);
11904 if (r < 0)
11905 return r;
11906 return _removexattr(in, name, perms);
11907 }
11908
11909 int Client::lremovexattr(const char *path, const char *name,
11910 const UserPerm& perms)
11911 {
11912 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11913 if (!mref_reader.is_state_satisfied())
11914 return -CEPHFS_ENOTCONN;
11915
11916 std::scoped_lock lock(client_lock);
11917
11918 InodeRef in;
11919 int r = Client::path_walk(path, &in, perms, false);
11920 if (r < 0)
11921 return r;
11922 return _removexattr(in, name, perms);
11923 }
11924
11925 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11926 {
11927 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11928 if (!mref_reader.is_state_satisfied())
11929 return -CEPHFS_ENOTCONN;
11930
11931 std::scoped_lock lock(client_lock);
11932
11933 Fh *f = get_filehandle(fd);
11934 if (!f)
11935 return -CEPHFS_EBADF;
11936 return _removexattr(f->inode, name, perms);
11937 }
11938
11939 int Client::setxattr(const char *path, const char *name, const void *value,
11940 size_t size, int flags, const UserPerm& perms)
11941 {
11942 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11943 if (!mref_reader.is_state_satisfied())
11944 return -CEPHFS_ENOTCONN;
11945
11946 _setxattr_maybe_wait_for_osdmap(name, value, size);
11947
11948 std::scoped_lock lock(client_lock);
11949
11950 InodeRef in;
11951 int r = Client::path_walk(path, &in, perms, true);
11952 if (r < 0)
11953 return r;
11954 return _setxattr(in, name, value, size, flags, perms);
11955 }
11956
11957 int Client::lsetxattr(const char *path, const char *name, const void *value,
11958 size_t size, int flags, const UserPerm& perms)
11959 {
11960 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11961 if (!mref_reader.is_state_satisfied())
11962 return -CEPHFS_ENOTCONN;
11963
11964 _setxattr_maybe_wait_for_osdmap(name, value, size);
11965
11966 std::scoped_lock lock(client_lock);
11967
11968 InodeRef in;
11969 int r = Client::path_walk(path, &in, perms, false);
11970 if (r < 0)
11971 return r;
11972 return _setxattr(in, name, value, size, flags, perms);
11973 }
11974
11975 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11976 int flags, const UserPerm& perms)
11977 {
11978 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11979 if (!mref_reader.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN;
11981
11982 _setxattr_maybe_wait_for_osdmap(name, value, size);
11983
11984 std::scoped_lock lock(client_lock);
11985
11986 Fh *f = get_filehandle(fd);
11987 if (!f)
11988 return -CEPHFS_EBADF;
11989 return _setxattr(f->inode, name, value, size, flags, perms);
11990 }
11991
11992 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11993 const UserPerm& perms)
11994 {
11995 int r;
11996
11997 const VXattr *vxattr = _match_vxattr(in, name);
11998 if (vxattr) {
11999 r = -CEPHFS_ENODATA;
12000
12001 // Do a forced getattr to get the latest values (e.g. quota, rstat)
12002 // before returning a value to userspace.
12003 int flags = 0;
12004 if (vxattr->flags & VXATTR_RSTAT) {
12005 flags |= CEPH_STAT_RSTAT;
12006 }
12007 if (vxattr->flags & VXATTR_DIRSTAT) {
12008 flags |= CEPH_CAP_FILE_SHARED;
12009 }
12010 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12011 if (r != 0) {
12012 // Error from getattr!
12013 return r;
12014 }
12015
12016 // invoke the vxattr's getxattr_cb (a pointer-to-member) unless its exists_cb says the attribute is absent
12017 char buf[256];
12018 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12019 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12020 } else {
12021 r = -CEPHFS_ENODATA;
12022 }
12023
12024 if (size != 0) {
12025 if (r > (int)size) {
12026 r = -CEPHFS_ERANGE;
12027 } else if (r > 0) {
12028 memcpy(value, buf, r);
12029 }
12030 }
12031 goto out;
12032 }
12033
12034 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12035 r = -CEPHFS_EOPNOTSUPP;
12036 goto out;
12037 }
12038
12039 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12040 if (r == 0) {
12041 string n(name);
12042 r = -CEPHFS_ENODATA;
12043 if (in->xattrs.count(n)) {
12044 r = in->xattrs[n].length();
12045 if (r > 0 && size != 0) {
12046 if (size >= (unsigned)r)
12047 memcpy(value, in->xattrs[n].c_str(), r);
12048 else
12049 r = -CEPHFS_ERANGE;
12050 }
12051 }
12052 }
12053 out:
12054 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12055 return r;
12056 }
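// As with getxattr(2), size == 0 is a length probe: the value's length is
// returned without copying, so callers can size a buffer first. Sketch
// (hypothetical caller):
//
//   int len = client->getxattr(path, "user.foo", nullptr, 0, perms);
//   if (len >= 0) {
//     std::vector<char> buf(len);
//     len = client->getxattr(path, "user.foo", buf.data(), buf.size(), perms);
//   }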
12057
12058 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12059 const UserPerm& perms)
12060 {
12061 if (cct->_conf->client_permissions) {
12062 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12063 if (r < 0)
12064 return r;
12065 }
12066 return _getxattr(in.get(), name, value, size, perms);
12067 }
12068
12069 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12070 size_t size, const UserPerm& perms)
12071 {
12072 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12073 if (!mref_reader.is_state_satisfied())
12074 return -CEPHFS_ENOTCONN;
12075
12076 vinodeno_t vino = _get_vino(in);
12077
12078 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12079 tout(cct) << __func__ << std::endl;
12080 tout(cct) << vino.ino.val << std::endl;
12081 tout(cct) << name << std::endl;
12082
12083 std::scoped_lock lock(client_lock);
12084 if (!fuse_default_permissions) {
12085 int r = xattr_permission(in, name, MAY_READ, perms);
12086 if (r < 0)
12087 return r;
12088 }
12089
12090 return _getxattr(in, name, value, size, perms);
12091 }
12092
12093 int Client::_listxattr(Inode *in, char *name, size_t size,
12094 const UserPerm& perms)
12095 {
12096 bool len_only = (size == 0);
12097 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12098 if (r != 0) {
12099 goto out;
12100 }
12101
12102 r = 0;
12103 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12104 if (xattr_name.rfind("ceph.", 0) == 0) {
12105 continue;
12106 }
12107
12108 size_t this_len = xattr_name.length() + 1;
12109 r += this_len;
12110 if (len_only)
12111 continue;
12112
12113 if (this_len > size) {
12114 r = -CEPHFS_ERANGE;
12115 goto out;
12116 }
12117
12118 memcpy(name, xattr_name.c_str(), this_len);
12119 name += this_len;
12120 size -= this_len;
12121 }
12122 out:
12123 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12124 return r;
12125 }
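// The same two-call pattern applies: size == 0 (len_only) returns the total
// length of the NUL-separated name list, with "ceph." virtual xattrs
// filtered out above. Sketch (hypothetical caller):
//
//   int len = client->listxattr(path, nullptr, 0, perms);
//   if (len > 0) {
//     std::vector<char> names(len);
//     len = client->listxattr(path, names.data(), names.size(), perms);
//   }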
12126
12127 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12128 const UserPerm& perms)
12129 {
12130 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12131 if (!mref_reader.is_state_satisfied())
12132 return -CEPHFS_ENOTCONN;
12133
12134 vinodeno_t vino = _get_vino(in);
12135
12136 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12137 tout(cct) << __func__ << std::endl;
12138 tout(cct) << vino.ino.val << std::endl;
12139 tout(cct) << size << std::endl;
12140
12141 std::scoped_lock lock(client_lock);
12142 return _listxattr(in, names, size, perms);
12143 }
12144
12145 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12146 size_t size, int flags, const UserPerm& perms)
12147 {
12148
12149 int xattr_flags = 0;
12150 if (!value)
12151 xattr_flags |= CEPH_XATTR_REMOVE;
12152 if (flags & XATTR_CREATE)
12153 xattr_flags |= CEPH_XATTR_CREATE;
12154 if (flags & XATTR_REPLACE)
12155 xattr_flags |= CEPH_XATTR_REPLACE;
12156
12157 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12158 filepath path;
12159 in->make_nosnap_relative_path(path);
12160 req->set_filepath(path);
12161 req->set_string2(name);
12162 req->set_inode(in);
12163 req->head.args.setxattr.flags = xattr_flags;
12164
12165 bufferlist bl;
12166 ceph_assert(value || size == 0);
12167 bl.append((const char*)value, size);
12168 req->set_data(bl);
12169
12170 int res = make_request(req, perms);
12171
12172 trim_cache();
12173 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12174 res << dendl;
12175 return res;
12176 }
12177
12178 int Client::_setxattr(Inode *in, const char *name, const void *value,
12179 size_t size, int flags, const UserPerm& perms)
12180 {
12181 if (in->snapid != CEPH_NOSNAP) {
12182 return -CEPHFS_EROFS;
12183 }
12184
12185 if (size == 0) {
12186 value = "";
12187 } else if (value == NULL) {
12188 return -CEPHFS_EINVAL;
12189 }
12190
12191 bool posix_acl_xattr = false;
12192 if (acl_type == POSIX_ACL)
12193 posix_acl_xattr = !strncmp(name, "system.", 7);
12194
12195 if (strncmp(name, "user.", 5) &&
12196 strncmp(name, "security.", 9) &&
12197 strncmp(name, "trusted.", 8) &&
12198 strncmp(name, "ceph.", 5) &&
12199 !posix_acl_xattr)
12200 return -CEPHFS_EOPNOTSUPP;
12201
12202 bool check_realm = false;
12203
12204 if (posix_acl_xattr) {
12205 if (!strcmp(name, ACL_EA_ACCESS)) {
12206 mode_t new_mode = in->mode;
12207 if (value) {
12208 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12209 if (ret < 0)
12210 return ret;
12211 if (ret == 0) {
12212 value = NULL;
12213 size = 0;
12214 }
12215 if (new_mode != in->mode) {
12216 struct ceph_statx stx;
12217 stx.stx_mode = new_mode;
12218 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12219 if (ret < 0)
12220 return ret;
12221 }
12222 }
12223 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12224 if (value) {
12225 if (!S_ISDIR(in->mode))
12226 return -CEPHFS_EACCES;
12227 int ret = posix_acl_check(value, size);
12228 if (ret < 0)
12229 return -CEPHFS_EINVAL;
12230 if (ret == 0) {
12231 value = NULL;
12232 size = 0;
12233 }
12234 }
12235 } else {
12236 return -CEPHFS_EOPNOTSUPP;
12237 }
12238 } else {
12239 const VXattr *vxattr = _match_vxattr(in, name);
12240 if (vxattr) {
12241 if (vxattr->readonly)
12242 return -CEPHFS_EOPNOTSUPP;
12243 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12244 check_realm = true;
12245 }
12246 }
12247
12248 int ret = _do_setxattr(in, name, value, size, flags, perms);
12249 if (ret >= 0 && check_realm) {
12250 // check if snaprealm was created for quota inode
12251 if (in->quota.is_enable() &&
12252 !(in->snaprealm && in->snaprealm->ino == in->ino))
12253 ret = -CEPHFS_EOPNOTSUPP;
12254 }
12255
12256 return ret;
12257 }
12258
12259 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12260 size_t size, int flags, const UserPerm& perms)
12261 {
12262 if (cct->_conf->client_permissions) {
12263 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12264 if (r < 0)
12265 return r;
12266 }
12267 return _setxattr(in.get(), name, value, size, flags, perms);
12268 }
12269
12270 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12271 {
12272 string tmp;
12273 if (name == "layout") {
12274 string::iterator begin = value.begin();
12275 string::iterator end = value.end();
12276 keys_and_values<string::iterator> p; // create instance of parser
12277 std::map<string, string> m; // map to receive results
12278 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12279 return -CEPHFS_EINVAL;
12280 }
12281 if (begin != end)
12282 return -CEPHFS_EINVAL;
12283 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12284 if (q->first == "pool") {
12285 tmp = q->second;
12286 break;
12287 }
12288 }
12289 } else if (name == "layout.pool") {
12290 tmp = value;
12291 }
12292
12293 if (tmp.length()) {
12294 int64_t pool;
12295 try {
12296 pool = boost::lexical_cast<unsigned>(tmp);
12297 if (!osdmap->have_pg_pool(pool))
12298 return -CEPHFS_ENOENT;
12299 } catch (boost::bad_lexical_cast const&) {
12300 pool = osdmap->lookup_pg_pool_name(tmp);
12301 if (pool < 0) {
12302 return -CEPHFS_ENOENT;
12303 }
12304 }
12305 }
12306
12307 return 0;
12308 }
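// The "layout" value parsed above is a space-separated key=value string in
// the same form _vxattrcb_layout() emits, e.g.
//
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
//
// Only the pool is validated against the osdmap here; it may be given by
// name or by numeric id ("layout.pool" passes the bare value directly).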
12309
12310 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12311 {
12312 // Setting the pool in a layout requires an osdmap epoch in the MetaRequest.
12313 // There is a race where a newly created data pool is not yet known to either the client or the MDS.
12314 // Fetch the latest osdmap on the client so the MDS can quickly judge whether it needs a newer one.
12315 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12316 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12317 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12318 string rest(strstr(name, "layout"));
12319 string v((const char*)value, size);
12320 int r = objecter->with_osdmap([&](const OSDMap& o) {
12321 return _setxattr_check_data_pool(rest, v, &o);
12322 });
12323
12324 if (r == -CEPHFS_ENOENT) {
12325 bs::error_code ec;
12326 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12327 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12328 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12329 }
12330 }
12331 }
12332
12333 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12334 size_t size, int flags, const UserPerm& perms)
12335 {
12336 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12337 if (!mref_reader.is_state_satisfied())
12338 return -CEPHFS_ENOTCONN;
12339
12340 _setxattr_maybe_wait_for_osdmap(name, value, size);
12341
12342 vinodeno_t vino = _get_vino(in);
12343
12344 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12345 tout(cct) << __func__ << std::endl;
12346 tout(cct) << vino.ino.val << std::endl;
12347 tout(cct) << name << std::endl;
12348
12349 std::scoped_lock lock(client_lock);
12350 if (!fuse_default_permissions) {
12351 int r = xattr_permission(in, name, MAY_WRITE, perms);
12352 if (r < 0)
12353 return r;
12354 }
12355 return _setxattr(in, name, value, size, flags, perms);
12356 }
12357
12358 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12359 {
12360 if (in->snapid != CEPH_NOSNAP) {
12361 return -CEPHFS_EROFS;
12362 }
12363
12364 // same xattrs supported by kernel client
12365 if (strncmp(name, "user.", 5) &&
12366 strncmp(name, "system.", 7) &&
12367 strncmp(name, "security.", 9) &&
12368 strncmp(name, "trusted.", 8) &&
12369 strncmp(name, "ceph.", 5))
12370 return -CEPHFS_EOPNOTSUPP;
12371
12372 const VXattr *vxattr = _match_vxattr(in, name);
12373 if (vxattr && vxattr->readonly)
12374 return -CEPHFS_EOPNOTSUPP;
12375
12376 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12377 filepath path;
12378 in->make_nosnap_relative_path(path);
12379 req->set_filepath(path);
12380 req->set_filepath2(name);
12381 req->set_inode(in);
12382
12383 int res = make_request(req, perms);
12384
12385 trim_cache();
12386 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12387 return res;
12388 }
12389
12390 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12391 {
12392 if (cct->_conf->client_permissions) {
12393 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12394 if (r < 0)
12395 return r;
12396 }
12397 return _removexattr(in.get(), name, perms);
12398 }
12399
12400 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12401 {
12402 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12403 if (!mref_reader.is_state_satisfied())
12404 return -CEPHFS_ENOTCONN;
12405
12406 vinodeno_t vino = _get_vino(in);
12407
12408 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12409 tout(cct) << "ll_removexattr" << std::endl;
12410 tout(cct) << vino.ino.val << std::endl;
12411 tout(cct) << name << std::endl;
12412
12413 std::scoped_lock lock(client_lock);
12414 if (!fuse_default_permissions) {
12415 int r = xattr_permission(in, name, MAY_WRITE, perms);
12416 if (r < 0)
12417 return r;
12418 }
12419
12420 return _removexattr(in, name, perms);
12421 }
12422
12423 bool Client::_vxattrcb_quota_exists(Inode *in)
12424 {
12425 return in->quota.is_enable() &&
12426 (in->snapid != CEPH_NOSNAP ||
12427 (in->snaprealm && in->snaprealm->ino == in->ino));
12428 }
12429 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12430 {
12431 return snprintf(val, size,
12432 "max_bytes=%lld max_files=%lld",
12433 (long long int)in->quota.max_bytes,
12434 (long long int)in->quota.max_files);
12435 }
12436 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12437 {
12438 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12439 }
12440 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12441 {
12442 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12443 }
12444
12445 bool Client::_vxattrcb_layout_exists(Inode *in)
12446 {
12447 return in->layout != file_layout_t();
12448 }
12449 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12450 {
12451 int r = snprintf(val, size,
12452 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12453 (unsigned long long)in->layout.stripe_unit,
12454 (unsigned long long)in->layout.stripe_count,
12455 (unsigned long long)in->layout.object_size);
12456 objecter->with_osdmap([&](const OSDMap& o) {
12457 if (o.have_pg_pool(in->layout.pool_id))
12458 r += snprintf(val + r, size - r, "%s",
12459 o.get_pool_name(in->layout.pool_id).c_str());
12460 else
12461 r += snprintf(val + r, size - r, "%" PRIu64,
12462 (uint64_t)in->layout.pool_id);
12463 });
12464 if (in->layout.pool_ns.length())
12465 r += snprintf(val + r, size - r, " pool_namespace=%s",
12466 in->layout.pool_ns.c_str());
12467 return r;
12468 }
12469 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12470 {
12471 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12472 }
12473 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12474 {
12475 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12476 }
12477 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12478 {
12479 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12480 }
12481 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12482 {
12483 size_t r;
12484 objecter->with_osdmap([&](const OSDMap& o) {
12485 if (o.have_pg_pool(in->layout.pool_id))
12486 r = snprintf(val, size, "%s", o.get_pool_name(
12487 in->layout.pool_id).c_str());
12488 else
12489 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12490 });
12491 return r;
12492 }
12493 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12494 {
12495 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12496 }
12497 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12498 {
12499 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12500 }
12501 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12502 {
12503 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12504 }
12505 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12506 {
12507 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12508 }
12509 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
12510 {
12511 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
12512 }
12513 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
12514 {
12515 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
12516 }
12517 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
12518 {
12519 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
12520 }
12521 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
12522 {
12523 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
12524 }
12525 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
12526 {
12527 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
12528 }
12529 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
12530 {
12531 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
12532 (long)in->rstat.rctime.nsec());
12533 }
12534 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
12535 {
12536 return in->dir_pin != -CEPHFS_ENODATA;
12537 }
12538 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
12539 {
12540 return snprintf(val, size, "%ld", (long)in->dir_pin);
12541 }
12542
12543 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
12544 {
12545 return !in->snap_btime.is_zero();
12546 }
12547
12548 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
12549 {
12550 return snprintf(val, size, "%llu.%09lu",
12551 (long long unsigned)in->snap_btime.sec(),
12552 (long unsigned)in->snap_btime.nsec());
12553 }
12554
12555 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
12556 {
12557 // checking one of the xattrs would suffice
12558 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
12559 }
12560
12561 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
12562 {
12563 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
12564 in->xattrs["ceph.mirror.info.cluster_id"].length(),
12565 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
12566 in->xattrs["ceph.mirror.info.fs_id"].length(),
12567 in->xattrs["ceph.mirror.info.fs_id"].c_str());
12568 }
12569
12570 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
12571 {
12572 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
12573 }
12574
12575 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
12576 {
12577 auto name = messenger->get_myname();
12578 return snprintf(val, size, "%s%ld", name.type_str(), name.num());
12579 }
12580
12581 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12582 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12583
12584 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12585 { \
12586 name: CEPH_XATTR_NAME(_type, _name), \
12587 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12588 readonly: true, \
12589 exists_cb: NULL, \
12590 flags: _flags, \
12591 }
12592 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12593 { \
12594 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12595 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12596 readonly: false, \
12597 exists_cb: &Client::_vxattrcb_layout_exists, \
12598 flags: 0, \
12599 }
12600 #define XATTR_QUOTA_FIELD(_type, _name) \
12601 { \
12602 name: CEPH_XATTR_NAME(_type, _name), \
12603 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12604 readonly: false, \
12605 exists_cb: &Client::_vxattrcb_quota_exists, \
12606 flags: 0, \
12607 }
12608
12609 const Client::VXattr Client::_dir_vxattrs[] = {
12610 {
12611 name: "ceph.dir.layout",
12612 getxattr_cb: &Client::_vxattrcb_layout,
12613 readonly: false,
12614 exists_cb: &Client::_vxattrcb_layout_exists,
12615 flags: 0,
12616 },
12617 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
12618 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
12619 XATTR_LAYOUT_FIELD(dir, layout, object_size),
12620 XATTR_LAYOUT_FIELD(dir, layout, pool),
12621 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
12622 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
12623 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
12624 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
12625 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
12626 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
12627 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
12628 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
12629 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
12630 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
12631 {
12632 name: "ceph.quota",
12633 getxattr_cb: &Client::_vxattrcb_quota,
12634 readonly: false,
12635 exists_cb: &Client::_vxattrcb_quota_exists,
12636 flags: 0,
12637 },
12638 XATTR_QUOTA_FIELD(quota, max_bytes),
12639 XATTR_QUOTA_FIELD(quota, max_files),
12640 {
12641 name: "ceph.dir.pin",
12642 getxattr_cb: &Client::_vxattrcb_dir_pin,
12643 readonly: false,
12644 exists_cb: &Client::_vxattrcb_dir_pin_exists,
12645 flags: 0,
12646 },
12647 {
12648 name: "ceph.snap.btime",
12649 getxattr_cb: &Client::_vxattrcb_snap_btime,
12650 readonly: true,
12651 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12652 flags: 0,
12653 },
12654 {
12655 name: "ceph.mirror.info",
12656 getxattr_cb: &Client::_vxattrcb_mirror_info,
12657 readonly: false,
12658 exists_cb: &Client::_vxattrcb_mirror_info_exists,
12659 flags: 0,
12660 },
12661 { name: "" } /* Required table terminator */
12662 };
12663
12664 const Client::VXattr Client::_file_vxattrs[] = {
12665 {
12666 name: "ceph.file.layout",
12667 getxattr_cb: &Client::_vxattrcb_layout,
12668 readonly: false,
12669 exists_cb: &Client::_vxattrcb_layout_exists,
12670 flags: 0,
12671 },
12672 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
12673 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
12674 XATTR_LAYOUT_FIELD(file, layout, object_size),
12675 XATTR_LAYOUT_FIELD(file, layout, pool),
12676 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
12677 {
12678 name: "ceph.snap.btime",
12679 getxattr_cb: &Client::_vxattrcb_snap_btime,
12680 readonly: true,
12681 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12682 flags: 0,
12683 },
12684 { name: "" } /* Required table terminator */
12685 };
12686
12687 const Client::VXattr Client::_common_vxattrs[] = {
12688 {
12689 name: "ceph.cluster_fsid",
12690 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
12691 readonly: true,
12692 exists_cb: nullptr,
12693 flags: 0,
12694 },
12695 {
12696 name: "ceph.client_id",
12697 getxattr_cb: &Client::_vxattrcb_client_id,
12698 readonly: true,
12699 exists_cb: nullptr,
12700 flags: 0,
12701 },
12702 { name: "" } /* Required table terminator */
12703 };
12704
12705 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12706 {
12707 if (in->is_dir())
12708 return _dir_vxattrs;
12709 else if (in->is_file())
12710 return _file_vxattrs;
12711 return NULL;
12712 }
12713
12714 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12715 {
12716 if (strncmp(name, "ceph.", 5) == 0) {
12717 const VXattr *vxattr = _get_vxattrs(in);
12718 if (vxattr) {
12719 while (!vxattr->name.empty()) {
12720 if (vxattr->name == name)
12721 return vxattr;
12722 vxattr++;
12723 }
12724 }
12725
12726 // for common vxattrs
12727 vxattr = _common_vxattrs;
12728 while (!vxattr->name.empty()) {
12729 if (vxattr->name == name)
12730 return vxattr;
12731 vxattr++;
12732 }
12733 }
12734
12735 return NULL;
12736 }
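// Example: for a directory inode, _match_vxattr(in, "ceph.dir.rbytes")
// returns the XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT) entry from
// _dir_vxattrs, whose getxattr_cb formats in->rstat.rbytes; a name like
// "ceph.cluster_fsid" falls through to the _common_vxattrs table.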
12737
12738 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12739 {
12740 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12741 if (!mref_reader.is_state_satisfied())
12742 return -CEPHFS_ENOTCONN;
12743
12744 vinodeno_t vino = _get_vino(in);
12745
12746 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12747 tout(cct) << "ll_readlink" << std::endl;
12748 tout(cct) << vino.ino.val << std::endl;
12749
12750 std::scoped_lock lock(client_lock);
12751 for (auto dn : in->dentries) {
12752 touch_dn(dn);
12753 }
12754
12755 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12756 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12757 return r;
12758 }
12759
12760 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12761 const UserPerm& perms, InodeRef *inp)
12762 {
12763 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
12764 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12765 << ", gid " << perms.gid() << ")" << dendl;
12766
12767 if (strlen(name) > NAME_MAX)
12768 return -CEPHFS_ENAMETOOLONG;
12769
12770 if (dir->snapid != CEPH_NOSNAP) {
12771 return -CEPHFS_EROFS;
12772 }
12773 if (is_quota_files_exceeded(dir, perms)) {
12774 return -CEPHFS_EDQUOT;
12775 }
12776
12777 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12778
12779 filepath path;
12780 dir->make_nosnap_relative_path(path);
12781 path.push_dentry(name);
12782 req->set_filepath(path);
12783 req->set_inode(dir);
12784 req->head.args.mknod.rdev = rdev;
12785 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12786 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12787
12788 bufferlist xattrs_bl;
12789 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12790 if (res < 0)
12791 goto fail;
12792 req->head.args.mknod.mode = mode;
12793 if (xattrs_bl.length() > 0)
12794 req->set_data(xattrs_bl);
12795
12796 Dentry *de;
12797 res = get_or_create(dir, name, &de);
12798 if (res < 0)
12799 goto fail;
12800 req->set_dentry(de);
12801
12802 res = make_request(req, perms, inp);
12803
12804 trim_cache();
12805
12806 ldout(cct, 8) << "_mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12807 return res;
12808
12809 fail:
12810 put_request(req);
12811 return res;
12812 }
12813
12814 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12815 dev_t rdev, struct stat *attr, Inode **out,
12816 const UserPerm& perms)
12817 {
12818 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12819 if (!mref_reader.is_state_satisfied())
12820 return -CEPHFS_ENOTCONN;
12821
12822 vinodeno_t vparent = _get_vino(parent);
12823
12824 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12825 tout(cct) << "ll_mknod" << std::endl;
12826 tout(cct) << vparent.ino.val << std::endl;
12827 tout(cct) << name << std::endl;
12828 tout(cct) << mode << std::endl;
12829 tout(cct) << rdev << std::endl;
12830
12831 std::scoped_lock lock(client_lock);
12832 if (!fuse_default_permissions) {
12833 int r = may_create(parent, perms);
12834 if (r < 0)
12835 return r;
12836 }
12837
12838 InodeRef in;
12839 int r = _mknod(parent, name, mode, rdev, perms, &in);
12840 if (r == 0) {
12841 fill_stat(in, attr);
12842 _ll_get(in.get());
12843 }
12844 tout(cct) << attr->st_ino << std::endl;
12845 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12846 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12847 *out = in.get();
12848 return r;
12849 }
12850
12851 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12852 dev_t rdev, Inode **out,
12853 struct ceph_statx *stx, unsigned want, unsigned flags,
12854 const UserPerm& perms)
12855 {
12856 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12857 if (!mref_reader.is_state_satisfied())
12858 return -CEPHFS_ENOTCONN;
12859
12860 unsigned caps = statx_to_mask(flags, want);
12861
12862 vinodeno_t vparent = _get_vino(parent);
12863
12864 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12865 tout(cct) << "ll_mknodx" << std::endl;
12866 tout(cct) << vparent.ino.val << std::endl;
12867 tout(cct) << name << std::endl;
12868 tout(cct) << mode << std::endl;
12869 tout(cct) << rdev << std::endl;
12870
12871 std::scoped_lock lock(client_lock);
12872
12873 if (!fuse_default_permissions) {
12874 int r = may_create(parent, perms);
12875 if (r < 0)
12876 return r;
12877 }
12878
12879 InodeRef in;
12880 int r = _mknod(parent, name, mode, rdev, perms, &in);
12881 if (r == 0) {
12882 fill_statx(in, caps, stx);
12883 _ll_get(in.get());
12884 }
12885 tout(cct) << stx->stx_ino << std::endl;
12886 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12887 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12888 *out = in.get();
12889 return r;
12890 }
12891
12892 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12893 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12894 int object_size, const char *data_pool, bool *created,
12895 const UserPerm& perms, std::string alternate_name)
12896 {
12897 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12898 mode << dec << ")" << dendl;
12899
12900 if (strlen(name) > NAME_MAX)
12901 return -CEPHFS_ENAMETOOLONG;
12902 if (dir->snapid != CEPH_NOSNAP) {
12903 return -CEPHFS_EROFS;
12904 }
12905 if (is_quota_files_exceeded(dir, perms)) {
12906 return -CEPHFS_EDQUOT;
12907 }
12908
12909 // use normalized flags to generate cmode
12910 int cflags = ceph_flags_sys2wire(flags);
12911 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12912 cflags |= CEPH_O_LAZY;
12913
12914 int cmode = ceph_flags_to_mode(cflags);
12915
12916 int64_t pool_id = -1;
12917 if (data_pool && *data_pool) {
12918 pool_id = objecter->with_osdmap(
12919 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12920 if (pool_id < 0)
12921 return -CEPHFS_EINVAL;
12922 if (pool_id > 0xffffffffll)
12923 return -CEPHFS_ERANGE; // bummer!
12924 }
12925
12926 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12927
12928 filepath path;
12929 dir->make_nosnap_relative_path(path);
12930 path.push_dentry(name);
12931 req->set_filepath(path);
12932 req->set_alternate_name(std::move(alternate_name));
12933 req->set_inode(dir);
12934 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12935
12936 req->head.args.open.stripe_unit = stripe_unit;
12937 req->head.args.open.stripe_count = stripe_count;
12938 req->head.args.open.object_size = object_size;
12939 if (cct->_conf->client_debug_getattr_caps)
12940 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12941 else
12942 req->head.args.open.mask = 0;
12943 req->head.args.open.pool = pool_id;
12944 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12945 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12946
12947 mode |= S_IFREG;
12948 bufferlist xattrs_bl;
12949 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12950 if (res < 0)
12951 goto fail;
12952 req->head.args.open.mode = mode;
12953 if (xattrs_bl.length() > 0)
12954 req->set_data(xattrs_bl);
12955
12956 Dentry *de;
12957 res = get_or_create(dir, name, &de);
12958 if (res < 0)
12959 goto fail;
12960 req->set_dentry(de);
12961
12962 res = make_request(req, perms, inp, created);
12963 if (res < 0) {
12964 goto reply_error;
12965 }
12966
12967 /* If the caller passed a value in fhp, do the open */
  if (fhp) {
12969 (*inp)->get_open_ref(cmode);
12970 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12971 }
12972
12973 reply_error:
12974 trim_cache();
12975
12976 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12977 << " layout " << stripe_unit
12978 << ' ' << stripe_count
12979 << ' ' << object_size
12980 <<") = " << res << dendl;
12981 return res;
12982
12983 fail:
12984 put_request(req);
12985 return res;
12986 }
12987
12988 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12989 InodeRef *inp, const std::map<std::string, std::string> &metadata,
12990 std::string alternate_name)
12991 {
12992 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12993 << mode << dec << ", uid " << perm.uid()
12994 << ", gid " << perm.gid() << ")" << dendl;
12995
12996 if (strlen(name) > NAME_MAX)
12997 return -CEPHFS_ENAMETOOLONG;
12998
12999 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13000 return -CEPHFS_EROFS;
13001 }
13002 if (is_quota_files_exceeded(dir, perm)) {
13003 return -CEPHFS_EDQUOT;
13004 }
13005
13006 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13007 MetaRequest *req = new MetaRequest(is_snap_op ?
13008 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13009
13010 filepath path;
13011 dir->make_nosnap_relative_path(path);
13012 path.push_dentry(name);
13013 req->set_filepath(path);
13014 req->set_inode(dir);
13015 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13016 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13017 req->set_alternate_name(std::move(alternate_name));
13018
13019 mode |= S_IFDIR;
13020 bufferlist bl;
13021 int res = _posix_acl_create(dir, &mode, bl, perm);
13022 if (res < 0)
13023 goto fail;
13024 req->head.args.mkdir.mode = mode;
13025 if (is_snap_op) {
13026 SnapPayload payload;
13027 // clear the bufferlist that may have been populated by the call
13028 // to _posix_acl_create(). MDS mksnap does not make use of it.
13029 // So, reuse it to pass metadata payload.
13030 bl.clear();
13031 payload.metadata = metadata;
13032 encode(payload, bl);
13033 }
13034 if (bl.length() > 0) {
13035 req->set_data(bl);
13036 }
13037
13038 Dentry *de;
13039 res = get_or_create(dir, name, &de);
13040 if (res < 0)
13041 goto fail;
13042 req->set_dentry(de);
13043
13044 ldout(cct, 10) << "_mkdir: making request" << dendl;
13045 res = make_request(req, perm, inp);
13046 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13047
13048 trim_cache();
13049
13050 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13051 return res;
13052
13053 fail:
13054 put_request(req);
13055 return res;
13056 }
13057
13058 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13059 struct stat *attr, Inode **out, const UserPerm& perm)
13060 {
13061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13062 if (!mref_reader.is_state_satisfied())
13063 return -CEPHFS_ENOTCONN;
13064
13065 vinodeno_t vparent = _get_vino(parent);
13066
13067 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13068 tout(cct) << "ll_mkdir" << std::endl;
13069 tout(cct) << vparent.ino.val << std::endl;
13070 tout(cct) << name << std::endl;
13071 tout(cct) << mode << std::endl;
13072
13073 std::scoped_lock lock(client_lock);
13074
13075 if (!fuse_default_permissions) {
13076 int r = may_create(parent, perm);
13077 if (r < 0)
13078 return r;
13079 }
13080
13081 InodeRef in;
13082 int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  } else {
    attr->st_ino = 0;
  }
13087 tout(cct) << attr->st_ino << std::endl;
13088 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13089 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13090 *out = in.get();
13091 return r;
13092 }
13093
13094 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13095 struct ceph_statx *stx, unsigned want, unsigned flags,
13096 const UserPerm& perms)
13097 {
13098 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13099 if (!mref_reader.is_state_satisfied())
13100 return -CEPHFS_ENOTCONN;
13101
13102 vinodeno_t vparent = _get_vino(parent);
13103
13104 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13105 tout(cct) << "ll_mkdirx" << std::endl;
13106 tout(cct) << vparent.ino.val << std::endl;
13107 tout(cct) << name << std::endl;
13108 tout(cct) << mode << std::endl;
13109
13110 std::scoped_lock lock(client_lock);
13111
13112 if (!fuse_default_permissions) {
13113 int r = may_create(parent, perms);
13114 if (r < 0)
13115 return r;
13116 }
13117
13118 InodeRef in;
13119 int r = _mkdir(parent, name, mode, perms, &in);
13120 if (r == 0) {
13121 fill_statx(in, statx_to_mask(flags, want), stx);
13122 _ll_get(in.get());
13123 } else {
13124 stx->stx_ino = 0;
13125 stx->stx_mask = 0;
13126 }
13127 tout(cct) << stx->stx_ino << std::endl;
13128 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13129 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13130 *out = in.get();
13131 return r;
13132 }
13133
13134 int Client::_symlink(Inode *dir, const char *name, const char *target,
13135 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13136 {
13137 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13138 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13139 << dendl;
13140
13141 if (strlen(name) > NAME_MAX)
13142 return -CEPHFS_ENAMETOOLONG;
13143
13144 if (dir->snapid != CEPH_NOSNAP) {
13145 return -CEPHFS_EROFS;
13146 }
13147 if (is_quota_files_exceeded(dir, perms)) {
13148 return -CEPHFS_EDQUOT;
13149 }
13150
13151 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13152
13153 filepath path;
13154 dir->make_nosnap_relative_path(path);
13155 path.push_dentry(name);
13156 req->set_filepath(path);
13157 req->set_alternate_name(std::move(alternate_name));
13158 req->set_inode(dir);
13159 req->set_string2(target);
13160 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13161 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13162
13163 Dentry *de;
13164 int res = get_or_create(dir, name, &de);
13165 if (res < 0)
13166 goto fail;
13167 req->set_dentry(de);
13168
13169 res = make_request(req, perms, inp);
13170
13171 trim_cache();
13172 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13173 res << dendl;
13174 return res;
13175
13176 fail:
13177 put_request(req);
13178 return res;
13179 }
13180
13181 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13182 struct stat *attr, Inode **out, const UserPerm& perms)
13183 {
13184 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13185 if (!mref_reader.is_state_satisfied())
13186 return -CEPHFS_ENOTCONN;
13187
13188 vinodeno_t vparent = _get_vino(parent);
13189
13190 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13191 << dendl;
13192 tout(cct) << "ll_symlink" << std::endl;
13193 tout(cct) << vparent.ino.val << std::endl;
13194 tout(cct) << name << std::endl;
13195 tout(cct) << value << std::endl;
13196
13197 std::scoped_lock lock(client_lock);
13198
13199 if (!fuse_default_permissions) {
13200 int r = may_create(parent, perms);
13201 if (r < 0)
13202 return r;
13203 }
13204
13205 InodeRef in;
13206 int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  } else {
    attr->st_ino = 0;
  }
13211 tout(cct) << attr->st_ino << std::endl;
13212 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13213 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13214 *out = in.get();
13215 return r;
13216 }
13217
13218 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13219 Inode **out, struct ceph_statx *stx, unsigned want,
13220 unsigned flags, const UserPerm& perms)
13221 {
13222 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13223 if (!mref_reader.is_state_satisfied())
13224 return -CEPHFS_ENOTCONN;
13225
13226 vinodeno_t vparent = _get_vino(parent);
13227
13228 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13229 << dendl;
13230 tout(cct) << "ll_symlinkx" << std::endl;
13231 tout(cct) << vparent.ino.val << std::endl;
13232 tout(cct) << name << std::endl;
13233 tout(cct) << value << std::endl;
13234
13235 std::scoped_lock lock(client_lock);
13236
13237 if (!fuse_default_permissions) {
13238 int r = may_create(parent, perms);
13239 if (r < 0)
13240 return r;
13241 }
13242
13243 InodeRef in;
13244 int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
13249 tout(cct) << stx->stx_ino << std::endl;
13250 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13251 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13252 *out = in.get();
13253 return r;
13254 }
13255
13256 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13257 {
13258 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13259 << " uid " << perm.uid() << " gid " << perm.gid()
13260 << ")" << dendl;
13261
13262 if (dir->snapid != CEPH_NOSNAP) {
13263 return -CEPHFS_EROFS;
13264 }
13265
13266 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13267
13268 filepath path;
13269 dir->make_nosnap_relative_path(path);
13270 path.push_dentry(name);
13271 req->set_filepath(path);
13272
13273 InodeRef otherin;
13274 Inode *in;
13275 Dentry *de;
13276
13277 int res = get_or_create(dir, name, &de);
13278 if (res < 0)
13279 goto fail;
13280 req->set_dentry(de);
13281 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13282 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13283
13284 res = _lookup(dir, name, 0, &otherin, perm);
13285 if (res < 0)
13286 goto fail;
13287
13288 in = otherin.get();
13289 req->set_other_inode(in);
13290 in->break_all_delegs();
13291 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13292
13293 req->set_inode(dir);
13294
13295 res = make_request(req, perm);
13296
13297 trim_cache();
13298 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13299 return res;
13300
13301 fail:
13302 put_request(req);
13303 return res;
13304 }
13305
13306 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13307 {
13308 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13309 if (!mref_reader.is_state_satisfied())
13310 return -CEPHFS_ENOTCONN;
13311
13312 vinodeno_t vino = _get_vino(in);
13313
13314 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13315 tout(cct) << "ll_unlink" << std::endl;
13316 tout(cct) << vino.ino.val << std::endl;
13317 tout(cct) << name << std::endl;
13318
13319 std::scoped_lock lock(client_lock);
13320
13321 if (!fuse_default_permissions) {
13322 int r = may_delete(in, name, perm);
13323 if (r < 0)
13324 return r;
13325 }
13326 return _unlink(in, name, perm);
13327 }
13328
13329 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13330 {
13331 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13332 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13333
13334 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13335 return -CEPHFS_EROFS;
13336 }
13337
13338 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13339 MetaRequest *req = new MetaRequest(op);
13340 filepath path;
13341 dir->make_nosnap_relative_path(path);
13342 path.push_dentry(name);
13343 req->set_filepath(path);
13344 req->set_inode(dir);
13345
13346 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13347 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13348 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13349
13350 InodeRef in;
13351
13352 Dentry *de;
13353 int res = get_or_create(dir, name, &de);
13354 if (res < 0)
13355 goto fail;
13356 if (op == CEPH_MDS_OP_RMDIR)
13357 req->set_dentry(de);
13358 else
13359 de->get();
13360
13361 res = _lookup(dir, name, 0, &in, perms);
13362 if (res < 0)
13363 goto fail;
13364
13365 if (op == CEPH_MDS_OP_RMSNAP) {
13366 unlink(de, true, true);
13367 de->put();
13368 }
13369 req->set_other_inode(in.get());
13370
13371 res = make_request(req, perms);
13372
13373 trim_cache();
13374 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13375 return res;
13376
13377 fail:
13378 put_request(req);
13379 return res;
13380 }
13381
13382 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13383 {
13384 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13385 if (!mref_reader.is_state_satisfied())
13386 return -CEPHFS_ENOTCONN;
13387
13388 vinodeno_t vino = _get_vino(in);
13389
13390 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13391 tout(cct) << "ll_rmdir" << std::endl;
13392 tout(cct) << vino.ino.val << std::endl;
13393 tout(cct) << name << std::endl;
13394
13395 std::scoped_lock lock(client_lock);
13396
13397 if (!fuse_default_permissions) {
13398 int r = may_delete(in, name, perms);
13399 if (r < 0)
13400 return r;
13401 }
13402
13403 return _rmdir(in, name, perms);
13404 }
13405
13406 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13407 {
13408 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13409 << todir->ino << " " << toname
13410 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13411 << dendl;
13412
13413 if (fromdir->snapid != todir->snapid)
13414 return -CEPHFS_EXDEV;
13415
13416 int op = CEPH_MDS_OP_RENAME;
13417 if (fromdir->snapid != CEPH_NOSNAP) {
13418 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13419 op = CEPH_MDS_OP_RENAMESNAP;
13420 else
13421 return -CEPHFS_EROFS;
13422 }
13423 if (fromdir != todir) {
13424 Inode *fromdir_root =
13425 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13426 Inode *todir_root =
13427 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13428 if (fromdir_root != todir_root) {
13429 return -CEPHFS_EXDEV;
13430 }
13431 }
13432
13433 InodeRef target;
13434 MetaRequest *req = new MetaRequest(op);
13435
13436 filepath from;
13437 fromdir->make_nosnap_relative_path(from);
13438 from.push_dentry(fromname);
13439 filepath to;
13440 todir->make_nosnap_relative_path(to);
13441 to.push_dentry(toname);
13442 req->set_filepath(to);
13443 req->set_filepath2(from);
13444 req->set_alternate_name(std::move(alternate_name));
13445
13446 Dentry *oldde;
13447 int res = get_or_create(fromdir, fromname, &oldde);
13448 if (res < 0)
13449 goto fail;
13450 Dentry *de;
13451 res = get_or_create(todir, toname, &de);
13452 if (res < 0)
13453 goto fail;
13454
13455 if (op == CEPH_MDS_OP_RENAME) {
13456 req->set_old_dentry(oldde);
13457 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13458 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13459
13460 req->set_dentry(de);
13461 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13462 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13463
13464 InodeRef oldin, otherin;
13465 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13466 if (res < 0)
13467 goto fail;
13468
13469 Inode *oldinode = oldin.get();
13470 oldinode->break_all_delegs();
13471 req->set_old_inode(oldinode);
13472 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13473
13474 res = _lookup(todir, toname, 0, &otherin, perm);
13475 switch (res) {
13476 case 0:
13477 {
13478 Inode *in = otherin.get();
13479 req->set_other_inode(in);
13480 in->break_all_delegs();
13481 }
13482 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13483 break;
13484 case -CEPHFS_ENOENT:
13485 break;
13486 default:
13487 goto fail;
13488 }
13489
13490 req->set_inode(todir);
13491 } else {
13492 // renamesnap reply contains no tracedn, so we need to invalidate
13493 // dentry manually
13494 unlink(oldde, true, true);
13495 unlink(de, true, true);
13496
13497 req->set_inode(todir);
13498 }
13499
13500 res = make_request(req, perm, &target);
13501 ldout(cct, 10) << "rename result is " << res << dendl;
13502
  // trim_cache() below will drop the renamed item from our cache
13504
13505 trim_cache();
13506 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
13507 return res;
13508
13509 fail:
13510 put_request(req);
13511 return res;
13512 }
13513
13514 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13515 const char *newname, const UserPerm& perm)
13516 {
13517 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13518 if (!mref_reader.is_state_satisfied())
13519 return -CEPHFS_ENOTCONN;
13520
13521 vinodeno_t vparent = _get_vino(parent);
13522 vinodeno_t vnewparent = _get_vino(newparent);
13523
13524 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13525 << vnewparent << " " << newname << dendl;
13526 tout(cct) << "ll_rename" << std::endl;
13527 tout(cct) << vparent.ino.val << std::endl;
13528 tout(cct) << name << std::endl;
13529 tout(cct) << vnewparent.ino.val << std::endl;
13530 tout(cct) << newname << std::endl;
13531
13532 std::scoped_lock lock(client_lock);
13533
13534 if (!fuse_default_permissions) {
13535 int r = may_delete(parent, name, perm);
13536 if (r < 0)
13537 return r;
13538 r = may_delete(newparent, newname, perm);
13539 if (r < 0 && r != -CEPHFS_ENOENT)
13540 return r;
13541 }
13542
13543 return _rename(parent, name, newparent, newname, perm, "");
13544 }
13545
13546 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
13547 {
13548 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
13549 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13550
13551 if (strlen(newname) > NAME_MAX)
13552 return -CEPHFS_ENAMETOOLONG;
13553
13554 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
13555 return -CEPHFS_EROFS;
13556 }
13557 if (is_quota_files_exceeded(dir, perm)) {
13558 return -CEPHFS_EDQUOT;
13559 }
13560
13561 in->break_all_delegs();
13562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13563
13564 filepath path(newname, dir->ino);
13565 req->set_filepath(path);
13566 req->set_alternate_name(std::move(alternate_name));
13567 filepath existing(in->ino);
13568 req->set_filepath2(existing);
13569
13570 req->set_inode(dir);
13571 req->inode_drop = CEPH_CAP_FILE_SHARED;
13572 req->inode_unless = CEPH_CAP_FILE_EXCL;
13573
13574 Dentry *de;
13575 int res = get_or_create(dir, newname, &de);
13576 if (res < 0)
13577 goto fail;
13578 req->set_dentry(de);
13579
13580 res = make_request(req, perm, inp);
13581 ldout(cct, 10) << "link result is " << res << dendl;
13582
13583 trim_cache();
13584 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
13585 return res;
13586
13587 fail:
13588 put_request(req);
13589 return res;
13590 }
13591
13592 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13593 const UserPerm& perm)
13594 {
13595 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13596 if (!mref_reader.is_state_satisfied())
13597 return -CEPHFS_ENOTCONN;
13598
13599 vinodeno_t vino = _get_vino(in);
13600 vinodeno_t vnewparent = _get_vino(newparent);
13601
13602 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13603 newname << dendl;
13604 tout(cct) << "ll_link" << std::endl;
13605 tout(cct) << vino.ino.val << std::endl;
13606 tout(cct) << vnewparent << std::endl;
13607 tout(cct) << newname << std::endl;
13608
13609 InodeRef target;
13610
13611 std::scoped_lock lock(client_lock);
13612
13613 if (!fuse_default_permissions) {
13614 if (S_ISDIR(in->mode))
13615 return -CEPHFS_EPERM;
13616
13617 int r = may_hardlink(in, perm);
13618 if (r < 0)
13619 return r;
13620
13621 r = may_create(newparent, perm);
13622 if (r < 0)
13623 return r;
13624 }
13625
13626 return _link(in, newparent, newname, perm, "", &target);
13627 }
13628
13629 int Client::ll_num_osds(void)
13630 {
13631 std::scoped_lock lock(client_lock);
13632 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13633 }
13634
13635 int Client::ll_osdaddr(int osd, uint32_t *addr)
13636 {
13637 std::scoped_lock lock(client_lock);
13638
13639 entity_addr_t g;
13640 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13641 if (!o.exists(osd))
13642 return false;
13643 g = o.get_addrs(osd).front();
13644 return true;
13645 });
13646 if (!exists)
13647 return -1;
13648 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13649 *addr = ntohl(nb_addr);
13650 return 0;
13651 }
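
/*
 * Note on ll_osdaddr(): the OSD's IPv4 address is handed back in host
 * byte order. A caller would therefore convert it back before using it
 * with the socket API. Minimal usage sketch (hypothetical caller code,
 * not part of this file's API surface):
 *
 *   uint32_t a;
 *   if (client->ll_osdaddr(0, &a) == 0) {
 *     struct in_addr in4;
 *     in4.s_addr = htonl(a);   // back to network byte order
 *   }
 */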
13652
13653 uint32_t Client::ll_stripe_unit(Inode *in)
13654 {
13655 std::scoped_lock lock(client_lock);
13656 return in->layout.stripe_unit;
13657 }
13658
13659 uint64_t Client::ll_snap_seq(Inode *in)
13660 {
13661 std::scoped_lock lock(client_lock);
13662 return in->snaprealm->seq;
13663 }
13664
13665 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13666 {
13667 std::scoped_lock lock(client_lock);
13668 *layout = in->layout;
13669 return 0;
13670 }
13671
13672 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13673 {
13674 return ll_file_layout(fh->inode.get(), layout);
13675 }
13676
13677 /* Currently we cannot take advantage of redundancy in reads, since we
13678 would have to go through all possible placement groups (a
13679 potentially quite large number determined by a hash), and use CRUSH
13680 to calculate the appropriate set of OSDs for each placement group,
13681 then index into that. An array with one entry per OSD is much more
13682 tractable and works for demonstration purposes. */
13683
13684 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13685 file_layout_t* layout)
13686 {
13687 std::scoped_lock lock(client_lock);
13688
13689 inodeno_t ino = in->ino;
13690 uint32_t object_size = layout->object_size;
13691 uint32_t su = layout->stripe_unit;
13692 uint32_t stripe_count = layout->stripe_count;
13693 uint64_t stripes_per_object = object_size / su;
13694 uint64_t stripeno = 0, stripepos = 0;
13695
  if (stripe_count) {
13697 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13698 stripepos = blockno % stripe_count; // which object in the object set (X)
13699 }
13700 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13701 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13702
13703 object_t oid = file_object_t(ino, objectno);
13704 return objecter->with_osdmap([&](const OSDMap& o) {
13705 ceph_object_layout olayout =
13706 o.file_to_object_layout(oid, *layout);
13707 pg_t pg = (pg_t)olayout.ol_pgid;
13708 vector<int> osds;
13709 int primary;
13710 o.pg_to_acting_osds(pg, &osds, &primary);
13711 return primary;
13712 });
13713 }
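
/*
 * Worked example of the striping arithmetic above (all layout values are
 * assumed for illustration, not taken from a real pool):
 *
 *   object_size = 4 MiB, stripe_unit (su) = 1 MiB, stripe_count = 3
 *     => stripes_per_object = object_size / su = 4
 *
 *   For blockno = 10:
 *     stripeno    = 10 / 3 = 3      // fourth horizontal stripe
 *     stripepos   = 10 % 3 = 1      // second object in the object set
 *     objectsetno = 3 / 4  = 0      // still in the first object set
 *     objectno    = 0 * 3 + 1 = 1   // second object of the file
 *
 *   The OSD returned is then the primary for object 1's placement group.
 */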
13714
13715 /* Return the offset of the block, internal to the object */
13716
13717 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13718 {
13719 std::scoped_lock lock(client_lock);
13720 file_layout_t *layout=&(in->layout);
13721 uint32_t object_size = layout->object_size;
13722 uint32_t su = layout->stripe_unit;
13723 uint64_t stripes_per_object = object_size / su;
13724
13725 return (blockno % stripes_per_object) * su;
13726 }
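
/*
 * Example for ll_get_internal_offset() (values assumed for illustration):
 * with object_size = 4 MiB and su = 1 MiB, stripes_per_object = 4, so
 * blockno = 10 maps to (10 % 4) * 1 MiB = 2 MiB into its object.
 */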
13727
13728 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13729 const UserPerm& perms)
13730 {
13731 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13732 if (!mref_reader.is_state_satisfied())
13733 return -CEPHFS_ENOTCONN;
13734
13735 vinodeno_t vino = _get_vino(in);
13736
13737 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13738 tout(cct) << "ll_opendir" << std::endl;
13739 tout(cct) << vino.ino.val << std::endl;
13740
13741 std::scoped_lock lock(client_lock);
13742
13743 if (!fuse_default_permissions) {
13744 int r = may_open(in, flags, perms);
13745 if (r < 0)
13746 return r;
13747 }
13748
13749 int r = _opendir(in, dirpp, perms);
13750 tout(cct) << (uintptr_t)*dirpp << std::endl;
13751
13752 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13753 << dendl;
13754 return r;
13755 }
13756
13757 int Client::ll_releasedir(dir_result_t *dirp)
13758 {
13759 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13760 if (!mref_reader.is_state_satisfied())
13761 return -CEPHFS_ENOTCONN;
13762
13763 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13764 tout(cct) << "ll_releasedir" << std::endl;
13765 tout(cct) << (uintptr_t)dirp << std::endl;
13766
13767 std::scoped_lock lock(client_lock);
13768
13769 _closedir(dirp);
13770 return 0;
13771 }
13772
13773 int Client::ll_fsyncdir(dir_result_t *dirp)
13774 {
13775 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13776 if (!mref_reader.is_state_satisfied())
13777 return -CEPHFS_ENOTCONN;
13778
13779 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13780 tout(cct) << "ll_fsyncdir" << std::endl;
13781 tout(cct) << (uintptr_t)dirp << std::endl;
13782
13783 std::scoped_lock lock(client_lock);
13784 return _fsync(dirp->inode.get(), false);
13785 }
13786
13787 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13788 {
13789 ceph_assert(!(flags & O_CREAT));
13790
13791 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13792 if (!mref_reader.is_state_satisfied())
13793 return -CEPHFS_ENOTCONN;
13794
13795 vinodeno_t vino = _get_vino(in);
13796
13797 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13798 tout(cct) << "ll_open" << std::endl;
13799 tout(cct) << vino.ino.val << std::endl;
13800 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13801
13802 std::scoped_lock lock(client_lock);
13803
13804 int r;
13805 if (!fuse_default_permissions) {
13806 r = may_open(in, flags, perms);
13807 if (r < 0)
13808 goto out;
13809 }
13810
13811 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13812
13813 out:
13814 Fh *fhptr = fhp ? *fhp : NULL;
13815 if (fhptr) {
13816 ll_unclosed_fh_set.insert(fhptr);
13817 }
13818 tout(cct) << (uintptr_t)fhptr << std::endl;
13819 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13820 " = " << r << " (" << fhptr << ")" << dendl;
13821 return r;
13822 }
13823
13824 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13825 int flags, InodeRef *in, int caps, Fh **fhp,
13826 const UserPerm& perms)
13827 {
13828 *fhp = NULL;
13829
13830 vinodeno_t vparent = _get_vino(parent);
13831
13832 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13833 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13834 << ", gid " << perms.gid() << dendl;
13835 tout(cct) << "ll_create" << std::endl;
13836 tout(cct) << vparent.ino.val << std::endl;
13837 tout(cct) << name << std::endl;
13838 tout(cct) << mode << std::endl;
13839 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13840
13841 bool created = false;
13842 int r = _lookup(parent, name, caps, in, perms);
13843
13844 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13845 return -CEPHFS_EEXIST;
13846
13847 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
13848 if (!fuse_default_permissions) {
13849 r = may_create(parent, perms);
13850 if (r < 0)
13851 goto out;
13852 }
13853 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13854 perms, "");
13855 if (r < 0)
13856 goto out;
13857 }
13858
13859 if (r < 0)
13860 goto out;
13861
13862 ceph_assert(*in);
13863
13864 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13865 if (!created) {
13866 if (!fuse_default_permissions) {
13867 r = may_open(in->get(), flags, perms);
13868 if (r < 0) {
13869 if (*fhp) {
13870 int release_r = _release_fh(*fhp);
13871 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13872 }
13873 goto out;
13874 }
13875 }
13876 if (*fhp == NULL) {
13877 r = _open(in->get(), flags, mode, fhp, perms);
13878 if (r < 0)
13879 goto out;
13880 }
13881 }
13882
13883 out:
13884 if (*fhp) {
13885 ll_unclosed_fh_set.insert(*fhp);
13886 }
13887
13888 ino_t ino = 0;
13889 if (r >= 0) {
13890 Inode *inode = in->get();
13891 if (use_faked_inos())
13892 ino = inode->faked_ino;
13893 else
13894 ino = inode->ino;
13895 }
13896
13897 tout(cct) << (uintptr_t)*fhp << std::endl;
13898 tout(cct) << ino << std::endl;
13899 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13900 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13901 *fhp << " " << hex << ino << dec << ")" << dendl;
13902
13903 return r;
13904 }
13905
13906 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13907 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13908 const UserPerm& perms)
13909 {
13910 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13911 if (!mref_reader.is_state_satisfied())
13912 return -CEPHFS_ENOTCONN;
13913
13914 std::scoped_lock lock(client_lock);
13915 InodeRef in;
13916
13917 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13918 fhp, perms);
13919 if (r >= 0) {
13920 ceph_assert(in);
13921
13922 // passing an Inode in outp requires an additional ref
13923 if (outp) {
13924 _ll_get(in.get());
13925 *outp = in.get();
13926 }
13927 fill_stat(in, attr);
13928 } else {
13929 attr->st_ino = 0;
13930 }
13931
13932 return r;
13933 }
13934
13935 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13936 int oflags, Inode **outp, Fh **fhp,
13937 struct ceph_statx *stx, unsigned want, unsigned lflags,
13938 const UserPerm& perms)
13939 {
13940 unsigned caps = statx_to_mask(lflags, want);
13941 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13942 if (!mref_reader.is_state_satisfied())
13943 return -CEPHFS_ENOTCONN;
13944
13945 std::scoped_lock lock(client_lock);
13946 InodeRef in;
13947
13948 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13949 if (r >= 0) {
13950 ceph_assert(in);
13951
13952 // passing an Inode in outp requires an additional ref
13953 if (outp) {
13954 _ll_get(in.get());
13955 *outp = in.get();
13956 }
13957 fill_statx(in, caps, stx);
13958 } else {
13959 stx->stx_ino = 0;
13960 stx->stx_mask = 0;
13961 }
13962
13963 return r;
13964 }
13965
13966 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13967 {
13968 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13969 if (!mref_reader.is_state_satisfied())
13970 return -CEPHFS_ENOTCONN;
13971
13972 tout(cct) << "ll_lseek" << std::endl;
13973 tout(cct) << offset << std::endl;
13974 tout(cct) << whence << std::endl;
13975
13976 std::scoped_lock lock(client_lock);
13977 return _lseek(fh, offset, whence);
13978 }
13979
13980 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13981 {
13982 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13983 if (!mref_reader.is_state_satisfied())
13984 return -CEPHFS_ENOTCONN;
13985
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
13987 tout(cct) << "ll_read" << std::endl;
13988 tout(cct) << (uintptr_t)fh << std::endl;
13989 tout(cct) << off << std::endl;
13990 tout(cct) << len << std::endl;
13991
  /* We can't return bytes read larger than INT_MAX, clamp len to that */
13993 len = std::min(len, (loff_t)INT_MAX);
13994 std::scoped_lock lock(client_lock);
13995
13996 int r = _read(fh, off, len, bl);
13997 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
13998 << dendl;
13999 return r;
14000 }
14001
14002 int Client::ll_read_block(Inode *in, uint64_t blockid,
14003 char *buf,
14004 uint64_t offset,
14005 uint64_t length,
14006 file_layout_t* layout)
14007 {
14008 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14009 if (!mref_reader.is_state_satisfied())
14010 return -CEPHFS_ENOTCONN;
14011
14012 vinodeno_t vino = _get_vino(in);
14013 object_t oid = file_object_t(vino.ino, blockid);
14014 C_SaferCond onfinish;
14015 bufferlist bl;
14016
14017 objecter->read(oid,
14018 object_locator_t(layout->pool_id),
14019 offset,
14020 length,
14021 vino.snapid,
14022 &bl,
14023 CEPH_OSD_FLAG_READ,
14024 &onfinish);
14025
14026 int r = onfinish.wait();
14027 if (r >= 0) {
14028 bl.begin().copy(bl.length(), buf);
14029 r = bl.length();
14030 }
14031
14032 return r;
14033 }
14034
/* It appears that the OSD doesn't return success unless the entire
   buffer was written; return the write length on success. */
14037
14038 int Client::ll_write_block(Inode *in, uint64_t blockid,
14039 char* buf, uint64_t offset,
14040 uint64_t length, file_layout_t* layout,
14041 uint64_t snapseq, uint32_t sync)
14042 {
14043 vinodeno_t vino = ll_get_vino(in);
14044 int r = 0;
14045 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14046
14047 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14048 if (!mref_reader.is_state_satisfied())
14049 return -CEPHFS_ENOTCONN;
14050
14051 if (length == 0) {
14052 return -CEPHFS_EINVAL;
14053 }
  if (true || sync) {
    /* writes are currently always treated as stable (the "true ||"
     * forces it), so the epilogue below waits on onsafe before
     * returning */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
14059 object_t oid = file_object_t(vino.ino, blockid);
14060 SnapContext fakesnap;
14061 ceph::bufferlist bl;
14062 if (length > 0) {
14063 bl.push_back(buffer::copy(buf, length));
14064 }
14065
  ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
                << dendl;
14068
14069 fakesnap.seq = snapseq;
14070
14071 /* lock just in time */
14072 objecter->write(oid,
14073 object_locator_t(layout->pool_id),
14074 offset,
14075 length,
14076 fakesnap,
14077 bl,
14078 ceph::real_clock::now(),
14079 0,
14080 onsafe.get());
14081
14082 if (nullptr != onsafe) {
14083 r = onsafe->wait();
14084 }
14085
14086 if (r < 0) {
14087 return r;
14088 } else {
14089 return length;
14090 }
14091 }
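
/*
 * Minimal usage sketch for ll_write_block() (hypothetical caller; the
 * buffer, block id and snap sequence are assumptions for illustration).
 * Because the OSD only reports success for a complete write, a return
 * equal to the requested length is the only success case:
 *
 *   char buf[4096] = {0};
 *   int r = client->ll_write_block(in, 0, buf, 0, sizeof(buf),
 *                                  &in->layout, snapseq, 1);
 *   if (r != (int)sizeof(buf)) {
 *     // r < 0 carries the error; a short write is never returned
 *   }
 */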
14092
14093 int Client::ll_commit_blocks(Inode *in,
14094 uint64_t offset,
14095 uint64_t length)
14096 {
14097 /*
14098 BarrierContext *bctx;
14099 vinodeno_t vino = _get_vino(in);
14100 uint64_t ino = vino.ino;
14101
14102 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14103 << offset << " to " << length << dendl;
14104
14105 if (length == 0) {
14106 return -CEPHFS_EINVAL;
14107 }
14108
14109 std::scoped_lock lock(client_lock);
14110 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14111 if (p != barriers.end()) {
14112 barrier_interval civ(offset, offset + length);
14113 p->second->commit_barrier(civ);
14114 }
14115 */
14116 return 0;
14117 }
14118
14119 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14120 {
14121 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14122 "~" << len << dendl;
14123 tout(cct) << "ll_write" << std::endl;
14124 tout(cct) << (uintptr_t)fh << std::endl;
14125 tout(cct) << off << std::endl;
14126 tout(cct) << len << std::endl;
14127
14128 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14129 if (!mref_reader.is_state_satisfied())
14130 return -CEPHFS_ENOTCONN;
14131
14132 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14133 len = std::min(len, (loff_t)INT_MAX);
14134 std::scoped_lock lock(client_lock);
14135
14136 int r = _write(fh, off, len, data, NULL, 0);
14137 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14138 << dendl;
14139 return r;
14140 }
14141
14142 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14143 {
14144 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14145 if (!mref_reader.is_state_satisfied())
14146 return -CEPHFS_ENOTCONN;
14147
14148 std::unique_lock cl(client_lock);
14149 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false, cl);
14150 }
14151
14152 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14153 {
14154 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14155 if (!mref_reader.is_state_satisfied())
14156 return -CEPHFS_ENOTCONN;
14157
14158 std::unique_lock cl(client_lock);
14159 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false, cl);
14160 }
14161
14162 int Client::ll_flush(Fh *fh)
14163 {
14164 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14165 if (!mref_reader.is_state_satisfied())
14166 return -CEPHFS_ENOTCONN;
14167
14168 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14169 tout(cct) << "ll_flush" << std::endl;
14170 tout(cct) << (uintptr_t)fh << std::endl;
14171
14172 std::scoped_lock lock(client_lock);
14173 return _flush(fh);
14174 }
14175
14176 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14177 {
14178 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14179 if (!mref_reader.is_state_satisfied())
14180 return -CEPHFS_ENOTCONN;
14181
14182 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14183 tout(cct) << "ll_fsync" << std::endl;
14184 tout(cct) << (uintptr_t)fh << std::endl;
14185
14186 std::scoped_lock lock(client_lock);
14187 int r = _fsync(fh, syncdataonly);
14188 if (r) {
14189 // If we're returning an error, clear it from the FH
14190 fh->take_async_err();
14191 }
14192 return r;
14193 }
14194
14195 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14196 {
14197 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14198 if (!mref_reader.is_state_satisfied())
14199 return -CEPHFS_ENOTCONN;
14200
14201 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14202 tout(cct) << "ll_sync_inode" << std::endl;
14203 tout(cct) << (uintptr_t)in << std::endl;
14204
14205 std::scoped_lock lock(client_lock);
14206 return _fsync(in, syncdataonly);
14207 }
14208
14209 #ifdef FALLOC_FL_PUNCH_HOLE
14210
14211 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14212 {
14213 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14214
14215 if (offset < 0 || length <= 0)
14216 return -CEPHFS_EINVAL;
14217
14218 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14219 return -CEPHFS_EOPNOTSUPP;
14220
14221 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14222 return -CEPHFS_EOPNOTSUPP;
14223
14224 Inode *in = fh->inode.get();
14225
14226 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14227 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14228 return -CEPHFS_ENOSPC;
14229 }
14230
14231 if (in->snapid != CEPH_NOSNAP)
14232 return -CEPHFS_EROFS;
14233
14234 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14235 return -CEPHFS_EBADF;
14236
14237 uint64_t size = offset + length;
14238 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14239 size > in->size &&
14240 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14241 return -CEPHFS_EDQUOT;
14242 }
14243
14244 int have;
14245 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14246 if (r < 0)
14247 return r;
14248
14249 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14250 if (mode & FALLOC_FL_PUNCH_HOLE) {
14251 if (in->inline_version < CEPH_INLINE_NONE &&
14252 (have & CEPH_CAP_FILE_BUFFER)) {
14253 bufferlist bl;
14254 auto inline_iter = in->inline_data.cbegin();
14255 int len = in->inline_data.length();
14256 if (offset < len) {
14257 if (offset > 0)
14258 inline_iter.copy(offset, bl);
14259 int size = length;
14260 if (offset + size > len)
14261 size = len - offset;
14262 if (size > 0)
14263 bl.append_zero(size);
14264 if (offset + size < len) {
14265 inline_iter += size;
14266 inline_iter.copy(len - offset - size, bl);
14267 }
14268 in->inline_data = bl;
14269 in->inline_version++;
14270 }
14271 in->mtime = in->ctime = ceph_clock_now();
14272 in->change_attr++;
14273 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14274 } else {
14275 if (in->inline_version < CEPH_INLINE_NONE) {
14276 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14277 uninline_data(in, onuninline.get());
14278 }
14279
14280 C_SaferCond onfinish("Client::_punch_hole flock");
14281
14282 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14283
14284 _invalidate_inode_cache(in, offset, length);
14285 filer->zero(in->ino, &in->layout,
14286 in->snaprealm->get_snap_context(),
14287 offset, length,
14288 ceph::real_clock::now(),
14289 0, true, &onfinish);
14290 in->mtime = in->ctime = ceph_clock_now();
14291 in->change_attr++;
14292 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14293
14294 client_lock.unlock();
14295 onfinish.wait();
14296 client_lock.lock();
14297 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14298 }
14299 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14300 uint64_t size = offset + length;
14301 if (size > in->size) {
14302 in->size = size;
14303 in->mtime = in->ctime = ceph_clock_now();
14304 in->change_attr++;
14305 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14306
14307 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14308 check_caps(in, CHECK_CAPS_NODELAY);
14309 } else if (is_max_size_approaching(in)) {
14310 check_caps(in, 0);
14311 }
14312 }
14313 }
14314
14315 if (nullptr != onuninline) {
14316 client_lock.unlock();
14317 int ret = onuninline->wait();
14318 client_lock.lock();
14319
14320 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14321 in->inline_data.clear();
14322 in->inline_version = CEPH_INLINE_NONE;
14323 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14324 check_caps(in, 0);
14325 } else
14326 r = ret;
14327 }
14328
14329 put_cap_ref(in, CEPH_CAP_FILE_WR);
14330 return r;
14331 }
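
/*
 * Sketch of the inline-data punch-hole splice above (buffer contents
 * assumed for illustration): with inline_data = "ABCDEFGH" (len 8),
 * offset = 2 and length = 3, the replacement buffer is assembled as
 *
 *   "AB"        bytes [0, offset) copied from the old data
 *   "\0\0\0"    the zeroed hole, size = min(length, len - offset)
 *   "FGH"       the tail, bytes [offset + size, len)
 *
 * so the inline length stays 8 and only the hole is cleared.
 */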
14332 #else
14333
14334 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14335 {
14336 return -CEPHFS_EOPNOTSUPP;
14337 }
14338
14339 #endif
14340
14341
14342 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14343 {
14344 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14345 if (!mref_reader.is_state_satisfied())
14346 return -CEPHFS_ENOTCONN;
14347
14348 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14349 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14350 tout(cct) << (uintptr_t)fh << std::endl;
14351
14352 std::scoped_lock lock(client_lock);
14353 return _fallocate(fh, mode, offset, length);
14354 }
14355
14356 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14357 {
14358 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14359 if (!mref_reader.is_state_satisfied())
14360 return -CEPHFS_ENOTCONN;
14361
  tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14363
14364 std::scoped_lock lock(client_lock);
14365 Fh *fh = get_filehandle(fd);
14366 if (!fh)
14367 return -CEPHFS_EBADF;
14368 #if defined(__linux__) && defined(O_PATH)
14369 if (fh->flags & O_PATH)
14370 return -CEPHFS_EBADF;
14371 #endif
14372 return _fallocate(fh, mode, offset, length);
14373 }
14374
14375 int Client::ll_release(Fh *fh)
14376 {
14377 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14378 if (!mref_reader.is_state_satisfied())
14379 return -CEPHFS_ENOTCONN;
14380
14381 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
14382 dendl;
14383 tout(cct) << __func__ << " (fh)" << std::endl;
14384 tout(cct) << (uintptr_t)fh << std::endl;
14385
14386 std::scoped_lock lock(client_lock);
14387
14388 if (ll_unclosed_fh_set.count(fh))
14389 ll_unclosed_fh_set.erase(fh);
14390 return _release_fh(fh);
14391 }
14392
14393 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14394 {
14395 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14396 if (!mref_reader.is_state_satisfied())
14397 return -CEPHFS_ENOTCONN;
14398
14399 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (uintptr_t)fh << std::endl;
14401
14402 std::scoped_lock lock(client_lock);
14403 return _getlk(fh, fl, owner);
14404 }
14405
14406 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14407 {
14408 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14409 if (!mref_reader.is_state_satisfied())
14410 return -CEPHFS_ENOTCONN;
14411
14412 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14413 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14414
14415 std::scoped_lock lock(client_lock);
14416 return _setlk(fh, fl, owner, sleep);
14417 }
14418
14419 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14420 {
14421 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14422 if (!mref_reader.is_state_satisfied())
14423 return -CEPHFS_ENOTCONN;
14424
14425 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14426 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14427
14428 std::scoped_lock lock(client_lock);
14429 return _flock(fh, cmd, owner);
14430 }
14431
14432 int Client::set_deleg_timeout(uint32_t timeout)
14433 {
14434 std::scoped_lock lock(client_lock);
14435
14436 /*
14437 * The whole point is to prevent blocklisting so we must time out the
14438 * delegation before the session autoclose timeout kicks in.
14439 */
14440 if (timeout >= mdsmap->get_session_autoclose())
14441 return -CEPHFS_EINVAL;
14442
14443 deleg_timeout = timeout;
14444 return 0;
14445 }
14446
14447 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14448 {
14449 int ret = -CEPHFS_EINVAL;
14450
14451 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14452 if (!mref_reader.is_state_satisfied())
14453 return -CEPHFS_ENOTCONN;
14454
14455 std::scoped_lock lock(client_lock);
14456
14457 Inode *inode = fh->inode.get();
14458
14459 switch(cmd) {
14460 case CEPH_DELEGATION_NONE:
14461 inode->unset_deleg(fh);
14462 ret = 0;
14463 break;
14464 default:
14465 try {
14466 ret = inode->set_deleg(fh, cmd, cb, priv);
14467 } catch (std::bad_alloc&) {
14468 ret = -CEPHFS_ENOMEM;
14469 }
14470 break;
14471 }
14472 return ret;
14473 }
14474
14475 class C_Client_RequestInterrupt : public Context {
14476 private:
14477 Client *client;
14478 MetaRequest *req;
14479 public:
14480 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14481 req->get();
14482 }
14483 void finish(int r) override {
14484 std::scoped_lock l(client->client_lock);
14485 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14486 client->_interrupt_filelock(req);
14487 client->put_request(req);
14488 }
14489 };
14490
14491 void Client::ll_interrupt(void *d)
14492 {
14493 MetaRequest *req = static_cast<MetaRequest*>(d);
14494 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14495 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
14496 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14497 }
14498
14499 // =========================================
14500 // layout
14501
14502 // expose file layouts
14503
14504 int Client::describe_layout(const char *relpath, file_layout_t *lp,
14505 const UserPerm& perms)
14506 {
14507 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14508 if (!mref_reader.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN;
14510
14511 std::scoped_lock lock(client_lock);
14512
14513 filepath path(relpath);
14514 InodeRef in;
14515 int r = path_walk(path, &in, perms);
14516 if (r < 0)
14517 return r;
14518
14519 *lp = in->layout;
14520
14521 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
14522 return 0;
14523 }
14524
14525 int Client::fdescribe_layout(int fd, file_layout_t *lp)
14526 {
14527 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14528 if (!mref_reader.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN;
14530
14531 std::scoped_lock lock(client_lock);
14532
14533 Fh *f = get_filehandle(fd);
14534 if (!f)
14535 return -CEPHFS_EBADF;
14536 Inode *in = f->inode.get();
14537
14538 *lp = in->layout;
14539
14540 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
14541 return 0;
14542 }
14543
14544 int64_t Client::get_default_pool_id()
14545 {
14546 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14547 if (!mref_reader.is_state_satisfied())
14548 return -CEPHFS_ENOTCONN;
14549
14550 std::scoped_lock lock(client_lock);
14551
14552 /* first data pool is the default */
14553 return mdsmap->get_first_data_pool();
14554 }
14555
14556 // expose osdmap
14557
14558 int64_t Client::get_pool_id(const char *pool_name)
14559 {
14560 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14561 if (!mref_reader.is_state_satisfied())
14562 return -CEPHFS_ENOTCONN;
14563
14564 std::scoped_lock lock(client_lock);
14565
14566 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14567 pool_name);
14568 }
14569
14570 string Client::get_pool_name(int64_t pool)
14571 {
14572 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14573 if (!mref_reader.is_state_satisfied())
14574 return string();
14575
14576 std::scoped_lock lock(client_lock);
14577
14578 return objecter->with_osdmap([pool](const OSDMap& o) {
14579 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14580 });
14581 }
14582
14583 int Client::get_pool_replication(int64_t pool)
14584 {
14585 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14586 if (!mref_reader.is_state_satisfied())
14587 return -CEPHFS_ENOTCONN;
14588
14589 std::scoped_lock lock(client_lock);
14590
14591 return objecter->with_osdmap([pool](const OSDMap& o) {
14592 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
14593 });
14594 }
14595
14596 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14597 {
14598 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14599 if (!mref_reader.is_state_satisfied())
14600 return -CEPHFS_ENOTCONN;
14601
14602 std::scoped_lock lock(client_lock);
14603
14604 Fh *f = get_filehandle(fd);
14605 if (!f)
14606 return -CEPHFS_EBADF;
14607 Inode *in = f->inode.get();
14608
14609 vector<ObjectExtent> extents;
14610 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
14611 ceph_assert(extents.size() == 1);
14612
14613 objecter->with_osdmap([&](const OSDMap& o) {
14614 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14615 o.pg_to_acting_osds(pg, osds);
14616 });
14617
14618 if (osds.empty())
14619 return -CEPHFS_EINVAL;
14620
14621 /*
14622 * Return the remainder of the extent (stripe unit)
14623 *
14624 * If length = 1 is passed to Striper::file_to_extents we get a single
14625 * extent back, but its length is one so we still need to compute the length
14626 * to the end of the stripe unit.
14627 *
14628 * If length = su then we may get 1 or 2 objects back in the extents vector
14629 * which would have to be examined. Even then, the offsets are local to the
14630 * object, so matching up to the file offset is extra work.
14631 *
14632 * It seems simpler to stick with length = 1 and manually compute the
14633 * remainder.
14634 */
14635 if (len) {
14636 uint64_t su = in->layout.stripe_unit;
14637 *len = su - (off % su);
14638 }
14639
14640 return 0;
14641 }
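
/*
 * Example of the remainder computation above (stripe unit assumed to be
 * 1 MiB for illustration): for off = 2.5 MiB,
 *   *len = su - (off % su) = 1 MiB - 0.5 MiB = 0.5 MiB,
 * i.e. the returned extent runs from off to the end of its stripe unit.
 */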
14642
14643 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14644 {
14645 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14646 if (!mref_reader.is_state_satisfied())
14647 return -CEPHFS_ENOTCONN;
14648
14649 std::scoped_lock lock(client_lock);
14650
14651 if (id < 0)
14652 return -CEPHFS_EINVAL;
14653 return objecter->with_osdmap([&](const OSDMap& o) {
14654 return o.crush->get_full_location_ordered(id, path);
14655 });
14656 }
14657
14658 int Client::get_file_stripe_address(int fd, loff_t offset,
14659 vector<entity_addr_t>& address)
14660 {
14661 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14662 if (!mref_reader.is_state_satisfied())
14663 return -CEPHFS_ENOTCONN;
14664
14665 std::scoped_lock lock(client_lock);
14666
14667 Fh *f = get_filehandle(fd);
14668 if (!f)
14669 return -CEPHFS_EBADF;
14670 Inode *in = f->inode.get();
14671
14672 // which object?
14673 vector<ObjectExtent> extents;
14674 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
14675 in->truncate_size, extents);
14676 ceph_assert(extents.size() == 1);
14677
14678 // now we have the object and its 'layout'
14679 return objecter->with_osdmap([&](const OSDMap& o) {
14680 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14681 vector<int> osds;
14682 o.pg_to_acting_osds(pg, osds);
14683 if (osds.empty())
14684 return -CEPHFS_EINVAL;
14685 for (unsigned i = 0; i < osds.size(); i++) {
14686 entity_addr_t addr = o.get_addrs(osds[i]).front();
14687 address.push_back(addr);
14688 }
14689 return 0;
14690 });
14691 }
14692
14693 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14694 {
14695 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14696 if (!mref_reader.is_state_satisfied())
14697 return -CEPHFS_ENOTCONN;
14698
14699 std::scoped_lock lock(client_lock);
14700
14701 return objecter->with_osdmap([&](const OSDMap& o) {
14702 if (!o.exists(osd))
14703 return -CEPHFS_ENOENT;
14704
14705 addr = o.get_addrs(osd).front();
14706 return 0;
14707 });
14708 }
14709
14710 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14711 loff_t length, loff_t offset)
14712 {
14713 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14714 if (!mref_reader.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN;
14716
14717 std::scoped_lock lock(client_lock);
14718
14719 Fh *f = get_filehandle(fd);
14720 if (!f)
14721 return -CEPHFS_EBADF;
14722 Inode *in = f->inode.get();
14723
14724 // map to a list of extents
14725 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14726
14727 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14728 return 0;
14729 }
14730
14731
14732 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
14733 int Client::get_local_osd()
14734 {
14735 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14736 if (!mref_reader.is_state_satisfied())
14737 return -CEPHFS_ENOTCONN;
14738
14739 std::scoped_lock lock(client_lock);
14740
14741 objecter->with_osdmap([this](const OSDMap& o) {
14742 if (o.get_epoch() != local_osd_epoch) {
14743 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
14744 local_osd_epoch = o.get_epoch();
14745 }
14746 });
14747 return local_osd;
14748 }
14749
14750
14751
14752
14753
14754
14755 // ===============================
14756
14757 void Client::ms_handle_connect(Connection *con)
14758 {
14759 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14760 }
14761
14762 bool Client::ms_handle_reset(Connection *con)
14763 {
14764 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14765 return false;
14766 }
14767
14768 void Client::ms_handle_remote_reset(Connection *con)
14769 {
14770 std::scoped_lock lock(client_lock);
14771 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14772 switch (con->get_peer_type()) {
14773 case CEPH_ENTITY_TYPE_MDS:
14774 {
14775 // kludge to figure out which mds this is; fixme with a Connection* state
14776 mds_rank_t mds = MDS_RANK_NONE;
14777 MetaSession *s = NULL;
14778 for (auto &p : mds_sessions) {
14779 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14780 mds = p.first;
14781 s = &p.second;
14782 }
14783 }
14784 if (mds >= 0) {
      ceph_assert(s != NULL);
14786 switch (s->state) {
14787 case MetaSession::STATE_CLOSING:
14788 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14789 _closed_mds_session(s);
14790 break;
14791
14792 case MetaSession::STATE_OPENING:
14793 {
14794 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14795 list<Context*> waiters;
14796 waiters.swap(s->waiting_for_open);
14797 _closed_mds_session(s);
14798 MetaSession *news = _get_or_open_mds_session(mds);
14799 news->waiting_for_open.swap(waiters);
14800 }
14801 break;
14802
14803 case MetaSession::STATE_OPEN:
14804 {
14805 objecter->maybe_request_map(); /* to check if we are blocklisted */
14806 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
14807 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14808 _closed_mds_session(s);
14809 } else {
14810 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14811 s->state = MetaSession::STATE_STALE;
14812 }
14813 }
14814 break;
14815
14816 case MetaSession::STATE_NEW:
14817 case MetaSession::STATE_CLOSED:
14818 default:
14819 break;
14820 }
14821 }
14822 }
14823 break;
14824 }
14825 }
14826
14827 bool Client::ms_handle_refused(Connection *con)
14828 {
14829 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14830 return false;
14831 }
14832
14833 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14834 {
14835 Inode *quota_in = root_ancestor;
14836 SnapRealm *realm = in->snaprealm;
14837 while (realm) {
14838 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14839 if (realm->ino != in->ino) {
14840 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14841 if (p == inode_map.end())
14842 break;
14843
14844 if (p->second->quota.is_enable()) {
14845 quota_in = p->second;
14846 break;
14847 }
14848 }
14849 realm = realm->pparent;
14850 }
14851 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14852 return quota_in;
14853 }
14854
14855 /**
14856 * Traverse the quota ancestors of the Inode, returning true
14857 * if any of them satisfies the given predicate
14858 */
14859 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14860 std::function<bool (const Inode &in)> test)
14861 {
14862 while (true) {
14863 ceph_assert(in != NULL);
14864 if (test(*in)) {
14865 return true;
14866 }
14867
14868 if (in == root_ancestor) {
14869 // We're done traversing, drop out
14870 return false;
14871 } else {
14872 // Continue up the tree
14873 in = get_quota_root(in, perms);
14874 }
14875 }
14876
14877 return false;
14878 }
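// The is_quota_* helpers below are the in-tree callers; an equivalent
// ad-hoc query (illustrative only) for "does any quota ancestor set a
// byte limit?" would be:
//
//   bool limited = check_quota_condition(in, perms,
//       [](const Inode &i) { return i.quota.max_bytes > 0; });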
14879
14880 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14881 {
14882 return check_quota_condition(in, perms,
14883 [](const Inode &in) {
14884 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14885 });
14886 }
14887
14888 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14889 const UserPerm& perms)
14890 {
14891 return check_quota_condition(in, perms,
14892 [&new_bytes](const Inode &in) {
14893 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14894 > in.quota.max_bytes;
14895 });
14896 }
14897
14898 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14899 {
14900 ceph_assert(in->size >= in->reported_size);
14901 const uint64_t size = in->size - in->reported_size;
14902 return check_quota_condition(in, perms,
14903 [&size](const Inode &in) {
14904 if (in.quota.max_bytes) {
14905 if (in.rstat.rbytes >= in.quota.max_bytes) {
14906 return true;
14907 }
14908
14909 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14910 return (space >> 4) < size;
14911 } else {
14912 return false;
14913 }
14914 });
14915 }
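// Worked example of the (space >> 4) heuristic above (illustrative numbers):
// with quota.max_bytes = 1 GiB and rstat.rbytes = 1008 MiB there are 16 MiB
// of headroom left, so this returns true once the file's unreported growth
// (size - reported_size) exceeds 16 MiB / 16 = 1 MiB, i.e. once pending
// writes could consume more than 1/16 of the remaining quota.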
14916
14917 enum {
14918 POOL_CHECKED = 1,
14919 POOL_CHECKING = 2,
14920 POOL_READ = 4,
14921 POOL_WRITE = 8,
14922 };
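// Values cached in pool_perms are bitmasks of these flags; for example
// (POOL_CHECKED | POOL_READ) records a finished probe that granted read
// but not write access to the pool.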
14923
14924 int Client::check_pool_perm(Inode *in, int need)
14925 {
14926 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14927
14928 if (!cct->_conf->client_check_pool_perm)
14929 return 0;
14930
14931 /* Only need to do this for regular files */
14932 if (!in->is_file())
14933 return 0;
14934
14935 int64_t pool_id = in->layout.pool_id;
14936 std::string pool_ns = in->layout.pool_ns;
14937 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14938 int have = 0;
14939 while (true) {
14940 auto it = pool_perms.find(perm_key);
14941 if (it == pool_perms.end())
14942 break;
14943 if (it->second == POOL_CHECKING) {
14944 // avoid concurrent checks of the same pool
14945 wait_on_list(waiting_for_pool_perm);
14946 } else {
14947 have = it->second;
14948 ceph_assert(have & POOL_CHECKED);
14949 break;
14950 }
14951 }
14952
14953 if (!have) {
14954 if (in->snapid != CEPH_NOSNAP) {
14955 // The pool permission check needs to write to the first object. But for a
14956 // snapshot, the head of the first object may have already been deleted. To
14957 // avoid creating an orphan object, skip the check for now.
14958 return 0;
14959 }
14960
14961 pool_perms[perm_key] = POOL_CHECKING;
14962
14963 char oid_buf[32];
14964 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14965 object_t oid = oid_buf;
14966
14967 SnapContext nullsnapc;
14968
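// Probe read access with a stat of the file's first object, and write
// access with an exclusive create of the same object. As handled below,
// -CEPHFS_ENOENT and -CEPHFS_EEXIST replies still count as permission
// granted: the OSD accepted the op, the object state just differed.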
14969 C_SaferCond rd_cond;
14970 ObjectOperation rd_op;
14971 rd_op.stat(nullptr, nullptr, nullptr);
14972
14973 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14974 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14975
14976 C_SaferCond wr_cond;
14977 ObjectOperation wr_op;
14978 wr_op.create(true);
14979
14980 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14981 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14982
14983 client_lock.unlock();
14984 int rd_ret = rd_cond.wait();
14985 int wr_ret = wr_cond.wait();
14986 client_lock.lock();
14987
14988 bool errored = false;
14989
14990 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
14991 have |= POOL_READ;
14992 else if (rd_ret != -CEPHFS_EPERM) {
14993 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14994 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14995 errored = true;
14996 }
14997
14998 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
14999 have |= POOL_WRITE;
15000 else if (wr_ret != -CEPHFS_EPERM) {
15001 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15002 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15003 errored = true;
15004 }
15005
15006 if (errored) {
15007 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15008 // Raise EIO because the actual error code might be misleading to a
15009 // userspace filesystem user.
15010 pool_perms.erase(perm_key);
15011 signal_cond_list(waiting_for_pool_perm);
15012 return -CEPHFS_EIO;
15013 }
15014
15015 pool_perms[perm_key] = have | POOL_CHECKED;
15016 signal_cond_list(waiting_for_pool_perm);
15017 }
15018
15019 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15020 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15021 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15022 return -CEPHFS_EPERM;
15023 }
15024 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15025 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15026 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15027 return -CEPHFS_EPERM;
15028 }
15029
15030 return 0;
15031 }
15032
15033 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15034 {
15035 if (acl_type == POSIX_ACL) {
15036 if (in->xattrs.count(ACL_EA_ACCESS)) {
15037 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15038
15039 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15040 }
15041 }
15042 return -CEPHFS_EAGAIN;
15043 }
15044
15045 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15046 {
15047 if (acl_type == NO_ACL)
15048 return 0;
15049
15050 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15051 if (r < 0)
15052 goto out;
15053
15054 if (acl_type == POSIX_ACL) {
15055 if (in->xattrs.count(ACL_EA_ACCESS)) {
15056 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15057 bufferptr acl(access_acl.c_str(), access_acl.length());
15058 r = posix_acl_access_chmod(acl, mode);
15059 if (r < 0)
15060 goto out;
15061 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15062 } else {
15063 r = 0;
15064 }
15065 }
15066 out:
15067 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15068 return r;
15069 }
15070
15071 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15072 const UserPerm& perms)
15073 {
15074 if (acl_type == NO_ACL)
15075 return 0;
15076
15077 if (S_ISLNK(*mode))
15078 return 0;
15079
15080 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15081 if (r < 0)
15082 goto out;
15083
15084 if (acl_type == POSIX_ACL) {
15085 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15086 map<string, bufferptr> xattrs;
15087
15088 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15089 bufferptr acl(default_acl.c_str(), default_acl.length());
15090 r = posix_acl_inherit_mode(acl, mode);
15091 if (r < 0)
15092 goto out;
15093
15094 if (r > 0) {
15095 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15096 if (r < 0)
15097 goto out;
15098 if (r > 0)
15099 xattrs[ACL_EA_ACCESS] = acl;
15100 }
15101
15102 if (S_ISDIR(*mode))
15103 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15104
15105 r = xattrs.size();
15106 if (r > 0)
15107 encode(xattrs, xattrs_bl);
15108 } else {
15109 if (umask_cb)
15110 *mode &= ~umask_cb(callback_handle);
15111 r = 0;
15112 }
15113 }
15114 out:
15115 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15116 return r;
15117 }
15118
15119 void Client::set_filer_flags(int flags)
15120 {
15121 std::scoped_lock l(client_lock);
15122 ceph_assert(flags == 0 ||
15123 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15124 objecter->add_global_op_flags(flags);
15125 }
15126
15127 void Client::clear_filer_flags(int flags)
15128 {
15129 std::scoped_lock l(client_lock);
15130 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15131 objecter->clear_global_op_flag(flags);
15132 }
15133
15134 // called before mount
15135 void Client::set_uuid(const std::string& uuid)
15136 {
15137 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15138 ceph_assert(iref_reader.is_state_satisfied());
15139
15140 std::scoped_lock l(client_lock);
15141 ceph_assert(!uuid.empty());
15142
15143 metadata["uuid"] = uuid;
15144 _close_sessions();
15145 }
15146
15147 // called before mount; a timeout of 0 means infinite
15148 void Client::set_session_timeout(unsigned timeout)
15149 {
15150 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15151 ceph_assert(iref_reader.is_state_satisfied());
15152
15153 std::scoped_lock l(client_lock);
15154
15155 metadata["timeout"] = stringify(timeout);
15156 }
15157
15158 // called before mount
15159 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15160 const std::string& fs_name)
15161 {
15162 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15163 if (!iref_reader.is_state_satisfied())
15164 return -CEPHFS_ENOTCONN;
15165
15166 if (uuid.empty())
15167 return -CEPHFS_EINVAL;
15168
15169 std::unique_lock l(client_lock);
15170 {
15171 auto it = metadata.find("uuid");
15172 if (it != metadata.end() && it->second == uuid)
15173 return -CEPHFS_EINVAL;
15174 }
15175
15176 int r = subscribe_mdsmap(fs_name);
15177 if (r < 0) {
15178 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15179 return r;
15180 }
15181
15182 if (metadata.empty())
15183 populate_metadata("");
15184
15185 while (mdsmap->get_epoch() == 0)
15186 wait_on_list(waiting_for_mdsmap);
15187
15188 reclaim_errno = 0;
15189 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15190 if (!mdsmap->is_up(mds)) {
15191 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15192 wait_on_list(waiting_for_mdsmap);
15193 continue;
15194 }
15195
15196 MetaSession *session;
15197 if (!have_open_session(mds)) {
15198 session = _get_or_open_mds_session(mds);
15199 if (session->state == MetaSession::STATE_REJECTED)
15200 return -CEPHFS_EPERM;
15201 if (session->state != MetaSession::STATE_OPENING) {
15202 // unmounting?
15203 return -CEPHFS_EINVAL;
15204 }
15205 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15206 wait_on_context_list(session->waiting_for_open);
15207 continue;
15208 }
15209
15210 session = &mds_sessions.at(mds);
15211 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15212 return -CEPHFS_EOPNOTSUPP;
15213
15214 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15215 session->reclaim_state == MetaSession::RECLAIMING) {
15216 session->reclaim_state = MetaSession::RECLAIMING;
15217 auto m = make_message<MClientReclaim>(uuid, flags);
15218 session->con->send_message2(std::move(m));
15219 wait_on_list(waiting_for_reclaim);
15220 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15221 return reclaim_errno ? reclaim_errno : -CEPHFS_ENOTRECOVERABLE;
15222 } else {
15223 mds++;
15224 }
15225 }
15226
15227 // didn't find the target session on any MDS
15228 if (reclaim_target_addrs.empty()) {
15229 if (flags & CEPH_RECLAIM_RESET)
15230 return -CEPHFS_ENOENT;
15231 return -CEPHFS_ENOTRECOVERABLE;
15232 }
15233
15234 if (flags & CEPH_RECLAIM_RESET)
15235 return 0;
15236
15237 // use blocklist to check if target session was killed
15238 // (config option mds_session_blocklist_on_evict needs to be true)
15239 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15240 bs::error_code ec;
15241 l.unlock();
15242 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15243 l.lock();
15244
15245 if (ec)
15246 return ceph::from_error_code(ec);
15247
15248 bool blocklisted = objecter->with_osdmap(
15249 [this](const OSDMap &osd_map) -> bool {
15250 return osd_map.is_blocklisted(reclaim_target_addrs);
15251 });
15252 if (blocklisted)
15253 return -CEPHFS_ENOTRECOVERABLE;
15254
15255 metadata["reclaiming_uuid"] = uuid;
15256 return 0;
15257 }
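// Illustrative takeover sequence (a sketch with hypothetical names; all of
// these calls must happen before mount):
//
//   client->set_session_timeout(60);
//   int r = client->start_reclaim(old_uuid, 0, fs_name);
//   if (r == 0)
//     client->finish_reclaim();  // adopts old_uuid and notifies the MDSs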
15258
15259 void Client::finish_reclaim()
15260 {
15261 auto it = metadata.find("reclaiming_uuid");
15262 if (it == metadata.end()) {
15263 for (auto &p : mds_sessions)
15264 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15265 return;
15266 }
15267
15268 for (auto &p : mds_sessions) {
15269 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15270 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15271 p.second.con->send_message2(std::move(m));
15272 }
15273
15274 metadata["uuid"] = it->second;
15275 metadata.erase(it);
15276 }
15277
15278 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15279 {
15280 mds_rank_t from = mds_rank_t(reply->get_source().num());
15281 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15282
15283 std::scoped_lock cl(client_lock);
15284 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
15285 if (!session) {
15286 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15287 return;
15288 }
15289
15290 if (reply->get_result() >= 0) {
15291 session->reclaim_state = MetaSession::RECLAIM_OK;
15292 if (reply->get_epoch() > reclaim_osd_epoch)
15293 reclaim_osd_epoch = reply->get_epoch();
15294 if (!reply->get_addrs().empty())
15295 reclaim_target_addrs = reply->get_addrs();
15296 } else {
15297 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15298 reclaim_errno = reply->get_result();
15299 }
15300
15301 signal_cond_list(waiting_for_reclaim);
15302 }
15303
15304 /**
15305 * This is included in cap release messages, to cause
15306 * the MDS to wait until this OSD map epoch. It is necessary
15307 * in corner cases where we cancel RADOS ops, so that
15308 * nobody else tries to do IO to the same objects in
15309 * the same epoch as the cancelled ops.
15310 */
15311 void Client::set_cap_epoch_barrier(epoch_t e)
15312 {
15313 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15314 cap_epoch_barrier = e;
15315 }
15316
15317 const char** Client::get_tracked_conf_keys() const
15318 {
15319 static const char* keys[] = {
15320 "client_cache_size",
15321 "client_cache_mid",
15322 "client_acl_type",
15323 "client_deleg_timeout",
15324 "client_deleg_break_on_open",
15325 "client_oc_size",
15326 "client_oc_max_objects",
15327 "client_oc_max_dirty",
15328 "client_oc_target_dirty",
15329 "client_oc_max_dirty_age",
15330 NULL
15331 };
15332 return keys;
15333 }
15334
15335 void Client::handle_conf_change(const ConfigProxy& conf,
15336 const std::set <std::string> &changed)
15337 {
15338 std::scoped_lock lock(client_lock);
15339
15340 if (changed.count("client_cache_mid")) {
15341 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15342 }
15343 if (changed.count("client_acl_type")) {
15344 acl_type = NO_ACL;
15345 if (cct->_conf->client_acl_type == "posix_acl")
15346 acl_type = POSIX_ACL;
15347 }
15348 if (changed.count("client_oc_size")) {
15349 objectcacher->set_max_size(cct->_conf->client_oc_size);
15350 }
15351 if (changed.count("client_oc_max_objects")) {
15352 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15353 }
15354 if (changed.count("client_oc_max_dirty")) {
15355 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15356 }
15357 if (changed.count("client_oc_target_dirty")) {
15358 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15359 }
15360 if (changed.count("client_oc_max_dirty_age")) {
15361 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15362 }
15363 }
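// Illustrative runtime override (a sketch; set_val() and apply_changes()
// are the generic ConfigProxy entry points used throughout Ceph):
//
//   cct->_conf.set_val("client_oc_size", "268435456");  // 256 MiB cache
//   cct->_conf.apply_changes(nullptr);  // notifies handle_conf_change()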
15364
15365 void intrusive_ptr_add_ref(Inode *in)
15366 {
15367 in->get();
15368 }
15369
15370 void intrusive_ptr_release(Inode *in)
15371 {
15372 in->client->put_inode(in);
15373 }
15374
15375 mds_rank_t Client::_get_random_up_mds() const
15376 {
15377 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15378
15379 std::set<mds_rank_t> up;
15380 mdsmap->get_up_mds_set(up);
15381
15382 if (up.empty())
15383 return MDS_RANK_NONE;
15384 std::set<mds_rank_t>::const_iterator p = up.begin();
15385 for (int n = rand() % up.size(); n; n--)
15386 ++p;
15387 return *p;
15388 }
15389
15390
15391 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15392 boost::asio::io_context& ictx)
15393 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15394 {
15395 monclient->set_messenger(m);
15396 objecter->set_client_incarnation(0);
15397 }
15398
15399 StandaloneClient::~StandaloneClient()
15400 {
15401 delete objecter;
15402 objecter = nullptr;
15403 }
15404
15405 int StandaloneClient::init()
15406 {
15407 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15408 ceph_assert(iref_writer.is_first_writer());
15409
15410 _pre_init();
15411 objecter->init();
15412
15413 client_lock.lock();
15414
15415 messenger->add_dispatcher_tail(objecter);
15416 messenger->add_dispatcher_tail(this);
15417
15418 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15419 int r = monclient->init();
15420 if (r < 0) {
15421 // need to do cleanup because we're in an intermediate init state
15422 {
15423 std::scoped_lock l(timer_lock);
15424 timer.shutdown();
15425 }
15426
15427 client_lock.unlock();
15428 objecter->shutdown();
15429 objectcacher->stop();
15430 monclient->shutdown();
15431 return r;
15432 }
15433 objecter->start();
15434
15435 client_lock.unlock();
15436 _finish_init();
15437 iref_writer.update_state(CLIENT_INITIALIZED);
15438
15439 return 0;
15440 }
15441
15442 void StandaloneClient::shutdown()
15443 {
15444 Client::shutdown();
15445 objecter->shutdown();
15446 monclient->shutdown();
15447 }