ceph/src/client/Client.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use those flags in
// ceph-dokan. The current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

#ifndef S_IXUGO
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
#endif

using std::dec;
using std::hex;
using std::list;
using std::oct;
using std::pair;
using std::string;
using std::vector;

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}

bool Client::is_reserved_vino(vinodeno_t &vino) {
  if (MDS_IS_PRIVATE_INO(vino.ino)) {
    ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
    return true;
  }
  return false;
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}


// -------------

int Client::get_fd_inode(int fd, InodeRef *in) {
  int r = 0;
  if (fd == CEPHFS_AT_FDCWD) {
    *in = cwd;
  } else {
    Fh *f = get_filehandle(fd);
    if (!f) {
      r = -CEPHFS_EBADF;
    } else {
      *in = f->inode;
    }
  }
  return r;
}
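
// Example (hypothetical caller, for illustration only): resolving either
// an open file descriptor or the "current working directory" sentinel:
//
//   InodeRef in;
//   int r = get_fd_inode(CEPHFS_AT_FDCWD, &in); // cwd inode, r == 0
//   r = get_fd_inode(some_fd, &in);             // inode behind some_fd, or
//                                               // -CEPHFS_EBADF if it was
//                                               // never opened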

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}
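
// Note: after a reset, free_faked_inos holds the single interval
// [1024, 2^32 - 1]. Nothing below 1024 is ever handed out, and the
// [1024, 2048) band is effectively reserved for _assign_faked_root()
// below.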

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // [1024, 2048) is reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
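
// Note: the allocator above is a simple bump pointer over the free
// interval_set: it looks for a free ino strictly greater than
// last_used_faked_ino, wraps back to 2048 once the high end is
// exhausted, and from then on only reuses numbers that
// _release_faked_ino() has returned to free_faked_inos.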

/*
 * In faked mode, if you export multiple subdirectories you will see that
 * the exported subdirectories all report the same inode numbers, so we
 * distinguish the mount points by reserving the fake ino range
 * [1024, 2048) and combining it with the last 10 bits (0x3ff) of each
 * root inode's real ino.
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
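
// Worked example (assumed values): for a root inode 0x1000000025d,
// (ino & 0x3ff) == 0x25d == 605, so with the free interval still
// starting at 1024 the root gets faked ino 1024 + 605 == 1629, inside
// the reserved [1024, 2048) band. Two subvolume roots whose real inos
// differ in their low 10 bits thus get distinct faked roots.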

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or was aborted, it may never have had a chance
  // to run umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root.get();
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_nref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root.get(), did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blocklist_events();

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before destructing the
    // Client, just in case _mount() failed and never got a chance to
    // stop it.
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * incoming "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
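
// Note on the loop above: trimming one dentry can make more dentries
// expirable (dropping an inode reference may unpin its parent), so
// trim_cache() keeps re-scanning until the LRU size stops shrinking
// instead of making a single pass.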

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false); // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
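
// Note: truncate_seq acts as a monotonic version counter for the file
// size. A strictly newer seq from the MDS always wins; at equal seq the
// larger size wins (concurrent writes can only have grown the file); an
// older seq is stale and ignored, so out-of-order cap messages cannot
// resurrect a truncated length.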

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
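
// Note (interpretation): time_warp_seq plays the same role for
// mtime/atime that truncate_seq plays for the size. The MDS bumps it on
// operations such as utimes() that may legitimately move times
// backwards, so a higher seq means "take the MDS values even if they
// look older than ours".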

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true); // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}
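
// Note on the update policy above: each group of inode fields is
// guarded by the capability that makes the local copy authoritative.
// A field is refreshed from the MDS only when the stat is strictly
// newer (new_version) or the matching SHARED cap was just issued, and
// never while we hold the corresponding EXCL cap; in that case our
// dirty local state is newer than anything the MDS can report.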


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}
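
// Example: a lease with duration_ms == 30000 received at from == 100.0
// yields dttl == 130.0. The lease fields are only refreshed when the
// new TTL extends the current one, so a shorter lease never shortens
// an existing one.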


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated, only update from auth mds reply
  if (from == dst->auth) {
    in->dir_replicated = !dst->dist.empty();
    if (!dst->dist.empty())
      in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
    else
      in->frag_repmap.erase(dst->frag);
  }
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}
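
// Note: I_COMPLETE means the client caches every dentry of this
// directory, so readdir can be served locally; I_DIR_ORDERED
// additionally promises the cached dentries reflect readdir order.
// A change that only perturbs ordering clears I_DIR_ORDERED, while a
// change that may add or remove entries (complete == true) clears
// both.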

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true); // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
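
// Note (assumption, make_fpos() is defined elsewhere): a readdir file
// position appears to pack the frag (or the name hash, in hash order)
// into the high bits and the within-frag offset into the low bits,
// with offsets 0 and 1 reserved for "." and ".."; that is why fresh
// frags start at offset 2 above.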

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds); // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true); // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true); // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
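
// Summary of the selection order above: an explicit resend_mds always
// wins; otherwise, for path operations, hash the dentry name into a
// dirfrag and pick from the frag's replica map (non-auth ops) or its
// fragmap/auth cap; failing that, fall back to any session holding
// caps on the inode; and only as a last resort pick a random up MDS.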

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second->dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSessionRef session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
        request->abort(-CEPHFS_EPERM);
        break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||          // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1923
1924 void Client::unregister_request(MetaRequest *req)
1925 {
1926 mds_requests.erase(req->tid);
1927 if (req->tid == oldest_tid) {
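// we were the oldest tracked request; advance oldest_tid to the next
// pending request, again skipping SETFILELOCK ops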
1928 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1929 while (true) {
1930 if (p == mds_requests.end()) {
1931 oldest_tid = 0;
1932 break;
1933 }
1934 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1935 oldest_tid = p->first;
1936 break;
1937 }
1938 ++p;
1939 }
1940 }
1941 put_request(req);
1942 }
1943
1944 void Client::put_request(MetaRequest *request)
1945 {
1946 if (request->_put()) {
1947 int op = -1;
1948 if (request->success)
1949 op = request->get_op();
1950 InodeRef other_in;
1951 request->take_other_inode(&other_in);
1952 delete request;
1953
1954 if (other_in &&
1955 (op == CEPH_MDS_OP_RMDIR ||
1956 op == CEPH_MDS_OP_RENAME ||
1957 op == CEPH_MDS_OP_RMSNAP)) {
1958 _try_to_trim_inode(other_in.get(), false);
1959 }
1960 }
1961 }
1962
1963 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1964 mds_rank_t mds, int drop,
1965 int unless, int force)
1966 {
1967 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
1968 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1969 << ", force:" << force << ")" << dendl;
1970 int released = 0;
1971 auto it = in->caps.find(mds);
1972 if (it != in->caps.end()) {
1973 Cap &cap = it->second;
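// never drop caps that carry dirty state or are in active use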
1974 drop &= ~(in->dirty_caps | get_caps_used(in));
1975 if ((drop & cap.issued) &&
1976 !(unless & cap.issued)) {
1977 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
1978 cap.issued &= ~drop;
1979 cap.implemented &= ~drop;
1980 released = 1;
1981 } else {
1982 released = force;
1983 }
1984 if (released) {
1985 cap.wanted = in->caps_wanted();
1986 if (&cap == in->auth_cap &&
1987 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1988 in->requested_max_size = 0;
1989 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1990 }
1991 ceph_mds_request_release rel;
1992 rel.ino = in->ino;
1993 rel.cap_id = cap.cap_id;
1994 rel.seq = cap.seq;
1995 rel.issue_seq = cap.issue_seq;
1996 rel.mseq = cap.mseq;
1997 rel.caps = cap.implemented;
1998 rel.wanted = cap.wanted;
1999 rel.dname_len = 0;
2000 rel.dname_seq = 0;
2001 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2002 }
2003 }
2004 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
2005 << released << dendl;
2006 return released;
2007 }
2008
2009 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2010 mds_rank_t mds, int drop, int unless)
2011 {
2012 ldout(cct, 20) << __func__ << " enter(dn:"
2013 << dn << ")" << dendl;
2014 int released = 0;
2015 if (dn->dir)
2016 released = encode_inode_release(dn->dir->parent_inode, req,
2017 mds, drop, unless, 1);
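// force=1 above guarantees a release record for the parent inode, so the
// dentry lease release below always has an entry to piggyback on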
2018 if (released && dn->lease_mds == mds) {
2019 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2020 auto& rel = req->cap_releases.back();
2021 rel.item.dname_len = dn->name.length();
2022 rel.item.dname_seq = dn->lease_seq;
2023 rel.dname = dn->name;
2024 dn->lease_mds = -1;
2025 }
2026 ldout(cct, 25) << __func__ << " exit(dn:"
2027 << dn << ")" << dendl;
2028 }
2029
2030
2031 /*
2032 * This requires the MClientRequest *request member to be set.
2033 * It will error out horribly without one.
2034 * Additionally, if you set any *drop member, you'd better have
2035 * set the corresponding dentry!
2036 */
2037 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2038 {
2039 ldout(cct, 20) << __func__ << " enter (req: "
2040 << req << ", mds: " << mds << ")" << dendl;
2041 if (req->inode_drop && req->inode())
2042 encode_inode_release(req->inode(), req,
2043 mds, req->inode_drop,
2044 req->inode_unless);
2045
2046 if (req->old_inode_drop && req->old_inode())
2047 encode_inode_release(req->old_inode(), req,
2048 mds, req->old_inode_drop,
2049 req->old_inode_unless);
2050 if (req->other_inode_drop && req->other_inode())
2051 encode_inode_release(req->other_inode(), req,
2052 mds, req->other_inode_drop,
2053 req->other_inode_unless);
2054
2055 if (req->dentry_drop && req->dentry())
2056 encode_dentry_release(req->dentry(), req,
2057 mds, req->dentry_drop,
2058 req->dentry_unless);
2059
2060 if (req->old_dentry_drop && req->old_dentry())
2061 encode_dentry_release(req->old_dentry(), req,
2062 mds, req->old_dentry_drop,
2063 req->old_dentry_unless);
2064 ldout(cct, 25) << __func__ << " exit (req: "
2065 << req << ", mds: " << mds << ")" << dendl;
2066 }
2067
2068 bool Client::have_open_session(mds_rank_t mds)
2069 {
2070 const auto &it = mds_sessions.find(mds);
2071 return it != mds_sessions.end() &&
2072 (it->second->state == MetaSession::STATE_OPEN ||
2073 it->second->state == MetaSession::STATE_STALE);
2074 }
2075
2076 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2077 {
2078 const auto &it = mds_sessions.find(mds);
2079 if (it == mds_sessions.end() || it->second->con != con) {
2080 return NULL;
2081 } else {
2082 return it->second;
2083 }
2084 }
2085
2086 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2087 {
2088 auto it = mds_sessions.find(mds);
2089 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2090 }
2091
2092 /**
2093 * Populate a map of strings with client-identifying metadata,
2094 * such as the hostname. Call this once at initialization.
2095 */
2096 void Client::populate_metadata(const std::string &mount_root)
2097 {
2098 // Hostname
2099 #ifdef _WIN32
2100 // TODO: move this to compat.h
2101 char hostname[64];
2102 DWORD hostname_sz = 64;
2103 GetComputerNameA(hostname, &hostname_sz);
2104 metadata["hostname"] = hostname;
2105 #else
2106 struct utsname u;
2107 int r = uname(&u);
2108 if (r >= 0) {
2109 metadata["hostname"] = u.nodename;
2110 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2111 } else {
2112 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(errno) << ")" << dendl;
2113 }
2114 #endif
2115
2116 metadata["pid"] = stringify(getpid());
2117
2118 // Ceph entity id (the '0' in "client.0")
2119 metadata["entity_id"] = cct->_conf->name.get_id();
2120
2121 // Our mount position
2122 if (!mount_root.empty()) {
2123 metadata["root"] = mount_root;
2124 }
2125
2126 // Ceph version
2127 metadata["ceph_version"] = pretty_version_to_str();
2128 metadata["ceph_sha1"] = git_version_to_str();
2129
2130 // Apply any metadata from the user's configured overrides
2131 std::vector<std::string> tokens;
2132 get_str_vec(cct->_conf->client_metadata, ",", tokens);
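// e.g. client_metadata = "rack=r1,host=web01" (hypothetical values)
// yields tokens {"rack=r1", "host=web01"}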
2133 for (const auto &i : tokens) {
2134 auto eqpos = i.find("=");
2135 // Throw out anything that isn't of the form "<str>=<str>"
2136 if (eqpos == 0 || eqpos == std::string::npos || eqpos + 1 == i.size()) {
2137 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2138 continue;
2139 }
2140 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2141 }
2142 }
2143
2144 /**
2145 * Optionally add or override client metadata fields.
2146 */
2147 void Client::update_metadata(std::string const &k, std::string const &v)
2148 {
2149 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2150 ceph_assert(iref_reader.is_state_satisfied());
2151
2152 std::scoped_lock l(client_lock);
2153
2154 auto it = metadata.find(k);
2155 if (it != metadata.end()) {
2156 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2157 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2158 }
2159
2160 metadata[k] = v;
2161 }
2162
2163 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2164 {
2165 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2166 auto addrs = mdsmap->get_addrs(mds);
2167 auto em = mds_sessions.emplace(std::piecewise_construct,
2168 std::forward_as_tuple(mds),
2169 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2170 ceph_assert(em.second); /* not already present */
2171 auto session = em.first->second;
2172
2173 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2174 m->metadata = metadata;
2175 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2176 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2177 session->con->send_message2(std::move(m));
2178 return session;
2179 }
2180
2181 void Client::_close_mds_session(MetaSession *s)
2182 {
2183 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2184 s->state = MetaSession::STATE_CLOSING;
2185 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2186 }
2187
2188 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2189 {
2190 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2191 if (rejected && s->state != MetaSession::STATE_CLOSING)
2192 s->state = MetaSession::STATE_REJECTED;
2193 else
2194 s->state = MetaSession::STATE_CLOSED;
2195 s->con->mark_down();
2196 signal_context_list(s->waiting_for_open);
2197 mount_cond.notify_all();
2198 remove_session_caps(s, err);
2199 kick_requests_closed(s);
2200 mds_ranks_closing.erase(s->mds_num);
2201 if (s->state == MetaSession::STATE_CLOSED)
2202 mds_sessions.erase(s->mds_num);
2203 }
2204
2205 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2206 {
2207 mds_rank_t from = mds_rank_t(m->get_source().num());
2208 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2209
2210 std::scoped_lock cl(client_lock);
2211 auto session = _get_mds_session(from, m->get_connection().get());
2212 if (!session) {
2213 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2214 return;
2215 }
2216
2217 switch (m->get_op()) {
2218 case CEPH_SESSION_OPEN:
2219 {
2220 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2221 missing_features -= m->supported_features;
2222 if (!missing_features.empty()) {
2223 lderr(cct) << "mds." << from << " lacks required features '"
2224 << missing_features << "', closing session " << dendl;
2225 _close_mds_session(session.get());
2226 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2227 break;
2228 }
2229 session->mds_features = std::move(m->supported_features);
2230
2231 renew_caps(session.get());
2232 session->state = MetaSession::STATE_OPEN;
2233 if (is_unmounting())
2234 mount_cond.notify_all();
2235 else
2236 connect_mds_targets(from);
2237 signal_context_list(session->waiting_for_open);
2238 break;
2239 }
2240
2241 case CEPH_SESSION_CLOSE:
2242 _closed_mds_session(session.get());
2243 break;
2244
2245 case CEPH_SESSION_RENEWCAPS:
2246 if (session->cap_renew_seq == m->get_seq()) {
2247 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2248 session->cap_ttl =
2249 session->last_cap_renew_request + mdsmap->get_session_timeout();
2250 if (was_stale)
2251 wake_up_session_caps(session.get(), false);
2252 }
2253 break;
2254
2255 case CEPH_SESSION_STALE:
2256 // invalidate session caps/leases
2257 session->cap_gen++;
2258 session->cap_ttl = ceph_clock_now();
2259 session->cap_ttl -= 1;
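// the cap_gen bump plus the just-expired TTL invalidate all caps and
// leases issued under the old session generation; renew_caps() asks the
// MDS to revalidate them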
2260 renew_caps(session.get());
2261 break;
2262
2263 case CEPH_SESSION_RECALL_STATE:
2264 /*
2265 * Renew caps and flush cap releases just before
2266 * trimming the caps, in case tick() won't get a chance
2267 * to run them; otherwise the client could end up blocklisted
2268 * while the MDS daemons keep trying to recall the caps
2269 * again and again.
2270 *
2271 * In most cases this does nothing, and the flushing of the new
2272 * cap releases added by the trim_caps() below is simply
2273 * deferred to tick().
2274 */
2275 renew_and_flush_cap_releases();
2276 trim_caps(session.get(), m->get_max_caps());
2277 break;
2278
2279 case CEPH_SESSION_FLUSHMSG:
2280 /* flush cap release */
2281 if (auto& m = session->release; m) {
2282 session->con->send_message2(std::move(m));
2283 }
2284 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2285 break;
2286
2287 case CEPH_SESSION_FORCE_RO:
2288 force_session_readonly(session.get());
2289 break;
2290
2291 case CEPH_SESSION_REJECT:
2292 {
2293 std::string_view error_str;
2294 auto it = m->metadata.find("error_string");
2295 if (it != m->metadata.end())
2296 error_str = it->second;
2297 else
2298 error_str = "unknown error";
2299 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2300
2301 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2302 }
2303 break;
2304
2305 default:
2306 ceph_abort();
2307 }
2308 }
2309
2310 bool Client::_any_stale_sessions() const
2311 {
2312 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2313
2314 for (const auto &p : mds_sessions) {
2315 if (p.second->state == MetaSession::STATE_STALE) {
2316 return true;
2317 }
2318 }
2319
2320 return false;
2321 }
2322
2323 void Client::_kick_stale_sessions()
2324 {
2325 ldout(cct, 1) << __func__ << dendl;
2326
2327 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2328 auto s = (it++)->second; // advance first: erase below would invalidate it
2329 if (s->state == MetaSession::STATE_REJECTED) {
2330 mds_sessions.erase(s->mds_num);
2331 continue;
2332 }
2333 if (s->state == MetaSession::STATE_STALE)
2334 _closed_mds_session(s.get());
2335 }
2336 }
2337
2338 void Client::send_request(MetaRequest *request, MetaSession *session,
2339 bool drop_cap_releases)
2340 {
2341 // make the request
2342 mds_rank_t mds = session->mds_num;
2343 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2344 << " for mds." << mds << dendl;
2345 auto r = build_client_request(request);
2346 if (request->dentry()) {
2347 r->set_dentry_wanted();
2348 }
2349 if (request->got_unsafe) {
2350 r->set_replayed_op();
2351 if (request->target)
2352 r->head.ino = request->target->ino;
2353 } else {
2354 encode_cap_releases(request, mds);
2355 if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
2356 request->cap_releases.clear();
2357 else
2358 r->releases.swap(request->cap_releases);
2359 }
2360 r->set_mdsmap_epoch(mdsmap->get_epoch());
2361 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2362 objecter->with_osdmap([r](const OSDMap& o) {
2363 r->set_osdmap_epoch(o.get_epoch());
2364 });
2365 }
2366
2367 if (request->mds == -1) {
2368 request->sent_stamp = ceph_clock_now();
2369 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2370 }
2371 request->mds = mds;
2372
2373 Inode *in = request->inode();
2374 if (in) {
2375 auto it = in->caps.find(mds);
2376 if (it != in->caps.end()) {
2377 request->sent_on_mseq = it->second.mseq;
2378 }
2379 }
2380
2381 session->requests.push_back(&request->item);
2382
2383 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2384 session->con->send_message2(std::move(r));
2385 }
2386
2387 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2388 {
2389 auto req = make_message<MClientRequest>(request->get_op());
2390 req->set_tid(request->tid);
2391 req->set_stamp(request->op_stamp);
2392 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2393
2394 // if the filepaths haven't been set, set them!
2395 if (request->path.empty()) {
2396 Inode *in = request->inode();
2397 Dentry *de = request->dentry();
2398 if (in)
2399 in->make_nosnap_relative_path(request->path);
2400 else if (de) {
2401 if (de->inode)
2402 de->inode->make_nosnap_relative_path(request->path);
2403 else if (de->dir) {
2404 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2405 request->path.push_dentry(de->name);
2406 }
2407 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2408 << " No path, inode, or appropriately-endowed dentry given!"
2409 << dendl;
2410 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2411 << " No path, inode, or dentry given!"
2412 << dendl;
2413 }
2414 req->set_filepath(request->get_filepath());
2415 req->set_filepath2(request->get_filepath2());
2416 req->set_alternate_name(request->alternate_name);
2417 req->set_data(request->data);
2418 req->set_retry_attempt(request->retry_attempt++);
2419 req->head.num_fwd = request->num_fwd;
2420 const gid_t *_gids;
2421 int gid_count = request->perms.get_gids(&_gids);
2422 req->set_gid_list(gid_count, _gids);
2423 return req;
2424 }
2425
2426
2427
2428 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2429 {
2430 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2431
2432 std::scoped_lock cl(client_lock);
2433 auto session = _get_mds_session(mds, fwd->get_connection().get());
2434 if (!session) {
2435 return;
2436 }
2437 ceph_tid_t tid = fwd->get_tid();
2438
2439 if (mds_requests.count(tid) == 0) {
2440 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2441 return;
2442 }
2443
2444 MetaRequest *request = mds_requests[tid];
2445 ceph_assert(request);
2446
2447 // reset retry counter
2448 request->retry_attempt = 0;
2449
2450 // request not forwarded, or dest mds has no session.
2451 // resend.
2452 ldout(cct, 10) << __func__ << " tid " << tid
2453 << " forwarded " << fwd->get_num_fwd()
2454 << " times, resending to mds."
2455 << fwd->get_dest_mds()
2456 << dendl;
2457
2458 request->mds = -1;
2459 request->item.remove_myself();
2460 request->num_fwd = fwd->get_num_fwd();
2461 request->resend_mds = fwd->get_dest_mds();
2462 request->caller_cond->notify_all();
2463 }
2464
2465 bool Client::is_dir_operation(MetaRequest *req)
2466 {
2467 int op = req->get_op();
2468 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2469 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2470 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2471 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2472 return true;
2473 return false;
2474 }
2475
2476 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2477 {
2478 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2479
2480 std::scoped_lock cl(client_lock);
2481 auto session = _get_mds_session(mds_num, reply->get_connection().get());
2482 if (!session) {
2483 return;
2484 }
2485
2486 ceph_tid_t tid = reply->get_tid();
2487 bool is_safe = reply->is_safe();
2488
2489 if (mds_requests.count(tid) == 0) {
2490 lderr(cct) << __func__ << " no pending request on tid " << tid
2491 << " safe is:" << is_safe << dendl;
2492 return;
2493 }
2494 MetaRequest *request = mds_requests.at(tid);
2495
2496 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2497 << " tid " << tid << dendl;
2498
2499 if (request->got_unsafe && !is_safe) {
2500 //duplicate response
2501 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2502 << mds_num << " safe:" << is_safe << dendl;
2503 return;
2504 }
2505
2506 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
2507 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2508 << " from mds." << request->mds << dendl;
2509 request->send_to_auth = true;
2510 request->resend_mds = choose_target_mds(request);
2511 Inode *in = request->inode();
2512 std::map<mds_rank_t, Cap>::const_iterator it;
2513 if (request->resend_mds >= 0 &&
2514 request->resend_mds == request->mds &&
2515 (in == NULL ||
2516 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2517 request->sent_on_mseq == it->second.mseq)) {
2518 ldout(cct, 20) << "have to return ESTALE" << dendl;
2519 } else {
2520 request->caller_cond->notify_all();
2521 return;
2522 }
2523 }
2524
2525 ceph_assert(!request->reply);
2526 request->reply = reply;
2527 insert_trace(request, session.get());
2528
2529 // Handle unsafe reply
2530 if (!is_safe) {
2531 request->got_unsafe = true;
2532 session->unsafe_requests.push_back(&request->unsafe_item);
2533 if (is_dir_operation(request)) {
2534 Inode *dir = request->inode();
2535 ceph_assert(dir);
2536 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2537 }
2538 if (request->target) {
2539 InodeRef &in = request->target;
2540 in->unsafe_ops.push_back(&request->unsafe_target_item);
2541 }
2542 }
2543
2544 // Only signal the caller once (on the first reply):
2545 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2546 if (!is_safe || !request->got_unsafe) {
2547 ceph::condition_variable cond;
2548 request->dispatch_cond = &cond;
2549
2550 // wake up waiter
2551 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2552 request->caller_cond->notify_all();
2553
2554 // wait for the kick back from the dispatcher thread
2555 std::unique_lock l{client_lock, std::adopt_lock};
2556 cond.wait(l, [tid, request, &cond, this] {
2557 if (request->dispatch_cond) {
2558 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2559 << tid << " " << &cond << dendl;
2560 }
2561 return !request->dispatch_cond;
2562 });
2563 l.release();
2564 }
2565
2566 if (is_safe) {
2567 // the filesystem change is committed to disk
2568 // we're done, clean up
2569 if (request->got_unsafe) {
2570 request->unsafe_item.remove_myself();
2571 request->unsafe_dir_item.remove_myself();
2572 request->unsafe_target_item.remove_myself();
2573 signal_cond_list(request->waitfor_safe);
2574 }
2575 request->item.remove_myself();
2576 unregister_request(request);
2577 }
2578 if (is_unmounting())
2579 mount_cond.notify_all();
2580 }
2581
2582 void Client::_handle_full_flag(int64_t pool)
2583 {
2584 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2585 << "on " << pool << dendl;
2586 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2587 // to do this rather than blocking, because otherwise when we fill up we
2588 // potentially lock caps forever on files with dirty pages, and we need
2589 // to be able to release those caps to the MDS so that it can delete files
2590 // and free up space.
2591 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2592
2593 // For all inodes with layouts in this pool and a pending flush write op
2594 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2595 // from ObjectCacher so that it doesn't re-issue the write in response to
2596 // the ENOSPC error.
2597 // Fortunately since we're cancelling everything in a given pool, we don't
2598 // need to know which ops belong to which ObjectSet, we can just blow all
2599 // the un-flushed cached data away and mark any dirty inodes' async_err
2600 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2601 // affecting this pool, and all the objectsets we're purging were also
2602 // in this pool.
2603 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2604 i != inode_map.end(); ++i)
2605 {
2606 Inode *inode = i->second;
2607 if (inode->oset.dirty_or_tx
2608 && (pool == -1 || inode->layout.pool_id == pool)) {
2609 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2610 << " has dirty objects, purging and setting ENOSPC" << dendl;
2611 objectcacher->purge_set(&inode->oset);
2612 inode->set_async_err(-CEPHFS_ENOSPC);
2613 }
2614 }
2615
2616 if (cancelled_epoch != (epoch_t)-1) {
2617 set_cap_epoch_barrier(cancelled_epoch);
2618 }
2619 }
2620
2621 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2622 {
2623 std::set<entity_addr_t> new_blocklists;
2624
2625 std::scoped_lock cl(client_lock);
2626 objecter->consume_blocklist_events(&new_blocklists);
2627
2628 const auto myaddrs = messenger->get_myaddrs();
2629 bool new_blocklist = false;
2630 bool prenautilus = objecter->with_osdmap(
2631 [&](const OSDMap& o) {
2632 return o.require_osd_release < ceph_release_t::nautilus;
2633 });
2634 if (!blocklisted) {
2635 for (auto a : myaddrs.v) {
2636 // blocklist entries are always TYPE_ANY for nautilus+
2637 a.set_type(entity_addr_t::TYPE_ANY);
2638 if (new_blocklists.count(a)) {
2639 new_blocklist = true;
2640 break;
2641 }
2642 if (prenautilus) {
2643 // ...except pre-nautilus, they were TYPE_LEGACY
2644 a.set_type(entity_addr_t::TYPE_LEGACY);
2645 if (new_blocklists.count(a)) {
2646 new_blocklist = true;
2647 break;
2648 }
2649 }
2650 }
2651 }
2652 if (new_blocklist) {
2653 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2654 return o.get_epoch();
2655 });
2656 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2657 blocklisted = true;
2658
2659 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2660
2661 // Since we know all our OSD ops will fail, cancel them all preemptively,
2662 // so that on an unhealthy cluster we can umount promptly even if e.g.
2663 // some PGs were inaccessible.
2664 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2665
2666 }
2667
2668 if (blocklisted) {
2669 // Handle case where we were blocklisted but no longer are
2670 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2671 return o.is_blocklisted(myaddrs);});
2672 }
2673
2674 // Always subscribe to next osdmap for blocklisted client
2675 // until this client is not blocklisted.
2676 if (blocklisted) {
2677 objecter->maybe_request_map();
2678 }
2679
2680 if (objecter->osdmap_full_flag()) {
2681 _handle_full_flag(-1);
2682 } else {
2683 // Accumulate local list of full pools so that I can drop
2684 // the objecter lock before re-entering objecter in
2685 // cancel_writes
2686 std::vector<int64_t> full_pools;
2687
2688 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2689 for (const auto& kv : o.get_pools()) {
2690 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2691 full_pools.push_back(kv.first);
2692 }
2693 }
2694 });
2695
2696 for (auto p : full_pools)
2697 _handle_full_flag(p);
2698
2699 // Subscribe to subsequent maps to watch for the full flag going
2700 // away. For the global full flag objecter does this for us, but
2701 // it pays no attention to the per-pool full flag so in this branch
2702 // we do it ourselves.
2703 if (!full_pools.empty()) {
2704 objecter->maybe_request_map();
2705 }
2706 }
2707 }
2708
2709
2710 // ------------------------
2711 // incoming messages
2712
2713
2714 bool Client::ms_dispatch2(const MessageRef &m)
2715 {
2716 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2717 if (!iref_reader.is_state_satisfied()) {
2718 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2719 return true;
2720 }
2721
2722 switch (m->get_type()) {
2723 // mounting and mds sessions
2724 case CEPH_MSG_MDS_MAP:
2725 handle_mds_map(ref_cast<MMDSMap>(m));
2726 break;
2727 case CEPH_MSG_FS_MAP:
2728 handle_fs_map(ref_cast<MFSMap>(m));
2729 break;
2730 case CEPH_MSG_FS_MAP_USER:
2731 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2732 break;
2733 case CEPH_MSG_CLIENT_SESSION:
2734 handle_client_session(ref_cast<MClientSession>(m));
2735 break;
2736
2737 case CEPH_MSG_OSD_MAP:
2738 handle_osd_map(ref_cast<MOSDMap>(m));
2739 break;
2740
2741 // requests
2742 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2743 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2744 break;
2745 case CEPH_MSG_CLIENT_REPLY:
2746 handle_client_reply(ref_cast<MClientReply>(m));
2747 break;
2748
2749 // reclaim reply
2750 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2751 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2752 break;
2753
2754 case CEPH_MSG_CLIENT_SNAP:
2755 handle_snap(ref_cast<MClientSnap>(m));
2756 break;
2757 case CEPH_MSG_CLIENT_CAPS:
2758 handle_caps(ref_cast<MClientCaps>(m));
2759 break;
2760 case CEPH_MSG_CLIENT_LEASE:
2761 handle_lease(ref_cast<MClientLease>(m));
2762 break;
2763 case MSG_COMMAND_REPLY:
2764 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2765 handle_command_reply(ref_cast<MCommandReply>(m));
2766 } else {
2767 return false;
2768 }
2769 break;
2770 case CEPH_MSG_CLIENT_QUOTA:
2771 handle_quota(ref_cast<MClientQuota>(m));
2772 break;
2773
2774 default:
2775 return false;
2776 }
2777
2778 // unmounting?
2779 std::scoped_lock cl(client_lock);
2780 if (is_unmounting()) {
2781 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2782 << "+" << inode_map.size() << dendl;
2783 uint64_t size = lru.lru_get_size() + inode_map.size();
2784 trim_cache();
2785 if (size > lru.lru_get_size() + inode_map.size()) {
2786 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2787 mount_cond.notify_all();
2788 } else {
2789 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2790 << "+" << inode_map.size() << dendl;
2791 }
2792 }
2793
2794 return true;
2795 }
2796
2797 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2798 {
2799 std::scoped_lock cl(client_lock);
2800 fsmap.reset(new FSMap(m->get_fsmap()));
2801
2802 signal_cond_list(waiting_for_fsmap);
2803
2804 monclient->sub_got("fsmap", fsmap->get_epoch());
2805 }
2806
2807 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2808 {
2809 std::scoped_lock cl(client_lock);
2810 fsmap_user.reset(new FSMapUser);
2811 *fsmap_user = m->get_fsmap();
2812
2813 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2814 signal_cond_list(waiting_for_fsmap);
2815 }
2816
2817 // Cancel all the commands for missing or laggy GIDs
2818 void Client::cancel_commands(const MDSMap& newmap)
2819 {
2820 std::vector<ceph_tid_t> cancel_ops;
2821
2822 std::scoped_lock cmd_lock(command_lock);
2823 auto &commands = command_table.get_commands();
2824 for (const auto &[tid, op] : commands) {
2825 const mds_gid_t op_mds_gid = op.mds_gid;
2826 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2827 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2828 cancel_ops.push_back(tid);
2829 if (op.outs) {
2830 std::ostringstream ss;
2831 ss << "MDS " << op_mds_gid << " went away";
2832 *(op.outs) = ss.str();
2833 }
2834 /*
2835 * No need to hold client_lock for the
2836 * con->mark_down() here, because the con
2837 * has its own lock.
2838 */
2839 op.con->mark_down();
2840 if (op.on_finish)
2841 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2842 }
2843 }
2844
2845 for (const auto &tid : cancel_ops)
2846 command_table.erase(tid);
2847 }
2848
2849 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2850 {
2851 std::unique_lock cl(client_lock);
2852 if (m->get_epoch() <= mdsmap->get_epoch()) {
2853 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2854 << " is identical to or older than our "
2855 << mdsmap->get_epoch() << dendl;
2856 return;
2857 }
2858
2859 cl.unlock();
2860 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2861 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2862 _mdsmap->decode(m->get_encoded());
2863 cancel_commands(*_mdsmap.get());
2864 cl.lock();
2865
2866 _mdsmap.swap(mdsmap);
2867
2868 // reset session
2869 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2870 mds_rank_t mds = p->first;
2871 MetaSessionRef session = p->second;
2872 ++p;
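// the iterator was advanced above because _closed_mds_session() below
// may erase this entry from mds_sessions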
2873
2874 int oldstate = _mdsmap->get_state(mds);
2875 int newstate = mdsmap->get_state(mds);
2876 if (!mdsmap->is_up(mds)) {
2877 session->con->mark_down();
2878 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2879 auto old_inc = _mdsmap->get_incarnation(mds);
2880 auto new_inc = mdsmap->get_incarnation(mds);
2881 if (old_inc != new_inc) {
2882 ldout(cct, 1) << "mds incarnation changed from "
2883 << old_inc << " to " << new_inc << dendl;
2884 oldstate = MDSMap::STATE_NULL;
2885 }
2886 session->con->mark_down();
2887 session->addrs = mdsmap->get_addrs(mds);
2888 // When new MDS starts to take over, notify kernel to trim unused entries
2889 // in its dcache/icache. Hopefully, the kernel will release some unused
2890 // inodes before the new MDS enters reconnect state.
2891 trim_cache_for_reconnect(session.get());
2892 } else if (oldstate == newstate)
2893 continue; // no change
2894
2895 session->mds_state = newstate;
2896 if (newstate == MDSMap::STATE_RECONNECT) {
2897 session->con = messenger->connect_to_mds(session->addrs);
2898 send_reconnect(session.get());
2899 } else if (newstate > MDSMap::STATE_RECONNECT) {
2900 if (oldstate < MDSMap::STATE_RECONNECT) {
2901 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2902 _closed_mds_session(session.get());
2903 continue;
2904 }
2905 if (newstate >= MDSMap::STATE_ACTIVE) {
2906 if (oldstate < MDSMap::STATE_ACTIVE) {
2907 // kick new requests
2908 kick_requests(session.get());
2909 kick_flushing_caps(session.get());
2910 signal_context_list(session->waiting_for_open);
2911 wake_up_session_caps(session.get(), true);
2912 }
2913 connect_mds_targets(mds);
2914 }
2915 } else if (newstate == MDSMap::STATE_NULL &&
2916 mds >= mdsmap->get_max_mds()) {
2917 _closed_mds_session(session.get());
2918 }
2919 }
2920
2921 // kick any waiting threads
2922 signal_cond_list(waiting_for_mdsmap);
2923
2924 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2925 }
2926
2927 void Client::send_reconnect(MetaSession *session)
2928 {
2929 mds_rank_t mds = session->mds_num;
2930 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2931
2932 // trim unused caps to reduce MDS's cache rejoin time
2933 trim_cache_for_reconnect(session);
2934
2935 session->readonly = false;
2936
2937 session->release.reset();
2938
2939 // reset my cap seq number
2940 session->seq = 0;
2941 // connect to the mds' offload targets
2942 connect_mds_targets(mds);
2943 // make sure unsafe requests get saved
2944 resend_unsafe_requests(session);
2945
2946 early_kick_flushing_caps(session);
2947
2948 auto m = make_message<MClientReconnect>();
2949 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
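// with CEPHFS_FEATURE_MULTI_RECONNECT the reconnect state may be split
// across several messages (see mark_more() below); otherwise everything
// must fit into a single message, encoded at version 0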
2950
2951 // i have an open session.
2952 ceph::unordered_set<inodeno_t> did_snaprealm;
2953 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2954 p != inode_map.end();
2955 ++p) {
2956 Inode *in = p->second;
2957 auto it = in->caps.find(mds);
2958 if (it != in->caps.end()) {
2959 if (allow_multi &&
2960 m->get_approx_size() >=
2961 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2962 m->mark_more();
2963 session->con->send_message2(std::move(m));
2964
2965 m = make_message<MClientReconnect>();
2966 }
2967
2968 Cap &cap = it->second;
2969 ldout(cct, 10) << " caps on " << p->first
2970 << " " << ccap_string(cap.issued)
2971 << " wants " << ccap_string(in->caps_wanted())
2972 << dendl;
2973 filepath path;
2974 in->make_short_path(path);
2975 ldout(cct, 10) << " path " << path << dendl;
2976
2977 bufferlist flockbl;
2978 _encode_filelocks(in, flockbl);
2979
2980 cap.seq = 0; // reset seq.
2981 cap.issue_seq = 0; // reset seq.
2982 cap.mseq = 0; // reset seq.
2983 // cap gen should catch up with session cap_gen
2984 if (cap.gen < session->cap_gen) {
2985 cap.gen = session->cap_gen;
2986 cap.issued = cap.implemented = CEPH_CAP_PIN;
2987 } else {
2988 cap.issued = cap.implemented;
2989 }
2990 snapid_t snap_follows = 0;
2991 if (!in->cap_snaps.empty())
2992 snap_follows = in->cap_snaps.begin()->first;
2993
2994 m->add_cap(p->first.ino,
2995 cap.cap_id,
2996 path.get_ino(), path.get_path(), // ino
2997 in->caps_wanted(), // wanted
2998 cap.issued, // issued
2999 in->snaprealm->ino,
3000 snap_follows,
3001 flockbl);
3002
3003 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
3004 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
3005 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
3006 did_snaprealm.insert(in->snaprealm->ino);
3007 }
3008 }
3009 }
3010
3011 if (!allow_multi)
3012 m->set_encoding_version(0); // use connection features to choose encoding
3013 session->con->send_message2(std::move(m));
3014
3015 mount_cond.notify_all();
3016
3017 if (session->reclaim_state == MetaSession::RECLAIMING)
3018 signal_cond_list(waiting_for_reclaim);
3019 }
3020
3021
3022 void Client::kick_requests(MetaSession *session)
3023 {
3024 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3025 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3026 p != mds_requests.end();
3027 ++p) {
3028 MetaRequest *req = p->second;
3029 if (req->got_unsafe)
3030 continue;
3031 if (req->aborted()) {
3032 if (req->caller_cond) {
3033 req->kick = true;
3034 req->caller_cond->notify_all();
3035 }
3036 continue;
3037 }
3038 if (req->retry_attempt > 0)
3039 continue; // new requests only
3040 if (req->mds == session->mds_num) {
3041 send_request(p->second, session);
3042 }
3043 }
3044 }
3045
3046 void Client::resend_unsafe_requests(MetaSession *session)
3047 {
3048 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3049 !iter.end();
3050 ++iter)
3051 send_request(*iter, session);
3052
3053 // also re-send old requests when the MDS enters the reconnect stage, so that
3054 // it can process completed requests in the clientreplay stage.
3055 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3056 p != mds_requests.end();
3057 ++p) {
3058 MetaRequest *req = p->second;
3059 if (req->got_unsafe)
3060 continue;
3061 if (req->aborted())
3062 continue;
3063 if (req->retry_attempt == 0)
3064 continue; // old requests only
3065 if (req->mds == session->mds_num)
3066 send_request(req, session, true);
3067 }
3068 }
3069
3070 void Client::wait_unsafe_requests()
3071 {
3072 list<MetaRequest*> last_unsafe_reqs;
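// replies on a session are ordered, so once the most recent unsafe request
// on a session becomes safe its predecessors are safe too; waiting on one
// request per session is sufficient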
3073 for (const auto &p : mds_sessions) {
3074 const auto s = p.second;
3075 if (!s->unsafe_requests.empty()) {
3076 MetaRequest *req = s->unsafe_requests.back();
3077 req->get();
3078 last_unsafe_reqs.push_back(req);
3079 }
3080 }
3081
3082 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3083 p != last_unsafe_reqs.end();
3084 ++p) {
3085 MetaRequest *req = *p;
3086 if (req->unsafe_item.is_on_list())
3087 wait_on_list(req->waitfor_safe);
3088 put_request(req);
3089 }
3090 }
3091
3092 void Client::kick_requests_closed(MetaSession *session)
3093 {
3094 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3095 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3096 p != mds_requests.end(); ) {
3097 MetaRequest *req = p->second;
3098 ++p;
3099 if (req->mds == session->mds_num) {
3100 if (req->caller_cond) {
3101 req->kick = true;
3102 req->caller_cond->notify_all();
3103 }
3104 req->item.remove_myself();
3105 if (req->got_unsafe) {
3106 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3107 req->unsafe_item.remove_myself();
3108 if (is_dir_operation(req)) {
3109 Inode *dir = req->inode();
3110 ceph_assert(dir);
3111 dir->set_async_err(-CEPHFS_EIO);
3112 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3113 << dir->ino << " " << req->get_tid() << dendl;
3114 req->unsafe_dir_item.remove_myself();
3115 }
3116 if (req->target) {
3117 InodeRef &in = req->target;
3118 in->set_async_err(-CEPHFS_EIO);
3119 lderr(cct) << "kick_requests_closed drop req of inode : "
3120 << in->ino << " " << req->get_tid() << dendl;
3121 req->unsafe_target_item.remove_myself();
3122 }
3123 signal_cond_list(req->waitfor_safe);
3124 unregister_request(req);
3125 }
3126 }
3127 }
3128 ceph_assert(session->requests.empty());
3129 ceph_assert(session->unsafe_requests.empty());
3130 }
3131
3132
3133
3134
3135 /************
3136 * leases
3137 */
3138
3139 void Client::got_mds_push(MetaSession *s)
3140 {
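// every push from the MDS bumps the session seq; if we are mid-close,
// re-send the CLOSE request carrying the new seq so the MDS will ack it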
3141 s->seq++;
3142 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3143 if (s->state == MetaSession::STATE_CLOSING) {
3144 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3145 }
3146 }
3147
3148 void Client::handle_lease(const MConstRef<MClientLease>& m)
3149 {
3150 ldout(cct, 10) << __func__ << " " << *m << dendl;
3151
3152 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3153 mds_rank_t mds = mds_rank_t(m->get_source().num());
3154
3155 std::scoped_lock cl(client_lock);
3156 auto session = _get_mds_session(mds, m->get_connection().get());
3157 if (!session) {
3158 return;
3159 }
3160
3161 got_mds_push(session.get());
3162
3163 ceph_seq_t seq = m->get_seq();
3164
3165 Inode *in;
3166 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3167 if (inode_map.count(vino) == 0) {
3168 ldout(cct, 10) << " don't have vino " << vino << dendl;
3169 goto revoke;
3170 }
3171 in = inode_map[vino];
3172
3173 if (m->get_mask() & CEPH_LEASE_VALID) {
3174 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3175 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3176 goto revoke;
3177 }
3178 Dentry *dn = in->dir->dentries[m->dname];
3179 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3180 dn->lease_mds = -1;
3181 }
3182
3183 revoke:
3184 {
3185 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3186 m->get_mask(), m->get_ino(),
3187 m->get_first(), m->get_last(), m->dname);
3188 m->get_connection()->send_message2(std::move(reply));
3189 }
3190 }
3191
3192 void Client::_put_inode(Inode *in, int n)
3193 {
3194 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3195
3196 int left = in->get_nref();
3197 ceph_assert(left >= n + 1);
3198 in->iput(n);
3199 left -= n;
3200 if (left == 1) { // the last one will be held by the inode_map
3201 // release any caps
3202 remove_all_caps(in);
3203
3204 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3205 bool unclean = objectcacher->release_set(&in->oset);
3206 ceph_assert(!unclean);
3207 inode_map.erase(in->vino());
3208 if (use_faked_inos())
3209 _release_faked_ino(in);
3210
3211 if (root == nullptr) {
3212 root_ancestor = 0;
3213 while (!root_parents.empty())
3214 root_parents.erase(root_parents.begin());
3215 }
3216
3217 in->iput();
3218 }
3219 }
3220
3221 void Client::delay_put_inodes(bool wakeup)
3222 {
3223 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3224
3225 std::map<Inode*,int> release;
3226 {
3227 std::scoped_lock dl(delay_i_lock);
3228 release.swap(delay_i_release);
3229 }
3230
3231 if (release.empty())
3232 return;
3233
3234 for (auto &[in, cnt] : release)
3235 _put_inode(in, cnt);
3236
3237 if (wakeup)
3238 mount_cond.notify_all();
3239 }
3240
3241 void Client::put_inode(Inode *in, int n)
3242 {
3243 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3244
3245 std::scoped_lock dl(delay_i_lock);
3246 delay_i_release[in] += n;
3247 }
3248
3249 void Client::close_dir(Dir *dir)
3250 {
3251 Inode *in = dir->parent_inode;
3252 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3253 ceph_assert(dir->is_empty());
3254 ceph_assert(in->dir == dir);
3255 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3256 if (!in->dentries.empty())
3257 in->get_first_parent()->put(); // unpin dentry
3258
3259 delete in->dir;
3260 in->dir = 0;
3261 put_inode(in); // unpin inode
3262 }
3263
3264 /**
3265 * Don't call this with in==NULL; use get_or_create for that.
3266 * Leave dn set to the default NULL unless you're trying to add
3267 * a new inode to a pre-created Dentry.
3268 */
3269 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3270 {
3271 if (!dn) {
3272 // create a new Dentry
3273 dn = new Dentry(dir, name);
3274
3275 lru.lru_insert_mid(dn); // mid or top?
3276
3277 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3278 << " dn " << dn << " (new dn)" << dendl;
3279 } else {
3280 ceph_assert(!dn->inode);
3281 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3282 << " dn " << dn << " (old dn)" << dendl;
3283 }
3284
3285 if (in) { // link to inode
3286 InodeRef tmp_ref;
3287 // only one parent for directories!
3288 if (in->is_dir() && !in->dentries.empty()) {
3289 tmp_ref = in; // prevent unlink below from freeing the inode.
3290 Dentry *olddn = in->get_first_parent();
3291 ceph_assert(olddn->dir != dir || olddn->name != name);
3292 Inode *old_diri = olddn->dir->parent_inode;
3293 clear_dir_complete_and_ordered(old_diri, true);
3294 unlink(olddn, true, true); // keep dir, dentry
3295 }
3296
3297 dn->link(in);
3298 inc_dentry_nr();
3299 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3300 }
3301
3302 return dn;
3303 }
3304
3305 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3306 {
3307 InodeRef in(dn->inode);
3308 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3309 << " inode " << dn->inode << dendl;
3310
3311 // unlink from inode
3312 if (dn->inode) {
3313 dn->unlink();
3314 dec_dentry_nr();
3315 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3316 }
3317
3318 if (keepdentry) {
3319 dn->lease_mds = -1;
3320 } else {
3321 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3322
3323 // unlink from dir
3324 Dir *dir = dn->dir;
3325 dn->detach();
3326
3327 // delete den
3328 lru.lru_remove(dn);
3329 dn->put();
3330
3331 if (dir->is_empty() && !keepdir)
3332 close_dir(dir);
3333 }
3334 }
3335
3336 /**
3337 * For asynchronous flushes, check for errors from the IO and
3338 * update the inode if necessary
3339 */
3340 class C_Client_FlushComplete : public Context {
3341 private:
3342 Client *client;
3343 InodeRef inode;
3344 public:
3345 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3346 void finish(int r) override {
3347 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3348 if (r != 0) {
3349 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3350 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3351 << " 0x" << std::hex << inode->ino << std::dec
3352 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3353 inode->set_async_err(r);
3354 }
3355 }
3356 };
3357
3358
3359 /****
3360 * caps
3361 */
3362
3363 void Client::get_cap_ref(Inode *in, int cap)
3364 {
3365 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3366 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3367 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3368 in->iget();
3369 }
3370 if ((cap & CEPH_CAP_FILE_CACHE) &&
3371 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3372 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3373 in->iget();
3374 }
3375 in->get_cap_ref(cap);
3376 }
3377
3378 void Client::put_cap_ref(Inode *in, int cap)
3379 {
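// put_cap_ref() returns the cap bits whose reference count just hit zero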
3380 int last = in->put_cap_ref(cap);
3381 if (last) {
3382 int put_nref = 0;
3383 int drop = last & ~in->caps_issued();
3384 if (in->snapid == CEPH_NOSNAP) {
3385 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3386 !in->cap_snaps.empty() &&
3387 in->cap_snaps.rbegin()->second.writing) {
3388 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3389 in->cap_snaps.rbegin()->second.writing = 0;
3390 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3391 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3392 }
3393 if (last & CEPH_CAP_FILE_BUFFER) {
3394 for (auto &p : in->cap_snaps)
3395 p.second.dirty_data = 0;
3396 signal_cond_list(in->waitfor_commit);
3397 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3398 ++put_nref;
3399 }
3400 }
3401 if (last & CEPH_CAP_FILE_CACHE) {
3402 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3403 ++put_nref;
3404 }
3405 if (drop)
3406 check_caps(in, 0);
3407 if (put_nref)
3408 put_inode(in, put_nref);
3409 }
3410 }
3411
3412 // get caps for a given file handle -- the inode should have @need caps
3413 // issued by the mds and @want caps not revoked (or not under revocation).
3414 // this routine blocks until the cap requirement is satisfied. it also
3415 // accounts (tracks) a capability hit when the requirement succeeds.
3416 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3417 {
3418 Inode *in = fh->inode.get();
3419
3420 int r = check_pool_perm(in, need);
3421 if (r < 0)
3422 return r;
3423
3424 while (1) {
3425 int file_wanted = in->caps_file_wanted();
3426 if ((file_wanted & need) != need) {
3427 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3428 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3429 << dendl;
3430 return -CEPHFS_EBADF;
3431 }
3432
3433 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3434 return -CEPHFS_EBADF;
3435
3436 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3437 return -CEPHFS_EIO;
3438
3439 int implemented;
3440 int have = in->caps_issued(&implemented);
3441
3442 bool waitfor_caps = false;
3443 bool waitfor_commit = false;
3444
3445 if (have & need & CEPH_CAP_FILE_WR) {
3446 if (endoff > 0) {
3447 if ((endoff >= (loff_t)in->max_size ||
3448 endoff > (loff_t)(in->size << 1)) &&
3449 endoff > (loff_t)in->wanted_max_size) {
3450 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3451 in->wanted_max_size = endoff;
3452 }
3453 if (in->wanted_max_size > in->max_size &&
3454 in->wanted_max_size > in->requested_max_size)
3455 check_caps(in, 0);
3456 }
3457
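// a write extending past max_size must wait until the MDS grants a
// larger max_size (requested via check_caps() above)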
3458 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3459 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3460 waitfor_caps = true;
3461 }
3462 if (!in->cap_snaps.empty()) {
3463 if (in->cap_snaps.rbegin()->second.writing) {
3464 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3465 waitfor_caps = true;
3466 }
3467 for (auto &p : in->cap_snaps) {
3468 if (p.second.dirty_data) {
3469 waitfor_commit = true;
3470 break;
3471 }
3472 }
3473 if (waitfor_commit) {
3474 _flush(in, new C_Client_FlushComplete(this, in));
3475 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3476 }
3477 }
3478 }
3479
3480 if (!waitfor_caps && !waitfor_commit) {
3481 if ((have & need) == need) {
3482 int revoking = implemented & ~have;
3483 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3484 << " need " << ccap_string(need) << " want " << ccap_string(want)
3485 << " revoking " << ccap_string(revoking)
3486 << dendl;
3487 if ((revoking & want) == 0) {
3488 *phave = need | (have & want);
3489 in->get_cap_ref(need);
3490 cap_hit();
3491 return 0;
3492 }
3493 }
3494 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3495 waitfor_caps = true;
3496 }
3497
3498 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3499 in->auth_cap->session->readonly)
3500 return -CEPHFS_EROFS;
3501
3502 if (in->flags & I_CAP_DROPPED) {
3503 int mds_wanted = in->caps_mds_wanted();
3504 if ((mds_wanted & need) != need) {
3505 int ret = _renew_caps(in);
3506 if (ret < 0)
3507 return ret;
3508 continue;
3509 }
3510 if (!(file_wanted & ~mds_wanted))
3511 in->flags &= ~I_CAP_DROPPED;
3512 }
3513
3514 if (waitfor_caps)
3515 wait_on_list(in->waitfor_caps);
3516 else if (waitfor_commit)
3517 wait_on_list(in->waitfor_commit);
3518 }
3519 }
3520
3521 int Client::get_caps_used(Inode *in)
3522 {
3523 unsigned used = in->caps_used();
3524 if (!(used & CEPH_CAP_FILE_CACHE) &&
3525 !objectcacher->set_is_empty(&in->oset))
3526 used |= CEPH_CAP_FILE_CACHE;
3527 return used;
3528 }
3529
3530 void Client::cap_delay_requeue(Inode *in)
3531 {
3532 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3533 in->hold_caps_until = ceph_clock_now();
3534 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3535 delayed_list.push_back(&in->delay_cap_item);
3536 }
3537
3538 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3539 int flags, int used, int want, int retain,
3540 int flush, ceph_tid_t flush_tid)
3541 {
3542 int held = cap->issued | cap->implemented;
3543 int revoking = cap->implemented & ~cap->issued;
3544 retain &= ~revoking;
3545 int dropping = cap->issued & ~retain;
3546 int op = CEPH_CAP_OP_UPDATE;
3547
3548 ldout(cct, 10) << __func__ << " " << *in
3549 << " mds." << session->mds_num << " seq " << cap->seq
3550 << " used " << ccap_string(used)
3551 << " want " << ccap_string(want)
3552 << " flush " << ccap_string(flush)
3553 << " retain " << ccap_string(retain)
3554 << " held "<< ccap_string(held)
3555 << " revoking " << ccap_string(revoking)
3556 << " dropping " << ccap_string(dropping)
3557 << dendl;
3558
3559 if (cct->_conf->client_inject_release_failure && revoking) {
3560 const int would_have_issued = cap->issued & retain;
3561 const int would_have_implemented = cap->implemented & (cap->issued | used);
3562 // Simulated bug:
3563 // - tell the server we think issued is whatever they issued plus whatever we implemented
3564 // - leave what we have implemented in place
3565 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3566 cap->issued = cap->issued | cap->implemented;
3567
3568 // Make an exception for revoking xattr caps: we are injecting
3569 // failure to release other caps, but allow xattr because client
3570 // will block on xattr ops if it can't release these to MDS (#9800)
3571 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3572 cap->issued ^= xattr_mask & revoking;
3573 cap->implemented ^= xattr_mask & revoking;
3574
3575 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3576 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3577 } else {
3578 // Normal behaviour
3579 cap->issued &= retain;
3580 cap->implemented &= cap->issued | used;
3581 }
3582
3583 snapid_t follows = 0;
3584
3585 if (flush)
3586 follows = in->snaprealm->get_snap_context().seq;
3587
3588 auto m = make_message<MClientCaps>(op,
3589 in->ino,
3590 0,
3591 cap->cap_id, cap->seq,
3592 cap->implemented,
3593 want,
3594 flush,
3595 cap->mseq,
3596 cap_epoch_barrier);
3597 m->caller_uid = in->cap_dirtier_uid;
3598 m->caller_gid = in->cap_dirtier_gid;
3599
3600 m->head.issue_seq = cap->issue_seq;
3601 m->set_tid(flush_tid);
3602
3603 m->head.uid = in->uid;
3604 m->head.gid = in->gid;
3605 m->head.mode = in->mode;
3606
3607 m->head.nlink = in->nlink;
3608
3609 if (flush & CEPH_CAP_XATTR_EXCL) {
3610 encode(in->xattrs, m->xattrbl);
3611 m->head.xattr_version = in->xattr_version;
3612 }
3613
3614 m->size = in->size;
3615 m->max_size = in->max_size;
3616 m->truncate_seq = in->truncate_seq;
3617 m->truncate_size = in->truncate_size;
3618 m->mtime = in->mtime;
3619 m->atime = in->atime;
3620 m->ctime = in->ctime;
3621 m->btime = in->btime;
3622 m->time_warp_seq = in->time_warp_seq;
3623 m->change_attr = in->change_attr;
3624
3625 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3626 !in->cap_snaps.empty() &&
3627 in->cap_snaps.rbegin()->second.flush_tid == 0)
3628 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3629 m->flags = flags;
3630
3631 if (flush & CEPH_CAP_FILE_WR) {
3632 m->inline_version = in->inline_version;
3633 m->inline_data = in->inline_data;
3634 }
3635
3636 in->reported_size = in->size;
3637 m->set_snap_follows(follows);
3638 cap->wanted = want;
3639 if (cap == in->auth_cap) {
3640 if (want & CEPH_CAP_ANY_FILE_WR) {
3641 m->set_max_size(in->wanted_max_size);
3642 in->requested_max_size = in->wanted_max_size;
3643 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3644 } else {
3645 in->requested_max_size = 0;
3646 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3647 }
3648 }
3649
3650 if (!session->flushing_caps_tids.empty())
3651 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3652
3653 session->con->send_message2(std::move(m));
3654 }
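
// The bookkeeping above is plain bit-mask algebra over cap sets. A
// self-contained sketch with concrete stand-in masks (this mirrors only the
// normal, non-fault-injected path; the bit values are not the real CEPH_CAP
// encoding):
#if 0
#include <cassert>

static void example_send_cap_masks()
{
  const int Fc = 1, Fb = 2, Fw = 4;      // stand-ins for cache/buffer/write bits
  int issued = Fc | Fb | Fw;             // what the MDS granted us
  int implemented = Fc | Fb | Fw;        // what we may still be using
  int used = Fc;                         // what is in use right now
  int retain = Fc;                       // what we would like to keep

  int revoking = implemented & ~issued;  // nothing being revoked in this example
  retain &= ~revoking;                   // never retain what is being revoked
  issued &= retain;                      // release what we no longer retain
  implemented &= issued | used;          // implemented shrinks toward issued
  assert(issued == Fc && implemented == Fc);
}
#endif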
3655
3656 static bool is_max_size_approaching(Inode *in)
3657 {
3658 /* mds will adjust max size according to the reported size */
3659 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3660 return false;
3661 if (in->size >= in->max_size)
3662 return true;
3663 /* half of previous max_size increment has been used */
3664 if (in->max_size > in->reported_size &&
3665 (in->size << 1) >= in->max_size + in->reported_size)
3666 return true;
3667 return false;
3668 }
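
// The (size << 1) >= max_size + reported_size test fires once half of the
// last max_size increment granted by the MDS has been consumed. A worked
// example under assumed numbers: reported_size = 4 MiB and max_size = 8 MiB
// put the threshold at (8 + 4) / 2 = 6 MiB:
#if 0
#include <cassert>
#include <cstdint>

static bool example_max_size_approaching(uint64_t size, uint64_t max_size,
                                         uint64_t reported_size)
{
  if (size >= max_size)
    return true;
  return max_size > reported_size && (size << 1) >= max_size + reported_size;
}

static void example_check()
{
  const uint64_t MiB = uint64_t(1) << 20;
  assert(!example_max_size_approaching(5 * MiB, 8 * MiB, 4 * MiB));
  assert(example_max_size_approaching(6 * MiB, 8 * MiB, 4 * MiB));
}
#endif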
3669
3670 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3671 {
3672 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3673 return used;
3674 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3675 return used;
3676
3677 if (issued & CEPH_CAP_FILE_LAZYIO) {
3678 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3679 used &= ~CEPH_CAP_FILE_CACHE;
3680 used |= CEPH_CAP_FILE_LAZYIO;
3681 }
3682 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3683 used &= ~CEPH_CAP_FILE_BUFFER;
3684 used |= CEPH_CAP_FILE_LAZYIO;
3685 }
3686 } else {
3687 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3688 used &= ~CEPH_CAP_FILE_CACHE;
3689 used |= CEPH_CAP_FILE_LAZYIO;
3690 }
3691 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3692 used &= ~CEPH_CAP_FILE_BUFFER;
3693 used |= CEPH_CAP_FILE_LAZYIO;
3694 }
3695 }
3696 return used;
3697 }
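
// With LAZYIO, cache/buffer usage is re-attributed to the lazy-io bit so the
// MDS can revoke Fc/Fb without waiting for lazy I/O to drain. A condensed
// sketch of the issued-still-has-LAZYIO branch (stand-in bit values, not the
// real CEPH_CAP encoding):
#if 0
#include <cassert>

static void example_lazyio_adjust()
{
  const int Fc = 1, Fb = 2, Fl = 4;  // cache, buffer, lazyio stand-ins
  int used = Fc;                     // reading through the object cache
  int issued = Fl;                   // Fc was revoked, Fl is still issued
  int implemented = Fc | Fl;         // Fc is not fully released yet

  if ((used & (Fc | Fb)) && (implemented & Fl) &&
      (issued & Fl) && !(issued & Fc)) {
    used = (used & ~Fc) | Fl;        // report the usage as lazy-io instead
  }
  assert(used == Fl);
}
#endif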
3698
3699 /**
3700 * check_caps
3701 *
3702 * Examine currently used and wanted versus held caps. Release, flush or ack
3703 * revoked caps to the MDS as appropriate.
3704 *
3705 * @param in the inode to check
3706 * @param flags flags to apply to cap check
3707 */
3708 void Client::check_caps(Inode *in, unsigned flags)
3709 {
3710 unsigned wanted = in->caps_wanted();
3711 unsigned used = get_caps_used(in);
3712 unsigned cap_used;
3713
3714 int implemented;
3715 int issued = in->caps_issued(&implemented);
3716 int revoking = implemented & ~issued;
3717
3718 int orig_used = used;
3719 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3720
3721 int retain = wanted | used | CEPH_CAP_PIN;
3722 if (!is_unmounting() && in->nlink > 0) {
3723 if (wanted) {
3724 retain |= CEPH_CAP_ANY;
3725 } else if (in->is_dir() &&
3726 (issued & CEPH_CAP_FILE_SHARED) &&
3727 (in->flags & I_COMPLETE)) {
3728 // we do this here because we don't want to drop to Fs (and then
3729 // drop the Fs if we do a create!) if that alone makes us send lookups
3730 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3731 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3732 retain |= wanted;
3733 } else {
3734 retain |= CEPH_CAP_ANY_SHARED;
3735 // keep RD only if we didn't have the file open RW,
3736 // because then the mds would revoke it anyway to
3737 // journal max_size=0.
3738 if (in->max_size == 0)
3739 retain |= CEPH_CAP_ANY_RD;
3740 }
3741 }
3742
3743 ldout(cct, 10) << __func__ << " on " << *in
3744 << " wanted " << ccap_string(wanted)
3745 << " used " << ccap_string(used)
3746 << " issued " << ccap_string(issued)
3747 << " revoking " << ccap_string(revoking)
3748 << " flags=" << flags
3749 << dendl;
3750
3751 if (in->snapid != CEPH_NOSNAP)
3752 return; //snap caps last forever, can't write
3753
3754 if (in->caps.empty())
3755 return; // no caps held at all; nothing to release, flush or ack
3756
3757 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3758 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3759 if (_release(in))
3760 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3761 }
3762
3763 for (auto &[mds, cap] : in->caps) {
3764 auto session = mds_sessions.at(mds);
3765
3766 cap_used = used;
3767 if (in->auth_cap && &cap != in->auth_cap)
3768 cap_used &= ~in->auth_cap->issued;
3769
3770 revoking = cap.implemented & ~cap.issued;
3771
3772 ldout(cct, 10) << " cap mds." << mds
3773 << " issued " << ccap_string(cap.issued)
3774 << " implemented " << ccap_string(cap.implemented)
3775 << " revoking " << ccap_string(revoking) << dendl;
3776
3777 if (in->wanted_max_size > in->max_size &&
3778 in->wanted_max_size > in->requested_max_size &&
3779 &cap == in->auth_cap)
3780 goto ack;
3781
3782 /* approaching file_max? */
3783 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3784 &cap == in->auth_cap &&
3785 is_max_size_approaching(in)) {
3786 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3787 << ", reported " << in->reported_size << dendl;
3788 goto ack;
3789 }
3790
3791 /* completed revocation? */
3792 if (revoking && (revoking & cap_used) == 0) {
3793 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3794 goto ack;
3795 }
3796
3797 /* want more caps from mds? */
3798 if (wanted & ~(cap.wanted | cap.issued))
3799 goto ack;
3800
3801 if (!revoking && is_unmounting() && (cap_used == 0))
3802 goto ack;
3803
3804 if ((cap.issued & ~retain) == 0 && // we hold nothing we'd like to drop
3805 !in->dirty_caps) // and we have no dirty caps
3806 continue;
3807
3808 if (!(flags & CHECK_CAPS_NODELAY)) {
3809 ldout(cct, 10) << "delaying cap release" << dendl;
3810 cap_delay_requeue(in);
3811 continue;
3812 }
3813
3814 ack:
3815 if (&cap == in->auth_cap) {
3816 if (in->flags & I_KICK_FLUSH) {
3817 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3818 << " to mds." << mds << dendl;
3819 kick_flushing_caps(in, session.get());
3820 }
3821 if (!in->cap_snaps.empty() &&
3822 in->cap_snaps.rbegin()->second.flush_tid == 0)
3823 flush_snaps(in);
3824 }
3825
3826 int flushing;
3827 int msg_flags = 0;
3828 ceph_tid_t flush_tid;
3829 if (in->auth_cap == &cap && in->dirty_caps) {
3830 flushing = mark_caps_flushing(in, &flush_tid);
3831 if (flags & CHECK_CAPS_SYNCHRONOUS)
3832 msg_flags |= MClientCaps::FLAG_SYNC;
3833 } else {
3834 flushing = 0;
3835 flush_tid = 0;
3836 }
3837
3838 in->delay_cap_item.remove_myself();
3839 send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
3840 flushing, flush_tid);
3841 }
3842 }
3843
3844
3845 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3846 {
3847 int used = get_caps_used(in);
3848 int dirty = in->caps_dirty();
3849 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3850
3851 if (!in->cap_snaps.empty() &&
3852 in->cap_snaps.rbegin()->second.writing) {
3853 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3854 return;
3855 } else if (dirty ||
3856 (used & CEPH_CAP_FILE_WR)) {
3858 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3859 ceph_assert(capsnapem.second); /* element inserted */
3860 CapSnap &capsnap = capsnapem.first->second;
3861 capsnap.context = old_snapc;
3862 capsnap.issued = in->caps_issued();
3863 capsnap.dirty = in->caps_dirty();
3864
3865 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3866
3867 capsnap.uid = in->uid;
3868 capsnap.gid = in->gid;
3869 capsnap.mode = in->mode;
3870 capsnap.btime = in->btime;
3871 capsnap.xattrs = in->xattrs;
3872 capsnap.xattr_version = in->xattr_version;
3873 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3874 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3875
3876 if (used & CEPH_CAP_FILE_WR) {
3877 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3878 capsnap.writing = 1;
3879 } else {
3880 finish_cap_snap(in, capsnap, used);
3881 }
3882 } else {
3883 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3884 }
3885 }
3886
3887 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3888 {
3889 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3890 capsnap.size = in->size;
3891 capsnap.mtime = in->mtime;
3892 capsnap.atime = in->atime;
3893 capsnap.ctime = in->ctime;
3894 capsnap.time_warp_seq = in->time_warp_seq;
3895 capsnap.change_attr = in->change_attr;
3896 capsnap.dirty |= in->caps_dirty();
3897
3898 /* Only reset it if it wasn't set before */
3899 if (capsnap.cap_dirtier_uid == -1) {
3900 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3901 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3902 }
3903
3904 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3905 capsnap.inline_data = in->inline_data;
3906 capsnap.inline_version = in->inline_version;
3907 }
3908
3909 if (used & CEPH_CAP_FILE_BUFFER) {
3910 capsnap.writing = 1;
3911 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3912 << " WRBUFFER, delaying" << dendl;
3913 } else {
3914 capsnap.dirty_data = 0;
3915 flush_snaps(in);
3916 }
3917 }
3918
3919 void Client::send_flush_snap(Inode *in, MetaSession *session,
3920 snapid_t follows, CapSnap& capsnap)
3921 {
3922 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3923 in->ino, in->snaprealm->ino, 0,
3924 in->auth_cap->mseq, cap_epoch_barrier);
3925 m->caller_uid = capsnap.cap_dirtier_uid;
3926 m->caller_gid = capsnap.cap_dirtier_gid;
3927
3928 m->set_client_tid(capsnap.flush_tid);
3929 m->head.snap_follows = follows;
3930
3931 m->head.caps = capsnap.issued;
3932 m->head.dirty = capsnap.dirty;
3933
3934 m->head.uid = capsnap.uid;
3935 m->head.gid = capsnap.gid;
3936 m->head.mode = capsnap.mode;
3937 m->btime = capsnap.btime;
3938
3939 m->size = capsnap.size;
3940
3941 m->head.xattr_version = capsnap.xattr_version;
3942 encode(capsnap.xattrs, m->xattrbl);
3943
3944 m->ctime = capsnap.ctime;
3946 m->mtime = capsnap.mtime;
3947 m->atime = capsnap.atime;
3948 m->time_warp_seq = capsnap.time_warp_seq;
3949 m->change_attr = capsnap.change_attr;
3950
3951 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3952 m->inline_version = in->inline_version;
3953 m->inline_data = in->inline_data;
3954 }
3955
3956 ceph_assert(!session->flushing_caps_tids.empty());
3957 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3958
3959 session->con->send_message2(std::move(m));
3960 }
3961
3962 void Client::flush_snaps(Inode *in)
3963 {
3964 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3965 ceph_assert(in->cap_snaps.size());
3966
3967 // pick auth mds
3968 ceph_assert(in->auth_cap);
3969 MetaSession *session = in->auth_cap->session;
3970
3971 for (auto &p : in->cap_snaps) {
3972 CapSnap &capsnap = p.second;
3973 // only start flushes that haven't been sent yet
3974 if (capsnap.flush_tid > 0)
3975 continue;
3976
3977 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3978 << " follows " << p.first
3979 << " size " << capsnap.size
3980 << " mtime " << capsnap.mtime
3981 << " dirty_data=" << capsnap.dirty_data
3982 << " writing=" << capsnap.writing
3983 << " on " << *in << dendl;
3984 if (capsnap.dirty_data || capsnap.writing)
3985 break;
3986
3987 capsnap.flush_tid = ++last_flush_tid;
3988 session->flushing_caps_tids.insert(capsnap.flush_tid);
3989 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3990 if (!in->flushing_cap_item.is_on_list())
3991 session->flushing_caps.push_back(&in->flushing_cap_item);
3992
3993 send_flush_snap(in, session, p.first, capsnap);
3994 }
3995 }
3996
3997 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3998 {
3999 ceph::condition_variable cond;
4000 ls.push_back(&cond);
4001 std::unique_lock l{client_lock, std::adopt_lock};
4002 cond.wait(l);
4003 l.release();
4004 ls.remove(&cond);
4005 }
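
// wait_on_list() relies on an ownership hand-off: client_lock is already held
// by the caller, std::adopt_lock wraps it in a unique_lock just long enough
// for the condition wait, and l.release() hands ownership back without
// unlocking. A minimal standalone sketch of the same pattern (names are
// illustrative):
#if 0
#include <condition_variable>
#include <mutex>

static std::mutex example_mutex;  // assumed held on entry, like client_lock
static std::condition_variable example_cv;

static void example_adopt_wait(bool &ready)
{
  std::unique_lock l{example_mutex, std::adopt_lock};  // adopt, don't re-lock
  example_cv.wait(l, [&ready] { return ready; });
  l.release();  // the caller still owns the mutex when we return
}
#endif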
4006
4007 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4008 {
4009 for (auto cond : ls) {
4010 cond->notify_all();
4011 }
4012 }
4013
4014 void Client::wait_on_context_list(list<Context*>& ls)
4015 {
4016 ceph::condition_variable cond;
4017 bool done = false;
4018 int r;
4019 ls.push_back(new C_Cond(cond, &done, &r));
4020 std::unique_lock l{client_lock, std::adopt_lock};
4021 cond.wait(l, [&done] { return done;});
4022 l.release();
4023 }
4024
4025 void Client::signal_context_list(list<Context*>& ls)
4026 {
4027 while (!ls.empty()) {
4028 ls.front()->complete(0);
4029 ls.pop_front();
4030 }
4031 }
4032
4033 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4034 {
4035 for (const auto &cap : s->caps) {
4036 auto &in = cap->inode;
4037 if (reconnect) {
4038 in.requested_max_size = 0;
4039 in.wanted_max_size = 0;
4040 } else {
4041 if (cap->gen < s->cap_gen) {
4042 // mds did not re-issue stale cap.
4043 cap->issued = cap->implemented = CEPH_CAP_PIN;
4044 // make sure mds knows what we want.
4045 if (in.caps_file_wanted() & ~cap->wanted)
4046 in.flags |= I_CAP_DROPPED;
4047 }
4048 }
4049 signal_cond_list(in.waitfor_caps);
4050 }
4051 }
4052
4053
4054 // flush dirty data (from objectcache)
4055
4056 class C_Client_CacheInvalidate : public Context {
4057 private:
4058 Client *client;
4059 vinodeno_t ino;
4060 int64_t offset, length;
4061 public:
4062 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4063 client(c), offset(off), length(len) {
4064 if (client->use_faked_inos())
4065 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4066 else
4067 ino = in->vino();
4068 }
4069 void finish(int r) override {
4070 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4071 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4072 client->_async_invalidate(ino, offset, length);
4073 }
4074 };
4075
4076 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4077 {
4078 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4079 if (!mref_reader.is_state_satisfied())
4080 return;
4081
4082 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4083 ino_invalidate_cb(callback_handle, ino, off, len);
4084 }
4085
4086 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4087
4088 if (ino_invalidate_cb)
4089 // we queue the invalidate, which calls the callback and decrements the ref
4090 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4091 }
4092
4093 void Client::_invalidate_inode_cache(Inode *in)
4094 {
4095 ldout(cct, 10) << __func__ << " " << *in << dendl;
4096
4097 // invalidate our userspace inode cache
4098 if (cct->_conf->client_oc) {
4099 objectcacher->release_set(&in->oset);
4100 if (!objectcacher->set_is_empty(&in->oset))
4101 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4102 }
4103
4104 _schedule_invalidate_callback(in, 0, 0);
4105 }
4106
4107 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4108 {
4109 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4110
4111 // invalidate our userspace inode cache
4112 if (cct->_conf->client_oc) {
4113 vector<ObjectExtent> ls;
4114 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4115 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4116 }
4117
4118 _schedule_invalidate_callback(in, off, len);
4119 }
4120
4121 bool Client::_release(Inode *in)
4122 {
4123 ldout(cct, 20) << "_release " << *in << dendl;
4124 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4125 _invalidate_inode_cache(in);
4126 return true;
4127 }
4128 return false;
4129 }
4130
4131 bool Client::_flush(Inode *in, Context *onfinish)
4132 {
4133 ldout(cct, 10) << "_flush " << *in << dendl;
4134
4135 if (!in->oset.dirty_or_tx) {
4136 ldout(cct, 10) << " nothing to flush" << dendl;
4137 onfinish->complete(0);
4138 return true;
4139 }
4140
4141 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4142 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4143 objectcacher->purge_set(&in->oset);
4144 if (onfinish) {
4145 onfinish->complete(-CEPHFS_ENOSPC);
4146 }
4147 return true;
4148 }
4149
4150 return objectcacher->flush_set(&in->oset, onfinish);
4151 }
4152
4153 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4154 {
4155 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4156 if (!in->oset.dirty_or_tx) {
4157 ldout(cct, 10) << " nothing to flush" << dendl;
4158 return;
4159 }
4160
4161 C_SaferCond onflush("Client::_flush_range flock");
4162 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4163 offset, size, &onflush);
4164 if (!ret) {
4165 // wait for flush
4166 client_lock.unlock();
4167 onflush.wait();
4168 client_lock.lock();
4169 }
4170 }
4171
4172 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4173 {
4175 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4176 Inode *in = static_cast<Inode *>(oset->parent);
4177 ceph_assert(in);
4178 _flushed(in);
4179 }
4180
4181 void Client::_flushed(Inode *in)
4182 {
4183 ldout(cct, 10) << "_flushed " << *in << dendl;
4184
4185 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4186 }
4187
4188
4189
4190 // checks common to add_update_cap, handle_cap_grant
4191 void Client::check_cap_issue(Inode *in, unsigned issued)
4192 {
4193 unsigned had = in->caps_issued();
4194
4195 if ((issued & CEPH_CAP_FILE_CACHE) &&
4196 !(had & CEPH_CAP_FILE_CACHE))
4197 in->cache_gen++;
4198
4199 if ((issued & CEPH_CAP_FILE_SHARED) !=
4200 (had & CEPH_CAP_FILE_SHARED)) {
4201 if (issued & CEPH_CAP_FILE_SHARED)
4202 in->shared_gen++;
4203 if (in->is_dir())
4204 clear_dir_complete_and_ordered(in, true);
4205 }
4206 }
4207
4208 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4209 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4210 inodeno_t realm, int flags, const UserPerm& cap_perms)
4211 {
4212 if (!in->is_any_caps()) {
4213 ceph_assert(in->snaprealm == 0);
4214 in->snaprealm = get_snap_realm(realm);
4215 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4216 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4217 } else {
4218 ceph_assert(in->snaprealm);
4219 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4220 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4221 in->snaprealm_item.remove_myself();
4222 auto oldrealm = in->snaprealm;
4223 in->snaprealm = get_snap_realm(realm);
4224 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4225 put_snap_realm(oldrealm);
4226 }
4227 }
4228
4229 mds_rank_t mds = mds_session->mds_num;
4230 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4231 Cap &cap = capem.first->second;
4232 if (!capem.second) {
4233 if (cap.gen < mds_session->cap_gen)
4234 cap.issued = cap.implemented = CEPH_CAP_PIN;
4235
4236 /*
4237 * auth mds of the inode changed. we received the cap export
4238 * message, but still haven't received the cap import message.
4239 * handle_cap_export() updated the new auth MDS' cap.
4240 *
4241 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4242 * a message that was send before the cap import message. So
4243 * don't remove caps.
4244 */
4245 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4246 if (&cap != in->auth_cap)
4247 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4248
4249 ceph_assert(cap.cap_id == cap_id);
4250 seq = cap.seq;
4251 mseq = cap.mseq;
4252 issued |= cap.issued;
4253 flags |= CEPH_CAP_FLAG_AUTH;
4254 }
4255 } else {
4256 inc_pinned_icaps();
4257 }
4258
4259 check_cap_issue(in, issued);
4260
4261 if (flags & CEPH_CAP_FLAG_AUTH) {
4262 if (in->auth_cap != &cap &&
4263 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4264 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4265 ldout(cct, 10) << __func__ << " changing auth cap: "
4266 << "add myself to new auth MDS' flushing caps list" << dendl;
4267 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4268 }
4269 in->auth_cap = &cap;
4270 }
4271 }
4272
4273 unsigned old_caps = cap.issued;
4274 cap.cap_id = cap_id;
4275 cap.issued = issued;
4276 cap.implemented |= issued;
4277 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4278 cap.wanted = wanted;
4279 else
4280 cap.wanted |= wanted;
4281 cap.seq = seq;
4282 cap.issue_seq = seq;
4283 cap.mseq = mseq;
4284 cap.gen = mds_session->cap_gen;
4285 cap.latest_perms = cap_perms;
4286 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4287 << " from mds." << mds
4288 << " on " << *in
4289 << dendl;
4290
4291 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4292 // non-auth MDS is revoking the newly grant caps ?
4293 for (auto &p : in->caps) {
4294 if (&p.second == &cap)
4295 continue;
4296 if (p.second.implemented & ~p.second.issued & issued) {
4297 check_caps(in, CHECK_CAPS_NODELAY);
4298 break;
4299 }
4300 }
4301 }
4302
4303 if (issued & ~old_caps)
4304 signal_cond_list(in->waitfor_caps);
4305 }
4306
4307 void Client::remove_cap(Cap *cap, bool queue_release)
4308 {
4309 auto &in = cap->inode;
4310 MetaSession *session = cap->session;
4311 mds_rank_t mds = cap->session->mds_num;
4312
4313 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4314
4315 if (queue_release) {
4316 session->enqueue_cap_release(
4317 in.ino,
4318 cap->cap_id,
4319 cap->issue_seq,
4320 cap->mseq,
4321 cap_epoch_barrier);
4322 } else {
4323 dec_pinned_icaps();
4324 }
4325
4326
4327 if (in.auth_cap == cap) {
4328 if (in.flushing_cap_item.is_on_list()) {
4329 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4330 in.flushing_cap_item.remove_myself();
4331 }
4332 in.auth_cap = NULL;
4333 }
4334 size_t n = in.caps.erase(mds);
4335 ceph_assert(n == 1);
4336 cap = nullptr;
4337
4338 if (!in.is_any_caps()) {
4339 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4340 in.snaprealm_item.remove_myself();
4341 put_snap_realm(in.snaprealm);
4342 in.snaprealm = 0;
4343 }
4344 }
4345
4346 void Client::remove_all_caps(Inode *in)
4347 {
4348 while (!in->caps.empty())
4349 remove_cap(&in->caps.begin()->second, true);
4350 }
4351
4352 void Client::remove_session_caps(MetaSession *s, int err)
4353 {
4354 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4355
4356 while (s->caps.size()) {
4357 Cap *cap = *s->caps.begin();
4358 InodeRef in(&cap->inode);
4359 bool dirty_caps = false;
4360 if (in->auth_cap == cap) {
4361 dirty_caps = in->dirty_caps | in->flushing_caps;
4362 in->wanted_max_size = 0;
4363 in->requested_max_size = 0;
4364 if (in->has_any_filelocks())
4365 in->flags |= I_ERROR_FILELOCK;
4366 }
4367 auto caps = cap->implemented;
4368 if (cap->wanted | cap->issued)
4369 in->flags |= I_CAP_DROPPED;
4370 remove_cap(cap, false);
4371 in->cap_snaps.clear();
4372 if (dirty_caps) {
4373 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4374 if (in->flushing_caps) {
4375 num_flushing_caps--;
4376 in->flushing_cap_tids.clear();
4377 }
4378 in->flushing_caps = 0;
4379 in->mark_caps_clean();
4380 put_inode(in.get());
4381 }
4382 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4383 if (caps && !in->caps_issued_mask(caps, true)) {
4384 if (err == -CEPHFS_EBLOCKLISTED) {
4385 if (in->oset.dirty_or_tx) {
4386 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4387 in->set_async_err(err);
4388 }
4389 objectcacher->purge_set(&in->oset);
4390 } else {
4391 objectcacher->release_set(&in->oset);
4392 }
4393 _schedule_invalidate_callback(in.get(), 0, 0);
4394 }
4395
4396 signal_cond_list(in->waitfor_caps);
4397 }
4398 s->flushing_caps_tids.clear();
4399 sync_cond.notify_all();
4400 }
4401
4402 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4403 {
4404 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4405 bool abort_on_failure = false;
4406
4407 errno = 0;
4408 int r = remount_cb(callback_handle);
4409 if (r == 0) {
4410 retries_on_invalidate = 0;
4411 } else {
4412 int e = errno;
4413 client_t whoami = get_nodeid();
4414 if (r == -1) {
4415 lderr(cct) <<
4416 "failed to remount (to trim kernel dentries): "
4417 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4418 } else {
4419 lderr(cct) <<
4420 "failed to remount (to trim kernel dentries): "
4421 "return code = " << r << dendl;
4422 }
4423 bool should_abort =
4424 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4425 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4426 !(retry_on_error && (++retries_on_invalidate < max_retries));
4427 if (should_abort && !is_unmounting()) {
4428 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4429 abort_on_failure = true;
4430 }
4431 }
4432 return std::make_pair(r, abort_on_failure);
4433 }
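
// The abort decision above combines two "die on failure" switches with a
// bounded retry budget. The predicate, distilled into a standalone sketch
// (parameter names are illustrative):
#if 0
#include <cstdint>

static bool example_should_abort(bool die_on_failed_remount,
                                 bool die_on_failed_dentry_invalidate,
                                 bool retry_on_error,
                                 uint64_t &retries, uint64_t max_retries)
{
  // abort only if a die-on-failure switch is set and either retrying is
  // disallowed or the retry budget is exhausted
  return (die_on_failed_remount || die_on_failed_dentry_invalidate) &&
         !(retry_on_error && (++retries < max_retries));
}
#endif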
4434
4435 class C_Client_Remount : public Context {
4436 private:
4437 Client *client;
4438 public:
4439 explicit C_Client_Remount(Client *c) : client(c) {}
4440 void finish(int r) override {
4441 ceph_assert(r == 0);
4442 client->_do_remount(true);
4443 }
4444 };
4445
4446 void Client::_invalidate_kernel_dcache()
4447 {
4448 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4449 if (!mref_reader.is_state_satisfied())
4450 return;
4451
4452 if (can_invalidate_dentries) {
4453 if (dentry_invalidate_cb && root->dir) {
4454 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4455 p != root->dir->dentries.end();
4456 ++p) {
4457 if (p->second->inode)
4458 _schedule_invalidate_dentry_callback(p->second, false);
4459 }
4460 }
4461 } else if (remount_cb) {
4462 // Hacky:
4463 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4464 remount_finisher.queue(new C_Client_Remount(this));
4465 }
4466 }
4467
4468 void Client::_trim_negative_child_dentries(InodeRef& in)
4469 {
4470 if (!in->is_dir())
4471 return;
4472
4473 Dir* dir = in->dir;
4474 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4475 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4476 Dentry *dn = p->second;
4477 ++p;
4478 ceph_assert(!dn->inode);
4479 if (dn->lru_is_expireable())
4480 unlink(dn, true, false); // keep dir, drop dentry
4481 }
4482 if (dir->dentries.empty()) {
4483 close_dir(dir);
4484 }
4485 }
4486
4487 if (in->flags & I_SNAPDIR_OPEN) {
4488 InodeRef snapdir = open_snapdir(in.get());
4489 _trim_negative_child_dentries(snapdir);
4490 }
4491 }
4492
4493 class C_Client_CacheRelease : public Context {
4494 private:
4495 Client *client;
4496 vinodeno_t ino;
4497 public:
4498 C_Client_CacheRelease(Client *c, Inode *in) :
4499 client(c) {
4500 if (client->use_faked_inos())
4501 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4502 else
4503 ino = in->vino();
4504 }
4505 void finish(int r) override {
4506 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4507 client->_async_inode_release(ino);
4508 }
4509 };
4510
4511 void Client::_async_inode_release(vinodeno_t ino)
4512 {
4513 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4514 if (!mref_reader.is_state_satisfied())
4515 return;
4516
4517 ldout(cct, 10) << __func__ << " " << ino << dendl;
4518 ino_release_cb(callback_handle, ino);
4519 }
4520
4521 void Client::_schedule_ino_release_callback(Inode *in) {
4522
4523 if (ino_release_cb)
4524 // we queue the release, which calls the callback
4525 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4526 }
4527
4528 void Client::trim_caps(MetaSession *s, uint64_t max)
4529 {
4530 mds_rank_t mds = s->mds_num;
4531 size_t caps_size = s->caps.size();
4532 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4533 << " caps " << caps_size << dendl;
4534
4535 uint64_t trimmed = 0;
4536 auto p = s->caps.begin();
4537 std::set<Dentry *> to_trim; /* this prevents caps other than the one we're
4538 * looking at from getting deleted during traversal. */
4539 while ((caps_size - trimmed) > max && !p.end()) {
4540 Cap *cap = *p;
4541 InodeRef in(&cap->inode);
4542
4543 // Increment p early because it will be invalidated if cap
4544 // is deleted inside remove_cap
4545 ++p;
4546
4547 if (in->caps.size() > 1 && cap != in->auth_cap) {
4548 int mine = cap->issued | cap->implemented;
4549 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4550 // disposable non-auth cap
4551 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4552 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4553 remove_cap(cap, true); cap = nullptr;
4554 trimmed++;
4555 }
4556 } else {
4557 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4558 _trim_negative_child_dentries(in);
4559 bool all = true;
4560 auto q = in->dentries.begin();
4561 while (q != in->dentries.end()) {
4562 Dentry *dn = *q;
4563 ++q;
4564 if (dn->lru_is_expireable()) {
4565 if (can_invalidate_dentries &&
4566 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
4567 // Only issue one of these per DN for inodes in root: handle
4568 // others more efficiently by calling for root-child DNs at
4569 // the end of this function.
4570 _schedule_invalidate_dentry_callback(dn, true);
4571 }
4572 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4573 to_trim.insert(dn);
4574 } else {
4575 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4576 all = false;
4577 }
4578 }
4579 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
4580 _schedule_ino_release_callback(in.get());
4581 }
4582 if (all && in->ino != CEPH_INO_ROOT) {
4583 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4584 trimmed++;
4585 }
4586 }
4587 }
4588 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4589 for (const auto &dn : to_trim) {
4590 trim_dentry(dn);
4591 }
4592 to_trim.clear();
4593
4594 caps_size = s->caps.size();
4595 if (caps_size > (size_t)max)
4596 _invalidate_kernel_dcache();
4597 }
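
// trim_caps() defers the actual dentry trimming into to_trim because trimming
// can drop the last inode reference and delete caps, invalidating the session
// cap iterator mid-walk. The collect-then-erase pattern it uses, in miniature:
#if 0
#include <list>
#include <set>

static void example_collect_then_erase(std::list<int> &items)
{
  std::set<int> doomed;
  for (int v : items)      // first pass: only decide, never mutate
    if (v < 0)
      doomed.insert(v);
  for (int v : doomed)     // second pass: now it is safe to mutate
    items.remove(v);
}
#endif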
4598
4599 void Client::force_session_readonly(MetaSession *s)
4600 {
4601 s->readonly = true;
4602 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4603 auto &in = (*p)->inode;
4604 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4605 signal_cond_list(in.waitfor_caps);
4606 }
4607 }
4608
4609 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4610 {
4611 MetaSession *session = in->auth_cap->session;
4612
4613 int flushing = in->dirty_caps;
4614 ceph_assert(flushing);
4615
4616 ceph_tid_t flush_tid = ++last_flush_tid;
4617 in->flushing_cap_tids[flush_tid] = flushing;
4618
4619 if (!in->flushing_caps) {
4620 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4621 num_flushing_caps++;
4622 } else {
4623 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4624 }
4625
4626 in->flushing_caps |= flushing;
4627 in->mark_caps_clean();
4628
4629 if (!in->flushing_cap_item.is_on_list())
4630 session->flushing_caps.push_back(&in->flushing_cap_item);
4631 session->flushing_caps_tids.insert(flush_tid);
4632
4633 *ptid = flush_tid;
4634 return flushing;
4635 }
4636
4637 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4638 {
4639 for (auto &p : in->cap_snaps) {
4640 CapSnap &capsnap = p.second;
4641 if (capsnap.flush_tid > 0) {
4642 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4643 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4644 }
4645 }
4646 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4647 it != in->flushing_cap_tids.end();
4648 ++it) {
4649 old_s->flushing_caps_tids.erase(it->first);
4650 new_s->flushing_caps_tids.insert(it->first);
4651 }
4652 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4653 }
4654
4655 /*
4656 * Flush all the dirty caps back to the MDS. Because the callers
4657 * generally wait on the result of this function (syncfs and umount
4658 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4659 */
4660 void Client::flush_caps_sync()
4661 {
4662 ldout(cct, 10) << __func__ << dendl;
4663 for (auto &q : mds_sessions) {
4664 auto s = q.second;
4665 xlist<Inode*>::iterator p = s->dirty_list.begin();
4666 while (!p.end()) {
4667 unsigned flags = CHECK_CAPS_NODELAY;
4668 Inode *in = *p;
4669
4670 ++p;
4671 if (p.end())
4672 flags |= CHECK_CAPS_SYNCHRONOUS;
4673 check_caps(in, flags);
4674 }
4675 }
4676 }
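
// The loop above advances the iterator before testing p.end() so that only
// the last dirty inode of each session is flushed with CHECK_CAPS_SYNCHRONOUS;
// waiting on that one flush then covers the whole batch. The look-ahead idiom
// in miniature (illustrative names; example_check_caps stands in for
// check_caps):
#if 0
#include <list>

static void example_check_caps(int /*item*/, int /*flags*/) {}

static void example_flag_last(const std::list<int> &work)
{
  for (auto p = work.begin(); p != work.end(); ) {
    int flags = 0;           // stand-in for CHECK_CAPS_NODELAY
    int item = *p;
    ++p;                     // advance before testing for the end...
    if (p == work.end())
      flags |= 1;            // ...so only the last item gets the sync flag
    example_check_caps(item, flags);
  }
}
#endif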
4677
4678 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4679 {
4680 while (in->flushing_caps) {
4681 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4682 ceph_assert(it != in->flushing_cap_tids.end());
4683 if (it->first > want)
4684 break;
4685 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4686 << ccap_string(it->second) << " want " << want
4687 << " last " << it->first << dendl;
4688 wait_on_list(in->waitfor_caps);
4689 }
4690 }
4691
4692 void Client::wait_sync_caps(ceph_tid_t want)
4693 {
4694 retry:
4695 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4696 << num_flushing_caps << " total flushing)" << dendl;
4697 for (auto &p : mds_sessions) {
4698 auto s = p.second;
4699 if (s->flushing_caps_tids.empty())
4700 continue;
4701 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4702 if (oldest_tid <= want) {
4703 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4704 << " (want " << want << ")" << dendl;
4705 std::unique_lock l{client_lock, std::adopt_lock};
4706 sync_cond.wait(l);
4707 l.release();
4708 goto retry;
4709 }
4710 }
4711 }
4712
4713 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4714 {
4715 in->flags &= ~I_KICK_FLUSH;
4716
4717 Cap *cap = in->auth_cap;
4718 ceph_assert(cap->session == session);
4719
4720 ceph_tid_t last_snap_flush = 0;
4721 for (auto p = in->flushing_cap_tids.rbegin();
4722 p != in->flushing_cap_tids.rend();
4723 ++p) {
4724 if (!p->second) {
4725 last_snap_flush = p->first;
4726 break;
4727 }
4728 }
4729
4730 int wanted = in->caps_wanted();
4731 int used = get_caps_used(in) | in->caps_dirty();
4732 auto it = in->cap_snaps.begin();
4733 for (auto& p : in->flushing_cap_tids) {
4734 if (p.second) {
4735 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4736 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4737 p.second, p.first);
4738 } else {
4739 ceph_assert(it != in->cap_snaps.end());
4740 ceph_assert(it->second.flush_tid == p.first);
4741 send_flush_snap(in, session, it->first, it->second);
4742 ++it;
4743 }
4744 }
4745 }
4746
4747 void Client::kick_flushing_caps(MetaSession *session)
4748 {
4749 mds_rank_t mds = session->mds_num;
4750 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4751
4752 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4753 Inode *in = *p;
4754 if (in->flags & I_KICK_FLUSH) {
4755 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4756 kick_flushing_caps(in, session);
4757 }
4758 }
4759 }
4760
4761 void Client::early_kick_flushing_caps(MetaSession *session)
4762 {
4763 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4764 Inode *in = *p;
4765 Cap *cap = in->auth_cap;
4766 ceph_assert(cap);
4767
4768 // if flushing caps were revoked, we re-send the cap flush during the client
4769 // reconnect stage. This guarantees that the MDS processes the cap flush message
4770 // before issuing the flushing caps to another client.
4771 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4772 in->flags |= I_KICK_FLUSH;
4773 continue;
4774 }
4775
4776 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4777 << " to mds." << session->mds_num << dendl;
4778 // send_reconnect() will also reset these sequence numbers. Make sure the
4779 // sequence numbers in the cap flush message match the later reconnect message.
4780 cap->seq = 0;
4781 cap->issue_seq = 0;
4782 cap->mseq = 0;
4783 cap->issued = cap->implemented;
4784
4785 kick_flushing_caps(in, session);
4786 }
4787 }
4788
4789 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4790 {
4791 list<SnapRealm*> q;
4792 q.push_back(realm);
4793
4794 while (!q.empty()) {
4795 realm = q.front();
4796 q.pop_front();
4797
4798 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4799 realm->invalidate_cache();
4800
4801 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4802 p != realm->pchildren.end();
4803 ++p)
4804 q.push_back(*p);
4805 }
4806 }
4807
4808 SnapRealm *Client::get_snap_realm(inodeno_t r)
4809 {
4810 SnapRealm *realm = snap_realms[r];
4811 if (!realm)
4812 snap_realms[r] = realm = new SnapRealm(r);
4813 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4814 realm->nref++;
4815 return realm;
4816 }
4817
4818 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4819 {
4820 if (snap_realms.count(r) == 0) {
4821 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4822 return NULL;
4823 }
4824 SnapRealm *realm = snap_realms[r];
4825 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4826 realm->nref++;
4827 return realm;
4828 }
4829
4830 void Client::put_snap_realm(SnapRealm *realm)
4831 {
4832 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4833 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4834 if (--realm->nref == 0) {
4835 snap_realms.erase(realm->ino);
4836 if (realm->pparent) {
4837 realm->pparent->pchildren.erase(realm);
4838 put_snap_realm(realm->pparent);
4839 }
4840 delete realm;
4841 }
4842 }
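
// Snap realms are manually reference counted: every get_snap_realm() or
// get_snap_realm_maybe() must be paired with exactly one put_snap_realm(),
// and the last put tears the realm down and drops the ref it held on its
// parent. The invariant, reduced to a standalone sketch:
#if 0
#include <map>

struct ExampleRealm { int nref = 0; };
static std::map<int, ExampleRealm*> example_realms;

static ExampleRealm *example_get(int ino)
{
  ExampleRealm *&r = example_realms[ino];
  if (!r)
    r = new ExampleRealm;
  r->nref++;             // every get takes exactly one ref
  return r;
}

static void example_put(int ino, ExampleRealm *r)
{
  if (--r->nref == 0) {  // the last ref tears the realm down
    example_realms.erase(ino);
    delete r;
  }
}
#endif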
4843
4844 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4845 {
4846 if (realm->parent != parent) {
4847 ldout(cct, 10) << __func__ << " " << *realm
4848 << " " << realm->parent << " -> " << parent << dendl;
4849 realm->parent = parent;
4850 if (realm->pparent) {
4851 realm->pparent->pchildren.erase(realm);
4852 put_snap_realm(realm->pparent);
4853 }
4854 realm->pparent = get_snap_realm(parent);
4855 realm->pparent->pchildren.insert(realm);
4856 return true;
4857 }
4858 return false;
4859 }
4860
4861 static bool has_new_snaps(const SnapContext& old_snapc,
4862 const SnapContext& new_snapc)
4863 {
4864 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4865 }
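
// This works because a SnapContext keeps its snaps sorted newest-first, so
// snaps[0] is the most recent snapid; comparing it to the old context's seq
// detects snaps created in between. Worked example:
#if 0
#include <cassert>
#include <cstdint>
#include <vector>

static void example_has_new_snaps()
{
  uint64_t old_seq = 5;                            // old_snapc.seq
  std::vector<uint64_t> new_snaps = {7, 5, 3, 2};  // descending, newest first
  assert(!new_snaps.empty() && new_snaps[0] > old_seq);  // a new snap exists
}
#endif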
4866
4867
4868 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4869 {
4870 SnapRealm *first_realm = NULL;
4871 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4872
4873 map<SnapRealm*, SnapContext> dirty_realms;
4874
4875 auto p = bl.cbegin();
4876 while (!p.end()) {
4877 SnapRealmInfo info;
4878 decode(info, p);
4879 SnapRealm *realm = get_snap_realm(info.ino());
4880
4881 bool invalidate = false;
4882
4883 if (info.seq() > realm->seq) {
4884 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4885 << dendl;
4886
4887 if (flush) {
4888 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4889 // flush me + children
4890 list<SnapRealm*> q;
4891 q.push_back(realm);
4892 while (!q.empty()) {
4893 SnapRealm *realm = q.front();
4894 q.pop_front();
4895
4896 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4897 p != realm->pchildren.end();
4898 ++p)
4899 q.push_back(*p);
4900
4901 if (dirty_realms.count(realm) == 0) {
4902 realm->nref++;
4903 dirty_realms[realm] = realm->get_snap_context();
4904 }
4905 }
4906 }
4907
4908 // update
4909 realm->seq = info.seq();
4910 realm->created = info.created();
4911 realm->parent_since = info.parent_since();
4912 realm->prior_parent_snaps = info.prior_parent_snaps;
4913 realm->my_snaps = info.my_snaps;
4914 invalidate = true;
4915 }
4916
4917 // _always_ verify parent
4918 if (adjust_realm_parent(realm, info.parent()))
4919 invalidate = true;
4920
4921 if (invalidate) {
4922 invalidate_snaprealm_and_children(realm);
4923 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4924 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4925 } else {
4926 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4927 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4928 }
4929
4930 if (!first_realm)
4931 first_realm = realm;
4932 else
4933 put_snap_realm(realm);
4934 }
4935
4936 for (auto &[realm, snapc] : dirty_realms) {
4937 // are there new snaps?
4938 if (has_new_snaps(snapc, realm->get_snap_context())) {
4939 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4940 for (auto&& in : realm->inodes_with_caps) {
4941 queue_cap_snap(in, snapc);
4942 }
4943 } else {
4944 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4945 }
4946 put_snap_realm(realm);
4947 }
4948
4949 if (realm_ret)
4950 *realm_ret = first_realm;
4951 else
4952 put_snap_realm(first_realm);
4953 }
4954
4955 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4956 {
4957 ldout(cct, 10) << __func__ << " " << *m << dendl;
4958 mds_rank_t mds = mds_rank_t(m->get_source().num());
4959
4960 std::scoped_lock cl(client_lock);
4961 auto session = _get_mds_session(mds, m->get_connection().get());
4962 if (!session) {
4963 return;
4964 }
4965
4966 got_mds_push(session.get());
4967
4968 map<Inode*, SnapContext> to_move;
4969 SnapRealm *realm = 0;
4970
4971 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4972 ceph_assert(m->head.split);
4973 SnapRealmInfo info;
4974 auto p = m->bl.cbegin();
4975 decode(info, p);
4976 ceph_assert(info.ino() == m->head.split);
4977
4978 // flush, then move, the inos.
4979 realm = get_snap_realm(info.ino());
4980 ldout(cct, 10) << " splitting off " << *realm << dendl;
4981 for (auto& ino : m->split_inos) {
4982 vinodeno_t vino(ino, CEPH_NOSNAP);
4983 if (inode_map.count(vino)) {
4984 Inode *in = inode_map[vino];
4985 if (!in->snaprealm || in->snaprealm == realm)
4986 continue;
4987 if (in->snaprealm->created > info.created()) {
4988 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4989 << *in->snaprealm << dendl;
4990 continue;
4991 }
4992 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4993
4994
4995 in->snaprealm_item.remove_myself();
4996 to_move[in] = in->snaprealm->get_snap_context();
4997 put_snap_realm(in->snaprealm);
4998 }
4999 }
5000
5001 // move child snaprealms, too
5002 for (auto& child_realm : m->split_realms) {
5003 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5004 SnapRealm *child = get_snap_realm_maybe(child_realm);
5005 if (!child)
5006 continue;
5007 adjust_realm_parent(child, realm->ino);
5008 put_snap_realm(child);
5009 }
5010 }
5011
5012 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5013
5014 if (realm) {
5015 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5016 Inode *in = p->first;
5017 in->snaprealm = realm;
5018 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5019 realm->nref++;
5020 // queue for snap writeback
5021 if (has_new_snaps(p->second, realm->get_snap_context()))
5022 queue_cap_snap(in, p->second);
5023 }
5024 put_snap_realm(realm);
5025 }
5026 }
5027
5028 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5029 {
5030 mds_rank_t mds = mds_rank_t(m->get_source().num());
5031
5032 std::scoped_lock cl(client_lock);
5033 auto session = _get_mds_session(mds, m->get_connection().get());
5034 if (!session) {
5035 return;
5036 }
5037
5038 got_mds_push(session.get());
5039
5040 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5041
5042 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5043 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5044 Inode *in = it->second;
5045 in->quota = m->quota;
5046 in->rstat = m->rstat;
5047 }
5052 }
5053
5054 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5055 {
5056 mds_rank_t mds = mds_rank_t(m->get_source().num());
5057
5058 std::scoped_lock cl(client_lock);
5059 auto session = _get_mds_session(mds, m->get_connection().get());
5060 if (!session) {
5061 return;
5062 }
5063
5064 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5065 // Pause RADOS operations until we see the required epoch
5066 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5067 }
5068
5069 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5070 // Record the barrier so that we will transmit it to MDS when releasing
5071 set_cap_epoch_barrier(m->osd_epoch_barrier);
5072 }
5073
5074 got_mds_push(session.get());
5075
5076 Inode *in;
5077 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5078 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5079 in = it->second;
5080 } else {
5081 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5082 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5083 session->enqueue_cap_release(
5084 m->get_ino(),
5085 m->get_cap_id(),
5086 m->get_seq(),
5087 m->get_mseq(),
5088 cap_epoch_barrier);
5089 } else {
5090 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5091 }
5092
5093 // in case the mds is waiting on e.g. a revocation
5094 flush_cap_releases();
5095 return;
5096 }
5097
5098 switch (m->get_op()) {
5099 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5100 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5101 case CEPH_CAP_OP_IMPORT: /* no return; an IMPORT also carries a grant, handled below */ handle_cap_import(session.get(), in, m);
5102 }
5103
5104 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5105 Cap &cap = in->caps.at(mds);
5106
5107 switch (m->get_op()) {
5108 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
5109 case CEPH_CAP_OP_IMPORT:
5110 case CEPH_CAP_OP_REVOKE:
5111 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5112 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
5113 }
5114 } else {
5115 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5116 return;
5117 }
5118 }
5119
5120 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5121 {
5122 mds_rank_t mds = session->mds_num;
5123
5124 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5125 << " IMPORT from mds." << mds << dendl;
5126
5127 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5128 Cap *cap = NULL;
5129 UserPerm cap_perms;
5130 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5131 cap = &it->second;
5132 cap_perms = cap->latest_perms;
5133 }
5134
5135 // add/update it
5136 SnapRealm *realm = NULL;
5137 update_snap_trace(m->snapbl, &realm);
5138
5139 int issued = m->get_caps();
5140 int wanted = m->get_wanted();
5141 add_update_cap(in, session, m->get_cap_id(),
5142 issued, wanted, m->get_seq(), m->get_mseq(),
5143 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5144
5145 if (cap && cap->cap_id == m->peer.cap_id) {
5146 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5147 }
5148
5149 if (realm)
5150 put_snap_realm(realm);
5151
5152 if (in->auth_cap && in->auth_cap->session == session) {
5153 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5154 in->requested_max_size > m->get_max_size()) {
5155 in->requested_max_size = 0;
5156 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5157 }
5158 // reflush any/all caps (if we are now the auth_cap)
5159 kick_flushing_caps(in, session);
5160 }
5161 }
5162
5163 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5164 {
5165 mds_rank_t mds = session->mds_num;
5166
5167 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5168 << " EXPORT from mds." << mds << dendl;
5169
5170 auto it = in->caps.find(mds);
5171 if (it != in->caps.end()) {
5172 Cap &cap = it->second;
5173 if (cap.cap_id == m->get_cap_id()) {
5174 if (m->peer.cap_id) {
5175 const auto peer_mds = mds_rank_t(m->peer.mds);
5176 auto tsession = _get_or_open_mds_session(peer_mds);
5177 auto it = in->caps.find(peer_mds);
5178 if (it != in->caps.end()) {
5179 Cap &tcap = it->second;
5180 if (tcap.cap_id == m->peer.cap_id &&
5181 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5182 tcap.cap_id = m->peer.cap_id;
5183 tcap.seq = m->peer.seq - 1;
5184 tcap.issue_seq = tcap.seq;
5185 tcap.issued |= cap.issued;
5186 tcap.implemented |= cap.issued;
5187 if (&cap == in->auth_cap)
5188 in->auth_cap = &tcap;
5189 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5190 adjust_session_flushing_caps(in, session, tsession.get());
5191 }
5192 } else {
5193 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
5194 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5195 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5196 cap.latest_perms);
5197 }
5198 } else {
5199 if (cap.wanted | cap.issued)
5200 in->flags |= I_CAP_DROPPED;
5201 }
5202
5203 remove_cap(&cap, false);
5204 }
5205 }
5206 }
5207
5208 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5209 {
5210 mds_rank_t mds = session->mds_num;
5211 ceph_assert(in->caps.count(mds));
5212
5213 ldout(cct, 10) << __func__ << " on ino " << *in
5214 << " size " << in->size << " -> " << m->get_size()
5215 << dendl;
5216
5217 int issued;
5218 in->caps_issued(&issued);
5219 issued |= in->caps_dirty();
5220 update_inode_file_size(in, issued, m->get_size(),
5221 m->get_truncate_seq(), m->get_truncate_size());
5222 }
5223
5224 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5225 {
5226 ceph_tid_t flush_ack_tid = m->get_client_tid();
5227 int dirty = m->get_dirty();
5228 int cleaned = 0;
5229 int flushed = 0;
5230
5231 auto it = in->flushing_cap_tids.begin();
// defensive: a duplicate or stray ack when no flush is pending would leave
// begin() == end(), so bail out before dereferencing it
if (it == in->flushing_cap_tids.end())
return;
5232 if (it->first < flush_ack_tid) {
5233 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5234 << " got unexpected flush ack tid " << flush_ack_tid
5235 << " expected is " << it->first << dendl;
5236 }
5237 for (; it != in->flushing_cap_tids.end(); ) {
5238 if (!it->second) {
5239 // cap snap
5240 ++it;
5241 continue;
5242 }
5243 if (it->first == flush_ack_tid)
5244 cleaned = it->second;
5245 if (it->first <= flush_ack_tid) {
5246 session->flushing_caps_tids.erase(it->first);
5247 in->flushing_cap_tids.erase(it++);
5248 ++flushed;
5249 continue;
5250 }
5251 cleaned &= ~it->second;
5252 if (!cleaned)
5253 break;
5254 ++it;
5255 }
5256
5257 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5258 << " cleaned " << ccap_string(cleaned) << " on " << *in
5259 << " with " << ccap_string(dirty) << dendl;
5260
5261 if (flushed) {
5262 signal_cond_list(in->waitfor_caps);
5263 if (session->flushing_caps_tids.empty() ||
5264 *session->flushing_caps_tids.begin() > flush_ack_tid)
5265 sync_cond.notify_all();
5266 }
5267
5268 if (!dirty) {
5269 in->cap_dirtier_uid = -1;
5270 in->cap_dirtier_gid = -1;
5271 }
5272
5273 if (!cleaned) {
5274 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5275 } else {
5276 if (in->flushing_caps) {
5277 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5278 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5279 in->flushing_caps &= ~cleaned;
5280 if (in->flushing_caps == 0) {
5281 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5282 num_flushing_caps--;
5283 if (in->flushing_cap_tids.empty())
5284 in->flushing_cap_item.remove_myself();
5285 }
5286 if (!in->caps_dirty())
5287 put_inode(in);
5288 }
5289 }
5290 }
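
// Flush acks are cumulative: acking tid T completes every pending cap flush
// with tid <= T, while entries with value 0 (cap snaps) are skipped. A
// simplified sketch of that sweep (the real loop above additionally narrows
// 'cleaned' by caps still being flushed under newer tids):
#if 0
#include <cstdint>
#include <map>

static int example_ack_sweep(std::map<uint64_t, int> &flushing,
                             uint64_t ack_tid)
{
  int flushed = 0;
  for (auto it = flushing.begin(); it != flushing.end(); ) {
    if (!it->second) { ++it; continue; }  // cap-snap entry, not a cap flush
    if (it->first <= ack_tid) {
      it = flushing.erase(it);            // completed by this or an earlier ack
      ++flushed;
    } else {
      break;                              // tids are ordered; nothing newer done
    }
  }
  return flushed;
}
#endif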
5291
5292
5293 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5294 {
5295 ceph_tid_t flush_ack_tid = m->get_client_tid();
5296 mds_rank_t mds = session->mds_num;
5297 ceph_assert(in->caps.count(mds));
5298 snapid_t follows = m->get_snap_follows();
5299
5300 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5301 auto& capsnap = it->second;
5302 if (flush_ack_tid != capsnap.flush_tid) {
5303 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5304 } else {
5305 InodeRef tmp_ref(in);
5306 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5307 << " on " << *in << dendl;
5308 session->flushing_caps_tids.erase(capsnap.flush_tid);
5309 in->flushing_cap_tids.erase(capsnap.flush_tid);
5310 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5311 in->flushing_cap_item.remove_myself();
5312 in->cap_snaps.erase(it);
5313
5314 signal_cond_list(in->waitfor_caps);
5315 if (session->flushing_caps_tids.empty() ||
5316 *session->flushing_caps_tids.begin() > flush_ack_tid)
5317 sync_cond.notify_all();
5318 }
5319 } else {
5320 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5321 << " on " << *in << dendl;
5322 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHSNAP acks back
5323 }
5324 }
5325
5326 class C_Client_DentryInvalidate : public Context {
5327 private:
5328 Client *client;
5329 vinodeno_t dirino;
5330 vinodeno_t ino;
5331 string name;
5332 public:
5333 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5334 client(c), name(dn->name) {
5335 if (client->use_faked_inos()) {
5336 dirino.ino = dn->dir->parent_inode->faked_ino;
5337 if (del)
5338 ino.ino = dn->inode->faked_ino;
5339 } else {
5340 dirino = dn->dir->parent_inode->vino();
5341 if (del)
5342 ino = dn->inode->vino();
5343 }
5344 if (!del)
5345 ino.ino = inodeno_t();
5346 }
5347 void finish(int r) override {
5348 // _async_dentry_invalidate is responsible for its own locking
5349 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5350 client->_async_dentry_invalidate(dirino, ino, name);
5351 }
5352 };
5353
5354 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5355 {
5356 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5357 if (!mref_reader.is_state_satisfied())
5358 return;
5359
5360 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5361 << " in dir " << dirino << dendl;
5362 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5363 }
5364
5365 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5366 {
5367 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5368 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5369 }
5370
5371 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5372 {
5373 int ref = in->get_nref();
5374 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5375
5376 if (in->dir && !in->dir->dentries.empty()) {
5377 for (auto p = in->dir->dentries.begin();
5378 p != in->dir->dentries.end(); ) {
5379 Dentry *dn = p->second;
5380 ++p;
5381 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5382 * We don't need to invalidate dentries recursively, because
5383 * invalidating a directory dentry effectively invalidates
5384 * the whole subtree */
5385 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5386 _try_to_trim_inode(dn->inode.get(), false);
5387
5388 if (dn->lru_is_expireable())
5389 unlink(dn, true, false); // keep dir, drop dentry
5390 }
5391 if (in->dir->dentries.empty()) {
5392 close_dir(in->dir);
5393 --ref;
5394 }
5395 }
5396
5397 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
5398 InodeRef snapdir = open_snapdir(in);
5399 _try_to_trim_inode(snapdir.get(), false);
5400 --ref;
5401 }
5402
5403 if (ref > 1) {
5404 auto q = in->dentries.begin();
5405 while (q != in->dentries.end()) {
5406 Dentry *dn = *q;
5407 ++q;
5408 if (in->ll_ref > 0 && sched_inval) {
5409 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5410 // so in->dentries doesn't always reflect the state of kernel's dcache.
5411 _schedule_invalidate_dentry_callback(dn, true);
5412 }
5413 unlink(dn, true, true);
5414 }
5415 }
5416 }
5417
5418 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5419 {
5420 mds_rank_t mds = session->mds_num;
5421 int used = get_caps_used(in);
5422 int wanted = in->caps_wanted();
5423 int flags = 0;
5424
5425 const unsigned new_caps = m->get_caps();
5426 const bool was_stale = session->cap_gen > cap->gen;
5427 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5428 << " mds." << mds << " seq " << m->get_seq()
5429 << " caps now " << ccap_string(new_caps)
5430 << " was " << ccap_string(cap->issued)
5431 << (was_stale ? " (stale)" : "") << dendl;
5432
5433 if (was_stale)
5434 cap->issued = cap->implemented = CEPH_CAP_PIN;
5435 cap->seq = m->get_seq();
5436 cap->gen = session->cap_gen;
5437
5438 check_cap_issue(in, new_caps);
5439
5440 // update inode
5441 int issued;
5442 in->caps_issued(&issued);
5443 issued |= in->caps_dirty();
5444
5445 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5446 !(issued & CEPH_CAP_AUTH_EXCL)) {
5447 in->mode = m->head.mode;
5448 in->uid = m->head.uid;
5449 in->gid = m->head.gid;
5450 in->btime = m->btime;
5451 }
5452 bool deleted_inode = false;
5453 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5454 !(issued & CEPH_CAP_LINK_EXCL)) {
5455 in->nlink = m->head.nlink;
5456 if (in->nlink == 0)
5457 deleted_inode = true;
5458 }
5459 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5460 m->xattrbl.length() &&
5461 m->head.xattr_version > in->xattr_version) {
5462 auto p = m->xattrbl.cbegin();
5463 decode(in->xattrs, p);
5464 in->xattr_version = m->head.xattr_version;
5465 }
5466
5467 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5468 in->dirstat.nfiles = m->get_nfiles();
5469 in->dirstat.nsubdirs = m->get_nsubdirs();
5470 }
5471
5472 if (new_caps & CEPH_CAP_ANY_RD) {
5473 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5474 m->get_ctime(), m->get_mtime(), m->get_atime());
5475 }
5476
5477 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5478 in->layout = m->get_layout();
5479 update_inode_file_size(in, issued, m->get_size(),
5480 m->get_truncate_seq(), m->get_truncate_size());
5481 }
5482
5483 if (m->inline_version > in->inline_version) {
5484 in->inline_data = m->inline_data;
5485 in->inline_version = m->inline_version;
5486 }
5487
5488 /* always take a newer change attr */
5489 if (m->get_change_attr() > in->change_attr)
5490 in->change_attr = m->get_change_attr();
5491
5492 // max_size
5493 if (cap == in->auth_cap &&
5494 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5495 (m->get_max_size() != in->max_size)) {
5496 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5497 in->max_size = m->get_max_size();
5498 if (in->max_size > in->wanted_max_size) {
5499 in->wanted_max_size = 0;
5500 in->requested_max_size = 0;
5501 }
5502 }
5503
5504 bool check = false;
5505 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5506 (wanted & ~(cap->wanted | new_caps))) {
5507 // If the mds is importing the cap, prior cap messages that update 'wanted'
5508 // may get dropped by the mds (migrate seq mismatch).
5509 //
5510 // We don't send a cap message to update 'wanted' if what we want is
5511 // already issued. If the mds revokes caps, the cap message that releases
5512 // caps also tells the mds what we want. But if caps were forcibly revoked
5513 // by the mds (session stale), we may not have told the mds what we want.
5514 check = true;
5515 }
5516
5517
5518 // update caps
5519 auto revoked = cap->issued & ~new_caps;
5520 if (revoked) {
5521 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5522 cap->issued = new_caps;
5523 cap->implemented |= new_caps;
5524
5525 // recall delegations if we're losing caps necessary for them
5526 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5527 in->recall_deleg(false);
5528 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5529 in->recall_deleg(true);
5530
5531 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5532 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5533 !_flush(in, new C_Client_FlushComplete(this, in))) {
5534 // waitin' for flush
5535 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5536 if (_release(in)) {
5537 check = true;
5538 flags = CHECK_CAPS_NODELAY;
5539 }
5540 } else {
5541 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5542 check = true;
5543 flags = CHECK_CAPS_NODELAY;
5544 }
5545 } else if (cap->issued == new_caps) {
5546 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5547 } else {
5548 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5549 cap->issued = new_caps;
5550 cap->implemented |= new_caps;
5551
5552 if (cap == in->auth_cap) {
5553 // is a non-auth MDS revoking the newly granted caps?
5554 for (const auto &p : in->caps) {
5555 if (&p.second == cap)
5556 continue;
5557 if (p.second.implemented & ~p.second.issued & new_caps) {
5558 check = true;
5559 break;
5560 }
5561 }
5562 }
5563 }
5564
5565 if (check)
5566 check_caps(in, flags);
5567
5568 // wake up waiters
5569 if (new_caps)
5570 signal_cond_list(in->waitfor_caps);
5571
5572 // may drop inode's last ref
5573 if (deleted_inode)
5574 _try_to_trim_inode(in, true);
5575 }
5576
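// Example (sketch): under client_permissions, an open for O_RDWR
// ultimately reduces to a check like
//
//   int r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
//   if (r < 0)
//     return r;   // typically -CEPHFS_EACCES
//
// Root (uid 0) passes everything except MAY_EXEC on inodes with no exec
// bit set; other users fall through POSIX ACLs to the plain mode bits.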
5577 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5578 {
5579 if (perms.uid() == 0) {
5580 // For root, exec is permitted only when at least one exec bit is set
5581 if((want & MAY_EXEC) && !(in->mode & S_IXUGO))
5582 return -CEPHFS_EACCES;
5583 return 0;
5584 }
5585
5586 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5587 int ret = _posix_acl_permission(in, perms, want);
5588 if (ret != -CEPHFS_EAGAIN)
5589 return ret;
5590 }
5591
5592 // check permissions before doing anything else
5593 if (!in->check_mode(perms, want))
5594 return -CEPHFS_EACCES;
5595 return 0;
5596 }
5597
5598 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5599 const UserPerm& perms)
5600 {
5601 int r = _getattr_for_perm(in, perms);
5602 if (r < 0)
5603 goto out;
5604
5605 r = 0;
5606 if (strncmp(name, "system.", 7) == 0) {
5607 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5608 r = -CEPHFS_EPERM;
5609 } else {
5610 r = inode_permission(in, perms, want);
5611 }
5612 out:
5613 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5614 return r;
5615 }
5616
5617 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5618 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5619 return out;
5620 }
5621
5622 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5623 const UserPerm& perms)
5624 {
5625 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5626 int r = _getattr_for_perm(in, perms);
5627 if (r < 0)
5628 goto out;
5629
5630 if (mask & CEPH_SETATTR_SIZE) {
5631 r = inode_permission(in, perms, MAY_WRITE);
5632 if (r < 0)
5633 goto out;
5634 }
5635
5636 r = -CEPHFS_EPERM;
5637 if (mask & CEPH_SETATTR_UID) {
5638 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5639 goto out;
5640 }
5641 if (mask & CEPH_SETATTR_GID) {
5642 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5643 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5644 goto out;
5645 }
5646
5647 if (mask & CEPH_SETATTR_MODE) {
5648 if (perms.uid() != 0 && perms.uid() != in->uid)
5649 goto out;
5650
5651 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5652 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5653 stx->stx_mode &= ~S_ISGID;
5654 }
5655
5656 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5657 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5658 if (perms.uid() != 0 && perms.uid() != in->uid) {
5659 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5660 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5661 check_mask |= CEPH_SETATTR_MTIME;
5662 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5663 check_mask |= CEPH_SETATTR_ATIME;
5664 if (check_mask & mask) {
5665 goto out;
5666 } else {
5667 r = inode_permission(in, perms, MAY_WRITE);
5668 if (r < 0)
5669 goto out;
5670 }
5671 }
5672 }
5673 r = 0;
5674 out:
5675 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5676 return r;
5677 }
5678
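// Example (sketch): O_ACCMODE and O_TRUNC map onto MAY_* wants, e.g.
//
//   may_open(in, O_RDWR | O_TRUNC, perms);  // checks MAY_READ | MAY_WRITE
//
// Opening a symlink fails with -CEPHFS_ELOOP, and opening a directory
// for write fails with -CEPHFS_EISDIR, before any permission check runs.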
5679 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5680 {
5681 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5682 unsigned want = 0;
5683
5684 if ((flags & O_ACCMODE) == O_WRONLY)
5685 want = MAY_WRITE;
5686 else if ((flags & O_ACCMODE) == O_RDWR)
5687 want = MAY_READ | MAY_WRITE;
5688 else if ((flags & O_ACCMODE) == O_RDONLY)
5689 want = MAY_READ;
5690 if (flags & O_TRUNC)
5691 want |= MAY_WRITE;
5692
5693 int r = 0;
5694 switch (in->mode & S_IFMT) {
5695 case S_IFLNK:
5696 r = -CEPHFS_ELOOP;
5697 goto out;
5698 case S_IFDIR:
5699 if (want & MAY_WRITE) {
5700 r = -CEPHFS_EISDIR;
5701 goto out;
5702 }
5703 break;
5704 }
5705
5706 r = _getattr_for_perm(in, perms);
5707 if (r < 0)
5708 goto out;
5709
5710 r = inode_permission(in, perms, want);
5711 out:
5712 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5713 return r;
5714 }
5715
5716 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5717 {
5718 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5719 int r = _getattr_for_perm(dir, perms);
5720 if (r < 0)
5721 goto out;
5722
5723 r = inode_permission(dir, perms, MAY_EXEC);
5724 out:
5725 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5726 return r;
5727 }
5728
5729 int Client::may_create(Inode *dir, const UserPerm& perms)
5730 {
5731 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5732 int r = _getattr_for_perm(dir, perms);
5733 if (r < 0)
5734 goto out;
5735
5736 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5737 out:
5738 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5739 return r;
5740 }
5741
5742 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5743 {
5744 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5745 int r = _getattr_for_perm(dir, perms);
5746 if (r < 0)
5747 goto out;
5748
5749 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5750 if (r < 0)
5751 goto out;
5752
5753 /* 'name == NULL' means rmsnap w/o permission checks */
5754 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5755 InodeRef otherin;
5756 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5757 if (r < 0)
5758 goto out;
5759 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5760 r = -CEPHFS_EPERM;
5761 }
5762 out:
5763 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5764 return r;
5765 }
5766
5767 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5768 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5769
5770 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5771 if (!mref_reader.is_state_satisfied())
5772 return -CEPHFS_ENOTCONN;
5773
5774 filepath path(relpath);
5775 string name = path.last_dentry();
5776 path.pop_dentry();
5777 InodeRef dir;
5778
5779 std::scoped_lock lock(client_lock);
5780 int r = path_walk(path, &dir, perms);
5781 if (r < 0)
5782 return r;
5783 if (cct->_conf->client_permissions) {
5784 int r = may_delete(dir.get(), name.c_str(), perms);
5785 if (r < 0)
5786 return r;
5787 }
5788
5789 return 0;
5790 }
5791
5792 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5793 {
5794 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5795 int r = _getattr_for_perm(in, perms);
5796 if (r < 0)
5797 goto out;
5798
5799 if (perms.uid() == 0 || perms.uid() == in->uid) {
5800 r = 0;
5801 goto out;
5802 }
5803
5804 r = -CEPHFS_EPERM;
5805 if (!S_ISREG(in->mode))
5806 goto out;
5807
5808 if (in->mode & S_ISUID)
5809 goto out;
5810
5811 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5812 goto out;
5813
5814 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5815 out:
5816 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5817 return r;
5818 }
5819
5820 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5821 {
5822 int mask = CEPH_STAT_CAP_MODE;
5823 bool force = false;
5824 if (acl_type != NO_ACL) {
5825 mask |= CEPH_STAT_CAP_XATTR;
5826 force = in->xattr_version == 0;
5827 }
5828 return _getattr(in, mask, perms, force);
5829 }
5830
5831 vinodeno_t Client::_get_vino(Inode *in)
5832 {
5833 /* The caller must hold the client lock */
5834 return vinodeno_t(in->ino, in->snapid);
5835 }
5836
5837 /**
5838 * Resolve an MDS spec to a list of MDS daemon GIDs.
5839 *
5840 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5841 * It may be '*' in which case it matches all GIDs.
5842 *
5843 * If no error is returned, the `targets` vector will be populated with at least
5844 * one MDS.
5845 */
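// Illustrative use (sketch): expanding a wildcard spec to every daemon.
//
//   std::vector<mds_gid_t> gids;
//   if (resolve_mds("*", &gids) == 0) {
//     // gids now holds one entry per MDS daemon in the FSMap
//   }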
5846 int Client::resolve_mds(
5847 const std::string &mds_spec,
5848 std::vector<mds_gid_t> *targets)
5849 {
5850 ceph_assert(fsmap);
5851 ceph_assert(targets != nullptr);
5852
5853 mds_role_t role;
5854 CachedStackStringStream css;
5855 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5856 if (role_r == 0) {
5857 // We got a role, resolve it to a GID
5858 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5859 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5860 << role << "' aka " << info.human_name() << dendl;
5861 targets->push_back(info.global_id);
5862 return 0;
5863 }
5864
5865 std::string strtol_err;
5866 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5867 if (strtol_err.empty()) {
5868 // It is a possible GID
5869 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5870 if (fsmap->gid_exists(mds_gid)) {
5871 auto& info = fsmap->get_info_gid(mds_gid);
5872 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5873 << info.human_name() << dendl;
5874 targets->push_back(mds_gid);
5875 return 0;
5876 } else {
5877 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5878 << dendl;
5879 lderr(cct) << "FSMap: " << *fsmap << dendl;
5880 return -CEPHFS_ENOENT;
5881 }
5882 } else if (mds_spec == "*") {
5883 // It is a wildcard: use all MDSs
5884 const auto& mds_info = fsmap->get_mds_info();
5885
5886 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5887 if (mds_info.empty()) {
5888 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5889 lderr(cct) << "FSMap: " << *fsmap << dendl;
5890 return -CEPHFS_ENOENT;
5891 }
5892
5893 for (const auto& [gid, info] : mds_info) {
5894 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
5895 targets->push_back(gid);
5896 }
5897 return 0;
5898 } else {
5899 // It did not parse as an integer, it is not a wildcard, it must be a name
5900 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5901 if (mds_gid == 0) {
5902 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
5903 lderr(cct) << "FSMap: " << *fsmap << dendl;
5904 return -CEPHFS_ENOENT;
5905 } else {
5906 auto& info = fsmap->get_info_gid(mds_gid);
5907 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
5908 << "' to " << info.human_name() << dendl;
5909 targets->push_back(mds_gid);
5910 }
5911 return 0;
5912 }
5913 }
5914
5915
5916 /**
5917 * Authenticate with mon and establish global ID
5918 */
5919 int Client::authenticate()
5920 {
5921 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5922
5923 if (monclient->is_authenticated()) {
5924 return 0;
5925 }
5926
5927 client_lock.unlock();
5928 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5929 client_lock.lock();
5930 if (r < 0) {
5931 return r;
5932 }
5933
5934 whoami = monclient->get_global_id();
5935 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5936
5937 return 0;
5938 }
5939
5940 int Client::fetch_fsmap(bool user)
5941 {
5942 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5943
5944 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5945 // rather than MDSMap because no one MDSMap contains all the daemons, and
5946 // a `tell` can address any daemon.
5947 version_t fsmap_latest;
5948 bs::error_code ec;
5949 do {
5950 client_lock.unlock();
5951 std::tie(fsmap_latest, std::ignore) =
5952 monclient->get_version("fsmap", ca::use_blocked[ec]);
5953 client_lock.lock();
5954 } while (ec == bs::errc::resource_unavailable_try_again);
5955
5956 if (ec) {
5957 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
5958 return ceph::from_error_code(ec);
5959 }
5960
5961 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5962
5963 if (user) {
5964 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5965 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5966 monclient->renew_subs();
5967 wait_on_list(waiting_for_fsmap);
5968 }
5969 ceph_assert(fsmap_user);
5970 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5971 } else {
5972 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5973 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5974 monclient->renew_subs();
5975 wait_on_list(waiting_for_fsmap);
5976 }
5977 ceph_assert(fsmap);
5978 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5979 }
5980 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5981 << fsmap_latest << dendl;
5982 return 0;
5983 }
5984
5985 /**
5986 * Send a command to one or more MDS daemons.
5987 *
5988 * @mds_spec one of ID, rank, GID, "*"
5989 */
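// Illustrative use (sketch, hypothetical caller): issuing "session ls"
// to all MDS daemons and waiting on the gathered completion.
//
//   bufferlist inbl, outbl;
//   std::string outs;
//   C_SaferCond cond;
//   int r = mds_command("*", {"{\"prefix\": \"session ls\"}"}, inbl,
//                       &outbl, &outs, &cond);
//   if (r == 0)
//     r = cond.wait();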
5990 int Client::mds_command(
5991 const std::string &mds_spec,
5992 const vector<string>& cmd,
5993 const bufferlist& inbl,
5994 bufferlist *outbl,
5995 string *outs,
5996 Context *onfinish)
5997 {
5998 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
5999 if (!iref_reader.is_state_satisfied())
6000 return -CEPHFS_ENOTCONN;
6001
6002 std::unique_lock cl(client_lock);
6003
6004 int r;
6005 r = authenticate();
6006 if (r < 0) {
6007 return r;
6008 }
6009
6010 r = fetch_fsmap(false);
6011 if (r < 0) {
6012 return r;
6013 }
6014
6015 // Look up MDS target(s) of the command
6016 std::vector<mds_gid_t> targets;
6017 r = resolve_mds(mds_spec, &targets);
6018 if (r < 0) {
6019 return r;
6020 }
6021
6022 // If daemons are laggy, we won't send them commands. If all
6023 // are laggy then we fail.
6024 std::vector<mds_gid_t> non_laggy;
6025 for (const auto& gid : targets) {
6026 const auto info = fsmap->get_info_gid(gid);
6027 if (!info.laggy()) {
6028 non_laggy.push_back(gid);
6029 }
6030 }
6031 if (non_laggy.size() == 0) {
6032 *outs = "All targeted MDS daemons are laggy";
6033 return -CEPHFS_ENOENT;
6034 }
6035
6036 if (metadata.empty()) {
6037 // We are called on an unmounted client, so metadata
6038 // won't be initialized yet.
6039 populate_metadata("");
6040 }
6041
6042 // Send commands to targets
6043 C_GatherBuilder gather(cct, onfinish);
6044 for (const auto& target_gid : non_laggy) {
6045 const auto info = fsmap->get_info_gid(target_gid);
6046
6047 // Open a connection to the target MDS
6048 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6049
6050 cl.unlock();
6051 {
6052 std::scoped_lock cmd_lock(command_lock);
6053 // Generate MDSCommandOp state
6054 auto &op = command_table.start_command();
6055
6056 op.on_finish = gather.new_sub();
6057 op.cmd = cmd;
6058 op.outbl = outbl;
6059 op.outs = outs;
6060 op.inbl = inbl;
6061 op.mds_gid = target_gid;
6062 op.con = conn;
6063
6064 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6065 << " tid=" << op.tid << cmd << dendl;
6066
6067 // Construct and send MCommand
6068 MessageRef m = op.get_message(monclient->get_fsid());
6069 conn->send_message2(std::move(m));
6070 }
6071 cl.lock();
6072 }
6073 gather.activate();
6074
6075 return 0;
6076 }
6077
6078 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6079 {
6080 ceph_tid_t const tid = m->get_tid();
6081
6082 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6083
6084 std::scoped_lock cmd_lock(command_lock);
6085 if (!command_table.exists(tid)) {
6086 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6087 return;
6088 }
6089
6090 auto &op = command_table.get_command(tid);
6091 if (op.outbl) {
6092 *op.outbl = m->get_data();
6093 }
6094 if (op.outs) {
6095 *op.outs = m->rs;
6096 }
6097
6098 if (op.on_finish) {
6099 op.on_finish->complete(m->r);
6100 }
6101
6102 command_table.erase(tid);
6103 }
6104
6105 // -------------------
6106 // MOUNT
6107
6108 int Client::subscribe_mdsmap(const std::string &fs_name)
6109 {
6110 int r = authenticate();
6111 if (r < 0) {
6112 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6113 return r;
6114 }
6115
6116 std::string resolved_fs_name;
6117 if (fs_name.empty()) {
6118 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6119 if (resolved_fs_name.empty())
6120 // Try the backwards compatibility fs name option
6121 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6122 } else {
6123 resolved_fs_name = fs_name;
6124 }
6125
6126 std::string want = "mdsmap";
6127 if (!resolved_fs_name.empty()) {
6128 r = fetch_fsmap(true);
6129 if (r < 0)
6130 return r;
6131 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6132 if (fscid == FS_CLUSTER_ID_NONE) {
6133 return -CEPHFS_ENOENT;
6134 }
6135
6136 std::ostringstream oss;
6137 oss << want << "." << fscid;
6138 want = oss.str();
6139 }
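// e.g. an fs name that resolves to fscid 1 yields want = "mdsmap.1";
// with no fs name configured we subscribe to the plain "mdsmap".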
6140 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6141
6142 monclient->sub_want(want, 0, 0);
6143 monclient->renew_subs();
6144
6145 return 0;
6146 }
6147
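// Illustrative use (sketch, hypothetical caller; "cephfs" is an assumed
// filesystem name and the UserPerm construction is for illustration):
//
//   UserPerm perms(getuid(), getgid());
//   int r = client->mount("/", perms, false, "cephfs");
//   ...
//   client->unmount();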
6148 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6149 bool require_mds, const std::string &fs_name)
6150 {
6151 ceph_assert(is_initialized());
6152
6153 /*
6154 * Make sure that _unmount() waits until this mount()
6155 * has finished.
6156 */
6157 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6158 if (!mref_writer.is_first_writer()) // already mounting or mounted
6159 return 0;
6160
6161 std::unique_lock cl(client_lock);
6162
6163 int r = subscribe_mdsmap(fs_name);
6164 if (r < 0) {
6165 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6166 return r;
6167 }
6168
6169 start_tick_thread(); // start tick thread
6170
6171 if (require_mds) {
6172 while (1) {
6173 auto availability = mdsmap->is_cluster_available();
6174 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6175 // Error out
6176 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6177 return CEPH_FUSE_NO_MDS_UP;
6178 } else if (availability == MDSMap::AVAILABLE) {
6179 // Continue to mount
6180 break;
6181 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6182 // Else, wait. MDSMonitor will update the map to bring
6183 // us to a conclusion eventually.
6184 wait_on_list(waiting_for_mdsmap);
6185 } else {
6186 // Unexpected value!
6187 ceph_abort();
6188 }
6189 }
6190 }
6191
6192 populate_metadata(mount_root.empty() ? "/" : mount_root);
6193
6194 filepath fp(CEPH_INO_ROOT);
6195 if (!mount_root.empty()) {
6196 fp = filepath(mount_root.c_str());
6197 }
6198 while (true) {
6199 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6200 req->set_filepath(fp);
6201 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6202 int res = make_request(req, perms);
6203 if (res < 0) {
6204 if (res == -CEPHFS_EACCES && root) {
6205 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6206 break;
6207 }
6208 return res;
6209 }
6210
6211 if (fp.depth())
6212 fp.pop_dentry();
6213 else
6214 break;
6215 }
6216
6217 ceph_assert(root);
6218 _ll_get(root.get());
6219
6220 // trace?
6221 if (!cct->_conf->client_trace.empty()) {
6222 traceout.open(cct->_conf->client_trace.c_str());
6223 if (traceout.is_open()) {
6224 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6225 } else {
6226 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6227 }
6228 }
6229
6230 /*
6231 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6232 ldout(cct, 3) << "op: struct stat st;" << dendl;
6233 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6234 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6235 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6236 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6237 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6238 ldout(cct, 3) << "op: int fd;" << dendl;
6239 */
6240
6241 mref_writer.update_state(CLIENT_MOUNTED);
6242 return 0;
6243 }
6244
6245 // UNMOUNT
6246
6247 void Client::_close_sessions()
6248 {
6249 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6250 if (it->second->state == MetaSession::STATE_REJECTED)
6251 mds_sessions.erase(it++);
6252 else
6253 ++it;
6254 }
6255
6256 while (!mds_sessions.empty()) {
6257 // send session closes!
6258 for (auto &p : mds_sessions) {
6259 if (p.second->state != MetaSession::STATE_CLOSING) {
6260 _close_mds_session(p.second.get());
6261 mds_ranks_closing.insert(p.first);
6262 }
6263 }
6264
6265 // wait for sessions to close
6266 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6267 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6268 << timo << "s)" << dendl;
6269 std::unique_lock l{client_lock, std::adopt_lock};
6270 if (!timo) {
6271 mount_cond.wait(l);
6272 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6273 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6274 while (!mds_ranks_closing.empty()) {
6275 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6276 // this prunes entry from mds_sessions and mds_ranks_closing
6277 _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
6278 }
6279 }
6280
6281 mds_ranks_closing.clear();
6282 l.release();
6283 }
6284 }
6285
6286 void Client::flush_mdlog_sync(Inode *in)
6287 {
6288 if (in->unsafe_ops.empty()) {
6289 return;
6290 }
6291
6292 std::set<mds_rank_t> anchor;
6293 for (auto &&p : in->unsafe_ops) {
6294 anchor.emplace(p->mds);
6295 }
6296 if (in->auth_cap) {
6297 anchor.emplace(in->auth_cap->session->mds_num);
6298 }
6299
6300 for (auto &rank : anchor) {
6301 auto session = &mds_sessions.at(rank);
6302 flush_mdlog(session->get());
6303 }
6304 }
6305
6306 void Client::flush_mdlog_sync()
6307 {
6308 if (mds_requests.empty())
6309 return;
6310 for (auto &p : mds_sessions) {
6311 flush_mdlog(p.second.get());
6312 }
6313 }
6314
6315 void Client::flush_mdlog(MetaSession *session)
6316 {
6317 // Only send this to Luminous or newer MDS daemons, older daemons
6318 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6319 const uint64_t features = session->con->get_features();
6320 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6321 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6322 session->con->send_message2(std::move(m));
6323 }
6324 }
6325
6326
6327 void Client::_abort_mds_sessions(int err)
6328 {
6329 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6330 auto req = p->second;
6331 ++p;
6332 // unsafe requests will be removed during close session below.
6333 if (req->got_unsafe)
6334 continue;
6335
6336 req->abort(err);
6337 if (req->caller_cond) {
6338 req->kick = true;
6339 req->caller_cond->notify_all();
6340 }
6341 }
6342
6343 // Process aborts on any requests that were on this waitlist.
6344 // Any requests that were on a waiting_for_open session waitlist
6345 // will get kicked during close session below.
6346 signal_cond_list(waiting_for_mdsmap);
6347
6348 // Force-close all sessions
6349 while(!mds_sessions.empty()) {
6350 auto session = mds_sessions.begin()->second;
6351 _closed_mds_session(session.get(), err);
6352 }
6353 }
6354
6355 void Client::_unmount(bool abort)
6356 {
6357 /*
6358 * We are unmounting the client.
6359 *
6360 * Switch the state to CLIENT_UNMOUNTING to block and fail
6361 * any newly arriving "readers", then wait for all the in-flight
6362 * "readers" to finish.
6363 */
6364 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6365 if (!mref_writer.is_first_writer())
6366 return;
6367 mref_writer.wait_readers_done();
6368
6369 std::unique_lock lock{client_lock};
6370
6371 if (abort || blocklisted) {
6372 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6373 } else {
6374 ldout(cct, 2) << "unmounting" << dendl;
6375 }
6376
6377 deleg_timeout = 0;
6378
6379 if (abort) {
6380 mount_aborted = true;
6381 // Abort all mds sessions
6382 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6383
6384 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6385 } else {
6386 // flush the mdlog for pending requests, if any
6387 flush_mdlog_sync();
6388 }
6389
6390 mount_cond.wait(lock, [this] {
6391 if (!mds_requests.empty()) {
6392 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6393 << dendl;
6394 }
6395 return mds_requests.empty();
6396 });
6397
6398 cwd.reset();
6399 root.reset();
6400
6401 // clean up any unclosed files
6402 while (!fd_map.empty()) {
6403 Fh *fh = fd_map.begin()->second;
6404 fd_map.erase(fd_map.begin());
6405 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6406 _release_fh(fh);
6407 }
6408
6409 while (!ll_unclosed_fh_set.empty()) {
6410 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6411 Fh *fh = *it;
6412 ll_unclosed_fh_set.erase(fh);
6413 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6414 _release_fh(fh);
6415 }
6416
6417 while (!opened_dirs.empty()) {
6418 dir_result_t *dirp = *opened_dirs.begin();
6419 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6420 _closedir(dirp);
6421 }
6422
6423 _ll_drop_pins();
6424
6425 if (cct->_conf->client_oc) {
6426 // flush/release all buffered data
6427 std::list<InodeRef> anchor;
6428 for (auto& p : inode_map) {
6429 Inode *in = p.second;
6430 if (!in) {
6431 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6432 ceph_assert(in);
6433 }
6434
6435 // prevent inode from getting freed
6436 anchor.emplace_back(in);
6437
6438 if (abort || blocklisted) {
6439 objectcacher->purge_set(&in->oset);
6440 } else if (!in->caps.empty()) {
6441 _release(in);
6442 _flush(in, new C_Client_FlushComplete(this, in));
6443 }
6444 }
6445 }
6446
6447 if (abort || blocklisted) {
6448 for (auto &q : mds_sessions) {
6449 auto s = q.second;
6450 for (auto p = s->dirty_list.begin(); !p.end(); ) {
6451 Inode *in = *p;
6452 ++p;
6453 if (in->dirty_caps) {
6454 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6455 in->mark_caps_clean();
6456 put_inode(in);
6457 }
6458 }
6459 }
6460 } else {
6461 flush_caps_sync();
6462 wait_sync_caps(last_flush_tid);
6463 }
6464
6465 // empty lru cache
6466 trim_cache();
6467
6468 delay_put_inodes();
6469
6470 while (lru.lru_get_size() > 0 ||
6471 !inode_map.empty()) {
6472 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6473 << "+" << inode_map.size() << " items"
6474 << ", waiting (for caps to release?)"
6475 << dendl;
6476
6477 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6478 r == std::cv_status::timeout) {
6479 dump_cache(NULL);
6480 }
6481 }
6482 ceph_assert(lru.lru_get_size() == 0);
6483 ceph_assert(inode_map.empty());
6484
6485 // stop tracing
6486 if (!cct->_conf->client_trace.empty()) {
6487 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6488 traceout.close();
6489 }
6490
6491 // stop the tick thread
6492 tick_thread_stopped = true;
6493 upkeep_cond.notify_one();
6494
6495 _close_sessions();
6496
6497 mref_writer.update_state(CLIENT_UNMOUNTED);
6498
6499 ldout(cct, 2) << "unmounted." << dendl;
6500 }
6501
6502 void Client::unmount()
6503 {
6504 _unmount(false);
6505 }
6506
6507 void Client::abort_conn()
6508 {
6509 _unmount(true);
6510 }
6511
6512 void Client::flush_cap_releases()
6513 {
6514 uint64_t nr_caps = 0;
6515
6516 // send any cap releases
6517 for (auto &p : mds_sessions) {
6518 auto session = p.second;
6519 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6520 p.first)) {
6521 nr_caps += session->release->caps.size();
6522 if (cct->_conf->client_inject_release_failure) {
6523 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6524 } else {
6525 session->con->send_message2(std::move(session->release));
6526 }
6527 session->release.reset();
6528 }
6529 }
6530
6531 if (nr_caps > 0) {
6532 dec_pinned_icaps(nr_caps);
6533 }
6534 }
6535
6536 void Client::renew_and_flush_cap_releases()
6537 {
6538 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6539
6540 if (!mount_aborted && mdsmap->get_epoch()) {
6541 // renew caps?
6542 utime_t el = ceph_clock_now() - last_cap_renew;
6543 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6544 renew_caps();
6545
6546 flush_cap_releases();
6547 }
6548 }
6549
6550 void Client::tick()
6551 {
6552 ldout(cct, 20) << "tick" << dendl;
6553
6554 utime_t now = ceph_clock_now();
6555
6556 /*
6557 * If mount() has not finished yet, time out stuck requests.
6558 */
6559 if (is_mounting() && !mds_requests.empty()) {
6560 MetaRequest *req = mds_requests.begin()->second;
6561
6562 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6563 req->abort(-CEPHFS_ETIMEDOUT);
6564 if (req->caller_cond) {
6565 req->kick = true;
6566 req->caller_cond->notify_all();
6567 }
6568 signal_cond_list(waiting_for_mdsmap);
6569 for (auto &p : mds_sessions) {
6570 signal_context_list(p.second->waiting_for_open);
6571 }
6572 }
6573 }
6574
6575 renew_and_flush_cap_releases();
6576
6577 // delayed caps
6578 xlist<Inode*>::iterator p = delayed_list.begin();
6579 while (!p.end()) {
6580 Inode *in = *p;
6581 ++p;
6582 if (!mount_aborted && in->hold_caps_until > now)
6583 break;
6584 delayed_list.pop_front();
6585 if (!mount_aborted)
6586 check_caps(in, CHECK_CAPS_NODELAY);
6587 }
6588
6589 if (!mount_aborted)
6590 collect_and_send_metrics();
6591
6592 delay_put_inodes(is_unmounting());
6593 trim_cache(true);
6594
6595 if (blocklisted && (is_mounted() || is_unmounting()) &&
6596 last_auto_reconnect + 30 * 60 < now &&
6597 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6598 messenger->client_reset();
6599 fd_gen++; // invalidate open files
6600 blocklisted = false;
6601 _kick_stale_sessions();
6602 last_auto_reconnect = now;
6603 }
6604 }
6605
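// Cadence sketch: with client_tick_interval = 1s and no injected delay,
// the loop below calls tick() roughly once per second; a tick fires once
// at least 90% of the effective interval has elapsed since the last one.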
6606 void Client::start_tick_thread()
6607 {
6608 upkeeper = std::thread([this]() {
6609 using time = ceph::coarse_mono_time;
6610 using sec = std::chrono::seconds;
6611
6612 auto last_tick = time::min();
6613
6614 std::unique_lock cl(client_lock);
6615 while (!tick_thread_stopped) {
6616 auto now = clock::now();
6617 auto since = now - last_tick;
6618
6619 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6620 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6621
6622 auto interval = std::max(t_interval, d_interval);
6623 if (likely(since >= interval*.90)) {
6624 tick();
6625 last_tick = clock::now();
6626 } else {
6627 interval -= since;
6628 }
6629
6630 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6631 if (!tick_thread_stopped)
6632 upkeep_cond.wait_for(cl, interval);
6633 }
6634 });
6635 }
6636
6637 void Client::collect_and_send_metrics() {
6638 ldout(cct, 20) << __func__ << dendl;
6639
6640 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6641
6642 // right now, we only track and send global metrics. it's sufficient
6643 // to send these metrics to MDS rank 0.
6644 collect_and_send_global_metrics();
6645 }
6646
6647 void Client::collect_and_send_global_metrics() {
6648 ldout(cct, 20) << __func__ << dendl;
6649 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6650
6651 if (!have_open_session((mds_rank_t)0)) {
6652 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6653 << dendl;
6654 return;
6655 }
6656 auto session = _get_or_open_mds_session((mds_rank_t)0);
6657 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6658 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6659 return;
6660 }
6661
6662 ClientMetricMessage metric;
6663 std::vector<ClientMetricMessage> message;
6664
6665 // read latency
6666 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6667 message.push_back(metric);
6668
6669 // write latency
6670 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6671 message.push_back(metric);
6672
6673 // metadata latency
6674 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6675 message.push_back(metric);
6676
6677 // cap hit ratio -- nr_caps is unused right now
6678 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6679 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6680 message.push_back(metric);
6681
6682 // dentry lease hit ratio
6683 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6684 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6685 message.push_back(metric);
6686
6687 // opened files
6688 {
6689 auto [opened_files, total_inodes] = get_opened_files_rates();
6690 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6691 }
6692 message.push_back(metric);
6693
6694 // pinned i_caps
6695 {
6696 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6697 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6698 }
6699 message.push_back(metric);
6700
6701 // opened inodes
6702 {
6703 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6704 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6705 }
6706 message.push_back(metric);
6707
6708 // read io sizes
6709 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6710 total_read_size));
6711 message.push_back(metric);
6712
6713 // write io sizes
6714 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6715 total_write_size));
6716 message.push_back(metric);
6717
6718 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6719 }
6720
6721 void Client::renew_caps()
6722 {
6723 ldout(cct, 10) << "renew_caps()" << dendl;
6724 last_cap_renew = ceph_clock_now();
6725
6726 for (auto &p : mds_sessions) {
6727 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6728 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6729 renew_caps(p.second.get());
6730 }
6731 }
6732
6733 void Client::renew_caps(MetaSession *session)
6734 {
6735 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6736 session->last_cap_renew_request = ceph_clock_now();
6737 uint64_t seq = ++session->cap_renew_seq;
6738 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6739 }
6740
6741
6742 // ===============================================================
6743 // high level (POSIXy) interface
6744
6745 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6746 InodeRef *target, const UserPerm& perms)
6747 {
6748 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6749 MetaRequest *req = new MetaRequest(op);
6750 filepath path;
6751 dir->make_nosnap_relative_path(path);
6752 path.push_dentry(name);
6753 req->set_filepath(path);
6754 req->set_inode(dir);
6755 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6756 mask |= DEBUG_GETATTR_CAPS;
6757 req->head.args.getattr.mask = mask;
6758
6759 ldout(cct, 10) << __func__ << " on " << path << dendl;
6760
6761 int r = make_request(req, perms, target);
6762 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6763 return r;
6764 }
6765
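// A dentry lease is valid only if the lease itself has not expired
// (lease_ttl in the future, lease_mds session still open) and the
// session's cap_ttl is current with cap_gen matching the dentry's
// lease_gen; anything else counts as a dlease miss.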
6766 bool Client::_dentry_valid(const Dentry *dn)
6767 {
6768 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6769
6770 // is dn lease valid?
6771 utime_t now = ceph_clock_now();
6772 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6773 mds_sessions.count(dn->lease_mds)) {
6774 auto s = mds_sessions.at(dn->lease_mds);
6775 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
6776 dlease_hit();
6777 return true;
6778 }
6779
6780 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6781 << " vs lease_gen " << dn->lease_gen << dendl;
6782 }
6783
6784 dlease_miss();
6785 return false;
6786 }
6787
6788 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6789 const UserPerm& perms, std::string* alternate_name)
6790 {
6791 int r = 0;
6792 Dentry *dn = NULL;
6793 bool did_lookup_request = false;
6794 // can only request shared caps
6795 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6796
6797 if (dname == "..") {
6798 if (dir->dentries.empty()) {
6799 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6800 filepath path(dir->ino);
6801 req->set_filepath(path);
6802
6803 InodeRef tmptarget;
6804 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6805
6806 if (r == 0) {
6807 *target = std::move(tmptarget);
6808 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6809 } else {
6810 *target = dir;
6811 }
6812 }
6813 else
6814 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6815 goto done;
6816 }
6817
6818 if (dname == ".") {
6819 *target = dir;
6820 goto done;
6821 }
6822
6823 if (!dir->is_dir()) {
6824 r = -CEPHFS_ENOTDIR;
6825 goto done;
6826 }
6827
6828 if (dname.length() > NAME_MAX) {
6829 r = -CEPHFS_ENAMETOOLONG;
6830 goto done;
6831 }
6832
6833 if (dname == cct->_conf->client_snapdir &&
6834 dir->snapid == CEPH_NOSNAP) {
6835 *target = open_snapdir(dir);
6836 goto done;
6837 }
6838
6839 relookup:
6840 if (dir->dir &&
6841 dir->dir->dentries.count(dname)) {
6842 dn = dir->dir->dentries[dname];
6843
6844 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6845 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6846
6847 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6848 if (_dentry_valid(dn)) {
6849 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6850 // make trim_caps() behave.
6851 dir->try_touch_cap(dn->lease_mds);
6852 goto hit_dn;
6853 }
6854 // dir shared caps?
6855 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6856 if (dn->cap_shared_gen == dir->shared_gen &&
6857 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6858 goto hit_dn;
6859 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6860 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6861 << *dir << " dn '" << dname << "'" << dendl;
6862 return -CEPHFS_ENOENT;
6863 }
6864 }
6865 } else {
6866 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6867 }
6868 } else {
6869 // can we conclude ENOENT locally?
6870 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6871 (dir->flags & I_COMPLETE)) {
6872 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6873 return -CEPHFS_ENOENT;
6874 }
6875 }
6876
6877 if (did_lookup_request) {
6878 r = 0;
6879 goto done;
6880 }
6881 r = _do_lookup(dir, dname, mask, target, perms);
6882 did_lookup_request = true;
6883 if (r == 0) {
6884 /* complete lookup to get dentry for alternate_name */
6885 goto relookup;
6886 } else {
6887 goto done;
6888 }
6889
6890 hit_dn:
6891 if (dn->inode) {
6892 *target = dn->inode;
6893 if (alternate_name)
6894 *alternate_name = dn->alternate_name;
6895 } else {
6896 r = -CEPHFS_ENOENT;
6897 }
6898 touch_dn(dn);
6899 goto done;
6900
6901 done:
6902 if (r < 0)
6903 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6904 else
6905 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6906 return r;
6907 }
6908
6909 int Client::get_or_create(Inode *dir, const char* name,
6910 Dentry **pdn, bool expect_null)
6911 {
6912 // lookup
6913 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6914 dir->open_dir();
6915 if (dir->dir->dentries.count(name)) {
6916 Dentry *dn = dir->dir->dentries[name];
6917 if (_dentry_valid(dn)) {
6918 if (expect_null)
6919 return -CEPHFS_EEXIST;
6920 }
6921 *pdn = dn;
6922 } else {
6923 // otherwise link up a new one
6924 *pdn = link(dir->dir, name, NULL, NULL);
6925 }
6926
6927 // success
6928 return 0;
6929 }
6930
6931 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
6932 {
6933 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6934 if (!mref_reader.is_state_satisfied())
6935 return -CEPHFS_ENOTCONN;
6936
6937 ldout(cct, 10) << __func__ << ": " << path << dendl;
6938
6939 std::scoped_lock lock(client_lock);
6940
6941 return path_walk(path, wdr, perms, followsym);
6942 }
6943
6944 int Client::path_walk(const filepath& origpath, InodeRef *end,
6945 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
6946 {
6947 walk_dentry_result wdr;
6948 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
6949 *end = std::move(wdr.in);
6950 return rc;
6951 }
6952
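// Symlink handling sketch: a symlink at a non-final component splices
// its target into the remaining path and restarts the walk, e.g. if
// "a" -> "/x", then walking "a/b/c" continues as "/x/b/c" from the
// root; a trailing symlink is only followed when followsym is set.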
6953 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
6954 bool followsym, int mask, InodeRef dirinode)
6955 {
6956 filepath path = origpath;
6957 InodeRef cur;
6958 std::string alternate_name;
6959 if (origpath.absolute())
6960 cur = root;
6961 else if (!dirinode)
6962 cur = cwd;
6963 else {
6964 cur = dirinode;
6965 }
6966 ceph_assert(cur);
6967
6968 ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
6969 ldout(cct, 10) << __func__ << " " << path << dendl;
6970
6971 int symlinks = 0;
6972
6973 unsigned i=0;
6974 while (i < path.depth() && cur) {
6975 int caps = 0;
6976 const string &dname = path[i];
6977 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6978 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6979 InodeRef next;
6980 if (cct->_conf->client_permissions) {
6981 int r = may_lookup(cur.get(), perms);
6982 if (r < 0)
6983 return r;
6984 caps = CEPH_CAP_AUTH_SHARED;
6985 }
6986
6987 /* Get extra requested caps on the last component */
6988 if (i == (path.depth() - 1))
6989 caps |= mask;
6990 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
6991 if (r < 0)
6992 return r;
6993 // only follow trailing symlink if followsym. always follow
6994 // 'directory' symlinks.
6995 if (next && next->is_symlink()) {
6996 symlinks++;
6997 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6998 if (symlinks > MAXSYMLINKS) {
6999 return -CEPHFS_ELOOP;
7000 }
7001
7002 if (i < path.depth() - 1) {
7003 // dir symlink
7004 // replace consumed components of path with symlink dir target
7005 filepath resolved(next->symlink.c_str());
7006 resolved.append(path.postfixpath(i + 1));
7007 path = resolved;
7008 i = 0;
7009 if (next->symlink[0] == '/') {
7010 cur = root;
7011 }
7012 continue;
7013 } else if (followsym) {
7014 if (next->symlink[0] == '/') {
7015 path = next->symlink.c_str();
7016 i = 0;
7017 // reset position
7018 cur = root;
7019 } else {
7020 filepath more(next->symlink.c_str());
7021 // we need to remove the symlink component from the path
7022 // before appending the target that the symlink points to. remain
7023 // at the same position in the path.
7024 path.pop_dentry();
7025 path.append(more);
7026 }
7027 continue;
7028 }
7029 }
7030 cur.swap(next);
7031 i++;
7032 }
7033 if (!cur)
7034 return -CEPHFS_ENOENT;
7035 if (result) {
7036 result->in = std::move(cur);
7037 result->alternate_name = std::move(alternate_name);
7038 }
7039 return 0;
7040 }
7041
7042
7043 // namespace ops
7044
7045 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7046 {
7047 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7048 if (!mref_reader.is_state_satisfied())
7049 return -CEPHFS_ENOTCONN;
7050
7051 tout(cct) << "link" << std::endl;
7052 tout(cct) << relexisting << std::endl;
7053 tout(cct) << relpath << std::endl;
7054
7055 filepath existing(relexisting);
7056
7057 InodeRef in, dir;
7058
7059 std::scoped_lock lock(client_lock);
7060 int r = path_walk(existing, &in, perm, true);
7061 if (r < 0)
7062 return r;
7063 if (std::string(relpath) == "/") {
7064 r = -CEPHFS_EEXIST;
7065 return r;
7066 }
7067 filepath path(relpath);
7068 string name = path.last_dentry();
7069 path.pop_dentry();
7070
7071 r = path_walk(path, &dir, perm, true);
7072 if (r < 0)
7073 return r;
7074 if (cct->_conf->client_permissions) {
7075 if (S_ISDIR(in->mode)) {
7076 r = -CEPHFS_EPERM;
7077 return r;
7078 }
7079 r = may_hardlink(in.get(), perm);
7080 if (r < 0)
7081 return r;
7082 r = may_create(dir.get(), perm);
7083 if (r < 0)
7084 return r;
7085 }
7086 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7087 return r;
7088 }
7089
7090 int Client::unlink(const char *relpath, const UserPerm& perm)
7091 {
7092 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7093 }
7094
7095 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7096 {
7097 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7098 if (!mref_reader.is_state_satisfied()) {
7099 return -CEPHFS_ENOTCONN;
7100 }
7101
7102 tout(cct) << __func__ << std::endl;
7103 tout(cct) << dirfd << std::endl;
7104 tout(cct) << relpath << std::endl;
7105 tout(cct) << flags << std::endl;
7106
7107 if (std::string(relpath) == "/") {
7108 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7109 }
7110
7111 filepath path(relpath);
7112 string name = path.last_dentry();
7113 path.pop_dentry();
7114 InodeRef dir;
7115
7116 std::scoped_lock lock(client_lock);
7117
7118 InodeRef dirinode;
7119 int r = get_fd_inode(dirfd, &dirinode);
7120 if (r < 0) {
7121 return r;
7122 }
7123
7124 r = path_walk(path, &dir, perm, true, 0, dirinode);
7125 if (r < 0) {
7126 return r;
7127 }
7128 if (cct->_conf->client_permissions) {
7129 r = may_delete(dir.get(), name.c_str(), perm);
7130 if (r < 0) {
7131 return r;
7132 }
7133 }
7134 if (flags & AT_REMOVEDIR) {
7135 r = _rmdir(dir.get(), name.c_str(), perm);
7136 } else {
7137 r = _unlink(dir.get(), name.c_str(), perm);
7138 }
7139 return r;
7140 }
7141
7142 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7143 {
7144 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7145 if (!mref_reader.is_state_satisfied())
7146 return -CEPHFS_ENOTCONN;
7147
7148 tout(cct) << __func__ << std::endl;
7149 tout(cct) << relfrom << std::endl;
7150 tout(cct) << relto << std::endl;
7151
7152 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7153 return -CEPHFS_EBUSY;
7154
7155 filepath from(relfrom);
7156 filepath to(relto);
7157 string fromname = from.last_dentry();
7158 from.pop_dentry();
7159 string toname = to.last_dentry();
7160 to.pop_dentry();
7161
7162 InodeRef fromdir, todir;
7163
7164 std::scoped_lock lock(client_lock);
7165 int r = path_walk(from, &fromdir, perm);
7166 if (r < 0)
7167 goto out;
7168 r = path_walk(to, &todir, perm);
7169 if (r < 0)
7170 goto out;
7171
7172 if (cct->_conf->client_permissions) {
7173 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7174 if (r < 0)
7175 return r;
7176 r = may_delete(todir.get(), toname.c_str(), perm);
7177 if (r < 0 && r != -CEPHFS_ENOENT)
7178 return r;
7179 }
7180 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7181 out:
7182 return r;
7183 }
7184
7185 // dirs
7186
7187 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7188 {
7189 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7190 }
7191
7192 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7193 std::string alternate_name)
7194 {
7195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7196 if (!mref_reader.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN;
7198
7199 tout(cct) << __func__ << std::endl;
7200 tout(cct) << dirfd << std::endl;
7201 tout(cct) << relpath << std::endl;
7202 tout(cct) << mode << std::endl;
7203 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7204
7205 if (std::string(relpath) == "/") {
7206 return -CEPHFS_EEXIST;
7207 }
7208
7209 filepath path(relpath);
7210 string name = path.last_dentry();
7211 path.pop_dentry();
7212 InodeRef dir;
7213
7214 std::scoped_lock lock(client_lock);
7215
7216 InodeRef dirinode;
7217 int r = get_fd_inode(dirfd, &dirinode);
7218 if (r < 0) {
7219 return r;
7220 }
7221
7222 r = path_walk(path, &dir, perm, true, 0, dirinode);
7223 if (r < 0) {
7224 return r;
7225 }
7226 if (cct->_conf->client_permissions) {
7227 r = may_create(dir.get(), perm);
7228 if (r < 0) {
7229 return r;
7230 }
7231 }
7232 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7233 }
7234
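// Behaviour sketch: mkdirs("a/b/c", ...) first walks the components
// that already exist, then creates one directory per remaining
// component; a -CEPHFS_EEXIST from a racing creator on an intermediate
// component is absorbed by re-looking that component up.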
7235 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7236 {
7237 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7238 if (!mref_reader.is_state_satisfied())
7239 return -CEPHFS_ENOTCONN;
7240
7241 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7242 tout(cct) << __func__ << std::endl;
7243 tout(cct) << relpath << std::endl;
7244 tout(cct) << mode << std::endl;
7245
7246 //get through existing parts of path
7247 filepath path(relpath);
7248 unsigned int i;
7249 int r = 0, caps = 0;
7250 InodeRef cur, next;
7251
7252 std::scoped_lock lock(client_lock);
7253 cur = cwd;
7254 for (i=0; i<path.depth(); ++i) {
7255 if (cct->_conf->client_permissions) {
7256 r = may_lookup(cur.get(), perms);
7257 if (r < 0)
7258 break;
7259 caps = CEPH_CAP_AUTH_SHARED;
7260 }
7261 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7262 if (r < 0)
7263 break;
7264 cur.swap(next);
7265 }
7266 if (r!=-CEPHFS_ENOENT) return r;
7267 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7268 //make new directory at each level
7269 for (; i<path.depth(); ++i) {
7270 if (cct->_conf->client_permissions) {
7271 r = may_create(cur.get(), perms);
7272 if (r < 0)
7273 return r;
7274 }
7275 //make new dir
7276 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7277
7278 //check proper creation/existence
7279 if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
7280 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7281 }
7282 if (r < 0)
7283 return r;
7284 //move to new dir and continue
7285 cur.swap(next);
7286 ldout(cct, 20) << __func__ << ": successfully created directory "
7287 << filepath(cur->ino).get_path() << dendl;
7288 }
7289 return 0;
7290 }
7291
7292 int Client::rmdir(const char *relpath, const UserPerm& perms)
7293 {
7294 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7295 }
7296
7297 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7298 {
7299 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7300 if (!mref_reader.is_state_satisfied())
7301 return -CEPHFS_ENOTCONN;
7302
7303 tout(cct) << __func__ << std::endl;
7304 tout(cct) << relpath << std::endl;
7305 tout(cct) << mode << std::endl;
7306 tout(cct) << rdev << std::endl;
7307
7308 if (std::string(relpath) == "/")
7309 return -CEPHFS_EEXIST;
7310
7311 filepath path(relpath);
7312 string name = path.last_dentry();
7313 path.pop_dentry();
7314 InodeRef dir;
7315
7316 std::scoped_lock lock(client_lock);
7317 int r = path_walk(path, &dir, perms);
7318 if (r < 0)
7319 return r;
7320 if (cct->_conf->client_permissions) {
7321 int r = may_create(dir.get(), perms);
7322 if (r < 0)
7323 return r;
7324 }
7325 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7326 }
7327
7328 // symlinks
7329
7330 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7331 {
7332 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7333 }
7334
7335 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7336 std::string alternate_name)
7337 {
7338 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7339 if (!mref_reader.is_state_satisfied()) {
7340 return -CEPHFS_ENOTCONN;
7341 }
7342
7343 tout(cct) << __func__ << std::endl;
7344 tout(cct) << target << std::endl;
7345 tout(cct) << dirfd << std::endl;
7346 tout(cct) << relpath << std::endl;
7347
7348 if (std::string(relpath) == "/") {
7349 return -CEPHFS_EEXIST;
7350 }
7351
7352 filepath path(relpath);
7353 string name = path.last_dentry();
7354 path.pop_dentry();
7355 InodeRef dir;
7356
7357 std::scoped_lock lock(client_lock);
7358
7359 InodeRef dirinode;
7360 int r = get_fd_inode(dirfd, &dirinode);
7361 if (r < 0) {
7362 return r;
7363 }
7364 r = path_walk(path, &dir, perms, true, 0, dirinode);
7365 if (r < 0) {
7366 return r;
7367 }
7368 if (cct->_conf->client_permissions) {
7369 int r = may_create(dir.get(), perms);
7370 if (r < 0) {
7371 return r;
7372 }
7373 }
7374 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7375 }
7376
7377 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7378 {
7379 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7380 }
7381
7382 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7383 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7384 if (!mref_reader.is_state_satisfied()) {
7385 return -CEPHFS_ENOTCONN;
7386 }
7387
7388 tout(cct) << __func__ << std::endl;
7389 tout(cct) << dirfd << std::endl;
7390 tout(cct) << relpath << std::endl;
7391
7392 InodeRef dirinode;
7393 std::scoped_lock lock(client_lock);
7394 int r = get_fd_inode(dirfd, &dirinode);
7395 if (r < 0) {
7396 return r;
7397 }
7398
7399 InodeRef in;
7400 filepath path(relpath);
7401 r = path_walk(path, &in, perms, false, 0, dirinode);
7402 if (r < 0) {
7403 return r;
7404 }
7405
7406 return _readlink(in.get(), buf, size);
7407 }
7408
7409 int Client::_readlink(Inode *in, char *buf, size_t size)
7410 {
7411 if (!in->is_symlink())
7412 return -CEPHFS_EINVAL;
7413
7414 // copy into buf (at most size bytes)
7415 int r = in->symlink.length();
7416 if (r > (int)size)
7417 r = size;
7418 memcpy(buf, in->symlink.c_str(), r);
7419 return r;
7420 }
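/*
 * Like readlink(2), _readlink() copies at most `size` bytes and does
 * not NUL-terminate the buffer; the return value is the number of
 * bytes copied. A hedged usage sketch (buffer size is our choice):
 *
 *   char buf[4096];
 *   int n = client->readlink("mylink", buf, sizeof(buf), perms);
 *   if (n >= 0)
 *     std::string target(buf, n);  // no NUL terminator provided
 */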
7421
7422
7423 // inode stuff
7424
7425 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7426 {
7427 bool yes = in->caps_issued_mask(mask, true);
7428
7429 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7430 if (yes && !force)
7431 return 0;
7432
7433 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7434 filepath path;
7435 in->make_nosnap_relative_path(path);
7436 req->set_filepath(path);
7437 req->set_inode(in);
7438 req->head.args.getattr.mask = mask;
7439
7440 int res = make_request(req, perms);
7441 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7442 return res;
7443 }
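/*
 * _getattr() is a no-op when the caps covering `mask` are already
 * issued (and `force` is unset): e.g. holding CEPH_CAP_AUTH_SHARED
 * lets uid/gid/mode queries be answered from the local cache without
 * a CEPH_MDS_OP_GETATTR round trip.
 */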
7444
7445 int Client::_getvxattr(
7446 Inode *in,
7447 const UserPerm& perms,
7448 const char *xattr_name,
7449 ssize_t size,
7450 void *value,
7451 mds_rank_t rank)
7452 {
7453 if (!xattr_name || strlen(xattr_name) == 0 || strlen(xattr_name) > 255) {
7454 return -CEPHFS_ENODATA;
7455 }
7456
7457 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
7458 filepath path;
7459 in->make_nosnap_relative_path(path);
7460 req->set_filepath(path);
7461 req->set_inode(in);
7462 req->set_string2(xattr_name);
7463
7464 bufferlist bl;
7465 int res = make_request(req, perms, nullptr, nullptr, rank, &bl);
7466 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7467
7468 if (res < 0) {
7469 return res;
7470 }
7471
7472 std::string buf;
7473 auto p = bl.cbegin();
7474
7475 DECODE_START(1, p);
7476 decode(buf, p);
7477 DECODE_FINISH(p);
7478
7479 ssize_t len = buf.length();
7480
7481 res = len; // per getxattr(2), size == 0 probes the length of the value
7482
7483 if (size > 0) {
7484 if (len > size) {
7485 res = -CEPHFS_ERANGE; // insufficient output buffer space
7486 } else {
7487 memcpy(value, buf.c_str(), len);
7488 }
7489 }
7490 return res;
7491 }
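/*
 * As with getxattr(2), a caller can probe the required buffer size by
 * passing size == 0: the value's length is returned without copying.
 * A hedged sketch (the vxattr name is just an illustration):
 *
 *   ssize_t len = _getvxattr(in, perms, "ceph.dir.pin", 0, nullptr, rank);
 *   // on success, call again with a buffer of at least `len` bytes
 */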
7492
7493 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7494 const UserPerm& perms, InodeRef *inp)
7495 {
7496 int issued = in->caps_issued();
7497 union ceph_mds_request_args args;
7498 bool kill_sguid = false;
7499 int inode_drop = 0;
7500
7501 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7502 ccap_string(issued) << dendl;
7503
7504 if (in->snapid != CEPH_NOSNAP) {
7505 return -CEPHFS_EROFS;
7506 }
7507 if ((mask & CEPH_SETATTR_SIZE) &&
7508 (uint64_t)stx->stx_size > in->size &&
7509 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7510 perms)) {
7511 return -CEPHFS_EDQUOT;
7512 }
7513
7514 memset(&args, 0, sizeof(args));
7515
7516 // make the change locally?
7517 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7518 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7519 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7520 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7521 << in->cap_dirtier_gid << ", forcing sync setattr"
7522 << dendl;
7523 /*
7524 * This works because we implicitly flush the caps as part of the
7525 * request, so the cap update check will happen with the writeback
7526 * cap context, and then the setattr check will happen with the
7527 * caller's context.
7528 *
7529 * In reality this pattern is likely pretty rare (different users
7530 * setattr'ing the same file). If that turns out not to be the
7531 * case later, we can build a more complex pipelined cap writeback
7532 * infrastructure...
7533 */
7534 mask |= CEPH_SETATTR_CTIME;
7535 }
7536
7537 if (!mask) {
7538 // caller just needs us to bump the ctime
7539 in->ctime = ceph_clock_now();
7540 in->cap_dirtier_uid = perms.uid();
7541 in->cap_dirtier_gid = perms.gid();
7542 if (issued & CEPH_CAP_AUTH_EXCL)
7543 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7544 else if (issued & CEPH_CAP_FILE_EXCL)
7545 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7546 else if (issued & CEPH_CAP_XATTR_EXCL)
7547 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7548 else
7549 mask |= CEPH_SETATTR_CTIME;
7550 }
7551
7552 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7553 kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7554
7555 mask &= ~CEPH_SETATTR_KILL_SGUID;
7556 } else if (mask & CEPH_SETATTR_SIZE) {
7557 /* If we don't have Ax, then we must ask the server to clear the setuid/setgid bits on truncate */
7558 mask |= CEPH_SETATTR_KILL_SGUID;
7559 inode_drop |= CEPH_CAP_AUTH_SHARED;
7560 }
7561
7562 if (mask & CEPH_SETATTR_UID) {
7563 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7564
7565 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7566 in->ctime = ceph_clock_now();
7567 in->cap_dirtier_uid = perms.uid();
7568 in->cap_dirtier_gid = perms.gid();
7569 in->uid = stx->stx_uid;
7570 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7571 mask &= ~CEPH_SETATTR_UID;
7572 kill_sguid = true;
7573 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7574 in->uid != stx->stx_uid) {
7575 args.setattr.uid = stx->stx_uid;
7576 inode_drop |= CEPH_CAP_AUTH_SHARED;
7577 } else {
7578 mask &= ~CEPH_SETATTR_UID;
7579 }
7580 }
7581
7582 if (mask & CEPH_SETATTR_GID) {
7583 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7584
7585 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7586 in->ctime = ceph_clock_now();
7587 in->cap_dirtier_uid = perms.uid();
7588 in->cap_dirtier_gid = perms.gid();
7589 in->gid = stx->stx_gid;
7590 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7591 mask &= ~CEPH_SETATTR_GID;
7592 kill_sguid = true;
7593 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7594 in->gid != stx->stx_gid) {
7595 args.setattr.gid = stx->stx_gid;
7596 inode_drop |= CEPH_CAP_AUTH_SHARED;
7597 } else {
7598 mask &= ~CEPH_SETATTR_GID;
7599 }
7600 }
7601
7602 if (mask & CEPH_SETATTR_MODE) {
7603 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7604
7605 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7606 in->ctime = ceph_clock_now();
7607 in->cap_dirtier_uid = perms.uid();
7608 in->cap_dirtier_gid = perms.gid();
7609 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7610 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7611 mask &= ~CEPH_SETATTR_MODE;
7612 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7613 in->mode != stx->stx_mode) {
7614 args.setattr.mode = stx->stx_mode;
7615 inode_drop |= CEPH_CAP_AUTH_SHARED;
7616 } else {
7617 mask &= ~CEPH_SETATTR_MODE;
7618 }
7619 } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
7620 kill_sguid && S_ISREG(in->mode) &&
7621 (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7622 /* Must squash any setuid/setgid bits with an ownership change */
7623 in->mode &= ~(S_ISUID|S_ISGID);
7624 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7625 }
7626
7627 if (mask & CEPH_SETATTR_BTIME) {
7628 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7629
7630 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7631 in->ctime = ceph_clock_now();
7632 in->cap_dirtier_uid = perms.uid();
7633 in->cap_dirtier_gid = perms.gid();
7634 in->btime = utime_t(stx->stx_btime);
7635 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7636 mask &= ~CEPH_SETATTR_BTIME;
7637 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7638 in->btime != utime_t(stx->stx_btime)) {
7639 args.setattr.btime = utime_t(stx->stx_btime);
7640 inode_drop |= CEPH_CAP_AUTH_SHARED;
7641 } else {
7642 mask &= ~CEPH_SETATTR_BTIME;
7643 }
7644 }
7645
7646 if (mask & CEPH_SETATTR_SIZE) {
7647 if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
7648 //too big!
7649 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7650 return -CEPHFS_EFBIG;
7651 }
7652
7653 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7654 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
7655 !(mask & CEPH_SETATTR_KILL_SGUID) &&
7656 stx->stx_size >= in->size) {
7657 if (stx->stx_size > in->size) {
7658 in->size = in->reported_size = stx->stx_size;
7659 in->cap_dirtier_uid = perms.uid();
7660 in->cap_dirtier_gid = perms.gid();
7661 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7662 mask &= ~(CEPH_SETATTR_SIZE);
7663 mask |= CEPH_SETATTR_MTIME;
7664 } else {
7665 // ignore it when size doesn't change
7666 mask &= ~(CEPH_SETATTR_SIZE);
7667 }
7668 } else {
7669 args.setattr.size = stx->stx_size;
7670 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7671 CEPH_CAP_FILE_WR;
7672 }
7673 }
7674
7675 if (mask & CEPH_SETATTR_MTIME) {
7676 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7677 in->mtime = utime_t(stx->stx_mtime);
7678 in->ctime = ceph_clock_now();
7679 in->cap_dirtier_uid = perms.uid();
7680 in->cap_dirtier_gid = perms.gid();
7681 in->time_warp_seq++;
7682 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7683 mask &= ~CEPH_SETATTR_MTIME;
7684 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7685 utime_t(stx->stx_mtime) > in->mtime) {
7686 in->mtime = utime_t(stx->stx_mtime);
7687 in->ctime = ceph_clock_now();
7688 in->cap_dirtier_uid = perms.uid();
7689 in->cap_dirtier_gid = perms.gid();
7690 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7691 mask &= ~CEPH_SETATTR_MTIME;
7692 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7693 in->mtime != utime_t(stx->stx_mtime)) {
7694 args.setattr.mtime = utime_t(stx->stx_mtime);
7695 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7696 CEPH_CAP_FILE_WR;
7697 } else {
7698 mask &= ~CEPH_SETATTR_MTIME;
7699 }
7700 }
7701
7702 if (mask & CEPH_SETATTR_ATIME) {
7703 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7704 in->atime = utime_t(stx->stx_atime);
7705 in->ctime = ceph_clock_now();
7706 in->cap_dirtier_uid = perms.uid();
7707 in->cap_dirtier_gid = perms.gid();
7708 in->time_warp_seq++;
7709 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7710 mask &= ~CEPH_SETATTR_ATIME;
7711 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7712 utime_t(stx->stx_atime) > in->atime) {
7713 in->atime = utime_t(stx->stx_atime);
7714 in->ctime = ceph_clock_now();
7715 in->cap_dirtier_uid = perms.uid();
7716 in->cap_dirtier_gid = perms.gid();
7717 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7718 mask &= ~CEPH_SETATTR_ATIME;
7719 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7720 in->atime != utime_t(stx->stx_atime)) {
7721 args.setattr.atime = utime_t(stx->stx_atime);
7722 inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7723 CEPH_CAP_FILE_WR;
7724 } else {
7725 mask &= ~CEPH_SETATTR_ATIME;
7726 }
7727 }
7728
7729 if (!mask) {
7730 in->change_attr++;
7731 return 0;
7732 }
7733
7734 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7735
7736 filepath path;
7737
7738 in->make_nosnap_relative_path(path);
7739 req->set_filepath(path);
7740 req->set_inode(in);
7741
7742 req->head.args = args;
7743 req->inode_drop = inode_drop;
7744 req->head.args.setattr.mask = mask;
7745 req->regetattr_mask = mask;
7746
7747 int res = make_request(req, perms, inp);
7748 ldout(cct, 10) << "_setattr result=" << res << dendl;
7749 return res;
7750 }
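/*
 * The recurring pattern above: when the relevant exclusive cap is held
 * (Ax for ownership/mode, Fx/Fw for size and times), the change is
 * applied to the local inode, the cap is marked dirty for later
 * writeback, and the bit is cleared from `mask`; whatever remains in
 * `mask` is sent to the MDS as a CEPH_MDS_OP_SETATTR, dropping the
 * caps in `inode_drop` so other clients revalidate.
 */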
7751
7752 /* Note that we only care about attrs that setattr cares about */
7753 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7754 {
7755 stx->stx_size = st->st_size;
7756 stx->stx_mode = st->st_mode;
7757 stx->stx_uid = st->st_uid;
7758 stx->stx_gid = st->st_gid;
7759 #ifdef __APPLE__
7760 stx->stx_mtime = st->st_mtimespec;
7761 stx->stx_atime = st->st_atimespec;
7762 #elif defined(_WIN32)
7763 stx->stx_mtime.tv_sec = st->st_mtime;
7764 stx->stx_atime.tv_sec = st->st_atime;
7765 #else
7766 stx->stx_mtime = st->st_mtim;
7767 stx->stx_atime = st->st_atim;
7768 #endif
7769 }
7770
7771 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7772 const UserPerm& perms, InodeRef *inp)
7773 {
7774 int ret = _do_setattr(in, stx, mask, perms, inp);
7775 if (ret < 0)
7776 return ret;
7777 if (mask & CEPH_SETATTR_MODE)
7778 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7779 return ret;
7780 }
7781
7782 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7783 const UserPerm& perms)
7784 {
7785 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7786 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7787 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7788 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7789 if (cct->_conf->client_permissions) {
7790 int r = may_setattr(in.get(), stx, mask, perms);
7791 if (r < 0)
7792 return r;
7793 }
7794 return __setattrx(in.get(), stx, mask, perms);
7795 }
7796
7797 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7798 const UserPerm& perms)
7799 {
7800 struct ceph_statx stx;
7801
7802 stat_to_statx(attr, &stx);
7803 mask &= ~CEPH_SETATTR_BTIME;
7804
7805 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7806 mask &= ~CEPH_SETATTR_UID;
7807 }
7808 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7809 mask &= ~CEPH_SETATTR_GID;
7810 }
7811
7812 return _setattrx(in, &stx, mask, perms);
7813 }
7814
7815 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7816 const UserPerm& perms)
7817 {
7818 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7819 if (!mref_reader.is_state_satisfied())
7820 return -CEPHFS_ENOTCONN;
7821
7822 tout(cct) << __func__ << std::endl;
7823 tout(cct) << relpath << std::endl;
7824 tout(cct) << mask << std::endl;
7825
7826 filepath path(relpath);
7827 InodeRef in;
7828
7829 std::scoped_lock lock(client_lock);
7830 int r = path_walk(path, &in, perms);
7831 if (r < 0)
7832 return r;
7833 return _setattr(in, attr, mask, perms);
7834 }
7835
7836 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7837 const UserPerm& perms, int flags)
7838 {
7839 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7840 if (!mref_reader.is_state_satisfied())
7841 return -CEPHFS_ENOTCONN;
7842
7843 tout(cct) << __func__ << std::endl;
7844 tout(cct) << relpath << std::endl;
7845 tout(cct) << mask << std::endl;
7846
7847 filepath path(relpath);
7848 InodeRef in;
7849
7850 std::scoped_lock lock(client_lock);
7851 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7852 if (r < 0)
7853 return r;
7854 return _setattrx(in, stx, mask, perms);
7855 }
7856
7857 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7858 {
7859 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7860 if (!mref_reader.is_state_satisfied())
7861 return -CEPHFS_ENOTCONN;
7862
7863 tout(cct) << __func__ << std::endl;
7864 tout(cct) << fd << std::endl;
7865 tout(cct) << mask << std::endl;
7866
7867 std::scoped_lock lock(client_lock);
7868 Fh *f = get_filehandle(fd);
7869 if (!f)
7870 return -CEPHFS_EBADF;
7871 #if defined(__linux__) && defined(O_PATH)
7872 if (f->flags & O_PATH)
7873 return -CEPHFS_EBADF;
7874 #endif
7875 return _setattr(f->inode, attr, mask, perms);
7876 }
7877
7878 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7879 {
7880 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7881 if (!mref_reader.is_state_satisfied())
7882 return -CEPHFS_ENOTCONN;
7883
7884 tout(cct) << __func__ << std::endl;
7885 tout(cct) << fd << std::endl;
7886 tout(cct) << mask << std::endl;
7887
7888 std::scoped_lock lock(client_lock);
7889 Fh *f = get_filehandle(fd);
7890 if (!f)
7891 return -CEPHFS_EBADF;
7892 #if defined(__linux__) && defined(O_PATH)
7893 if (f->flags & O_PATH)
7894 return -CEPHFS_EBADF;
7895 #endif
7896 return _setattrx(f->inode, stx, mask, perms);
7897 }
7898
7899 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7900 frag_info_t *dirstat, int mask)
7901 {
7902 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7903 if (!mref_reader.is_state_satisfied())
7904 return -CEPHFS_ENOTCONN;
7905
7906 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7907 tout(cct) << "stat" << std::endl;
7908 tout(cct) << relpath << std::endl;
7909
7910 filepath path(relpath);
7911 InodeRef in;
7912
7913 std::scoped_lock lock(client_lock);
7914 int r = path_walk(path, &in, perms, true, mask);
7915 if (r < 0)
7916 return r;
7917 r = _getattr(in, mask, perms);
7918 if (r < 0) {
7919 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7920 return r;
7921 }
7922 fill_stat(in, stbuf, dirstat);
7923 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7924 return r;
7925 }
7926
7927 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7928 {
7929 unsigned mask = 0;
7930
7931 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7932 if (flags & AT_NO_ATTR_SYNC)
7933 goto out;
7934
7935 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7936 mask |= CEPH_CAP_PIN;
7937 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7938 mask |= CEPH_CAP_AUTH_SHARED;
7939 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7940 mask |= CEPH_CAP_LINK_SHARED;
7941 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7942 mask |= CEPH_CAP_FILE_SHARED;
7943 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7944 mask |= CEPH_CAP_XATTR_SHARED;
7945 out:
7946 return mask;
7947 }
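/*
 * Example: a statx() wanting only CEPH_STATX_UID|CEPH_STATX_GID maps
 * to CEPH_CAP_PIN|CEPH_CAP_AUTH_SHARED, so no LINK/FILE/XATTR shared
 * caps are needed; AT_NO_ATTR_SYNC maps to 0, meaning "serve entirely
 * from whatever is cached".
 */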
7948
7949 int Client::statx(const char *relpath, struct ceph_statx *stx,
7950 const UserPerm& perms,
7951 unsigned int want, unsigned int flags)
7952 {
7953 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
7954 }
7955
7956 int Client::lstat(const char *relpath, struct stat *stbuf,
7957 const UserPerm& perms, frag_info_t *dirstat, int mask)
7958 {
7959 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7960 if (!mref_reader.is_state_satisfied())
7961 return -CEPHFS_ENOTCONN;
7962
7963 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7964 tout(cct) << __func__ << std::endl;
7965 tout(cct) << relpath << std::endl;
7966
7967 filepath path(relpath);
7968 InodeRef in;
7969
7970 std::scoped_lock lock(client_lock);
7971 // don't follow symlinks
7972 int r = path_walk(path, &in, perms, false, mask);
7973 if (r < 0)
7974 return r;
7975 r = _getattr(in, mask, perms);
7976 if (r < 0) {
7977 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7978 return r;
7979 }
7980 fill_stat(in, stbuf, dirstat);
7981 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7982 return r;
7983 }
7984
7985 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7986 {
7987 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7988 << " mode 0" << oct << in->mode << dec
7989 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7990 memset(st, 0, sizeof(struct stat));
7991 if (use_faked_inos())
7992 st->st_ino = in->faked_ino;
7993 else
7994 st->st_ino = in->ino;
7995 st->st_dev = in->snapid;
7996 st->st_mode = in->mode;
7997 st->st_rdev = in->rdev;
7998 if (in->is_dir()) {
7999 switch (in->nlink) {
8000 case 0:
8001 st->st_nlink = 0; /* dir is unlinked */
8002 break;
8003 case 1:
8004 st->st_nlink = 1 /* parent dentry */
8005 + 1 /* <dir>/. */
8006 + in->dirstat.nsubdirs; /* each <subdir>'s ".." back-reference */
8007 break;
8008 default:
8009 ceph_abort();
8010 }
8011 } else {
8012 st->st_nlink = in->nlink;
8013 }
8014 st->st_uid = in->uid;
8015 st->st_gid = in->gid;
8016 if (in->ctime > in->mtime) {
8017 stat_set_ctime_sec(st, in->ctime.sec());
8018 stat_set_ctime_nsec(st, in->ctime.nsec());
8019 } else {
8020 stat_set_ctime_sec(st, in->mtime.sec());
8021 stat_set_ctime_nsec(st, in->mtime.nsec());
8022 }
8023 stat_set_atime_sec(st, in->atime.sec());
8024 stat_set_atime_nsec(st, in->atime.nsec());
8025 stat_set_mtime_sec(st, in->mtime.sec());
8026 stat_set_mtime_nsec(st, in->mtime.nsec());
8027 if (in->is_dir()) {
8028 if (cct->_conf->client_dirsize_rbytes)
8029 st->st_size = in->rstat.rbytes;
8030 else
8031 st->st_size = in->dirstat.size();
8032 // The Windows "stat" structure provides just a subset of the fields that are
8033 // available on Linux.
8034 #ifndef _WIN32
8035 st->st_blocks = 1;
8036 #endif
8037 } else {
8038 st->st_size = in->size;
8039 #ifndef _WIN32
8040 st->st_blocks = (in->size + 511) >> 9;
8041 #endif
8042 }
8043 #ifndef _WIN32
8044 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8045 #endif
8046
8047 if (dirstat)
8048 *dirstat = in->dirstat;
8049 if (rstat)
8050 *rstat = in->rstat;
8051
8052 return in->caps_issued();
8053 }
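/*
 * Worked example for the directory nlink computation above: a dir with
 * 3 subdirectories reports st_nlink = 1 (its dentry in the parent)
 * + 1 (its own ".") + 3 (each subdir's "..") = 5, i.e. the POSIX
 * convention of 2 + nsubdirs.
 */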
8054
8055 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
8056 {
8057 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
8058 << " mode 0" << oct << in->mode << dec
8059 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
8060 memset(stx, 0, sizeof(struct ceph_statx));
8061
8062 /*
8063 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
8064 * so that all bits are set.
8065 */
8066 if (!mask)
8067 mask = ~0;
8068
8069 /* These are always considered to be available */
8070 stx->stx_dev = in->snapid;
8071 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8072
8073 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8074 stx->stx_mode = S_IFMT & in->mode;
8075 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
8076 stx->stx_rdev = in->rdev;
8077 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
8078
8079 if (mask & CEPH_CAP_AUTH_SHARED) {
8080 stx->stx_uid = in->uid;
8081 stx->stx_gid = in->gid;
8082 stx->stx_mode = in->mode;
8083 in->btime.to_timespec(&stx->stx_btime);
8084 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
8085 }
8086
8087 if (mask & CEPH_CAP_LINK_SHARED) {
8088 if (in->is_dir()) {
8089 switch (in->nlink) {
8090 case 0:
8091 stx->stx_nlink = 0; /* dir is unlinked */
8092 break;
8093 case 1:
8094 stx->stx_nlink = 1 /* parent dentry */
8095 + 1 /* <dir>/. */
8096 + in->dirstat.nsubdirs; /* each <subdir>'s ".." back-reference */
8097 break;
8098 default:
8099 ceph_abort();
8100 }
8101 } else {
8102 stx->stx_nlink = in->nlink;
8103 }
8104 stx->stx_mask |= CEPH_STATX_NLINK;
8105 }
8106
8107 if (mask & CEPH_CAP_FILE_SHARED) {
8108
8109 in->atime.to_timespec(&stx->stx_atime);
8110 in->mtime.to_timespec(&stx->stx_mtime);
8111
8112 if (in->is_dir()) {
8113 if (cct->_conf->client_dirsize_rbytes)
8114 stx->stx_size = in->rstat.rbytes;
8115 else
8116 stx->stx_size = in->dirstat.size();
8117 stx->stx_blocks = 1;
8118 } else {
8119 stx->stx_size = in->size;
8120 stx->stx_blocks = (in->size + 511) >> 9;
8121 }
8122 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
8123 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
8124 }
8125
8126 /* Change time and change_attr both require all shared caps to view */
8127 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
8128 stx->stx_version = in->change_attr;
8129 if (in->ctime > in->mtime)
8130 in->ctime.to_timespec(&stx->stx_ctime);
8131 else
8132 in->mtime.to_timespec(&stx->stx_ctime);
8133 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
8134 }
8135
8136 }
8137
8138 void Client::touch_dn(Dentry *dn)
8139 {
8140 lru.lru_touch(dn);
8141 }
8142
8143 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8144 {
8145 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
8146 }
8147
8148 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8149 {
8150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8151 if (!mref_reader.is_state_satisfied())
8152 return -CEPHFS_ENOTCONN;
8153
8154 tout(cct) << __func__ << std::endl;
8155 tout(cct) << fd << std::endl;
8156 tout(cct) << mode << std::endl;
8157
8158 std::scoped_lock lock(client_lock);
8159 Fh *f = get_filehandle(fd);
8160 if (!f)
8161 return -CEPHFS_EBADF;
8162 #if defined(__linux__) && defined(O_PATH)
8163 if (f->flags & O_PATH)
8164 return -CEPHFS_EBADF;
8165 #endif
8166 struct stat attr;
8167 attr.st_mode = mode;
8168 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8169 }
8170
8171 int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8172 const UserPerm& perms) {
8173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8174 if (!mref_reader.is_state_satisfied()) {
8175 return -CEPHFS_ENOTCONN;
8176 }
8177
8178 tout(cct) << __func__ << std::endl;
8179 tout(cct) << dirfd << std::endl;
8180 tout(cct) << relpath << std::endl;
8181 tout(cct) << mode << std::endl;
8182 tout(cct) << flags << std::endl;
8183
8184 filepath path(relpath);
8185 InodeRef in;
8186 InodeRef dirinode;
8187
8188 std::scoped_lock lock(client_lock);
8189 int r = get_fd_inode(dirfd, &dirinode);
8190 if (r < 0) {
8191 return r;
8192 }
8193
8194 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8195 if (r < 0) {
8196 return r;
8197 }
8198 struct stat attr;
8199 attr.st_mode = mode;
8200 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8201 }
8202
8203 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8204 {
8205 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8206 }
8207
8208 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8209 const UserPerm& perms)
8210 {
8211 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
8212 }
8213
8214 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8215 {
8216 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8217 if (!mref_reader.is_state_satisfied())
8218 return -CEPHFS_ENOTCONN;
8219
8220 tout(cct) << __func__ << std::endl;
8221 tout(cct) << fd << std::endl;
8222 tout(cct) << new_uid << std::endl;
8223 tout(cct) << new_gid << std::endl;
8224
8225 std::scoped_lock lock(client_lock);
8226 Fh *f = get_filehandle(fd);
8227 if (!f)
8228 return -CEPHFS_EBADF;
8229 #if defined(__linux__) && defined(O_PATH)
8230 if (f->flags & O_PATH)
8231 return -CEPHFS_EBADF;
8232 #endif
8233 struct stat attr;
8234 attr.st_uid = new_uid;
8235 attr.st_gid = new_gid;
8236 int mask = 0;
8237 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8238 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8239 return _setattr(f->inode, &attr, mask, perms);
8240 }
8241
8242 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8243 const UserPerm& perms)
8244 {
8245 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8246 }
8247
8248 int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8249 int flags, const UserPerm& perms) {
8250 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8251 if (!mref_reader.is_state_satisfied()) {
8252 return -CEPHFS_ENOTCONN;
8253 }
8254
8255 tout(cct) << __func__ << std::endl;
8256 tout(cct) << dirfd << std::endl;
8257 tout(cct) << relpath << std::endl;
8258 tout(cct) << new_uid << std::endl;
8259 tout(cct) << new_gid << std::endl;
8260 tout(cct) << flags << std::endl;
8261
8262 filepath path(relpath);
8263 InodeRef in;
8264 InodeRef dirinode;
8265
8266 std::scoped_lock lock(client_lock);
8267 int r = get_fd_inode(dirfd, &dirinode);
8268 if (r < 0) {
8269 return r;
8270 }
8271
8272 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8273 if (r < 0) {
8274 return r;
8275 }
8276 struct stat attr;
8277 attr.st_uid = new_uid;
8278 attr.st_gid = new_gid;
8279 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8280 }
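/*
 * As with chown(2), a uid of (uid_t)-1 or gid of (gid_t)-1 means
 * "leave unchanged": _setattr() strips CEPH_SETATTR_UID/GID from the
 * mask in that case, so e.g. chownat(dirfd, path, -1, grp, 0, perms)
 * changes only the group.
 */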
8281
8282 static void attr_set_atime_and_mtime(struct stat *attr,
8283 const utime_t &atime,
8284 const utime_t &mtime)
8285 {
8286 stat_set_atime_sec(attr, atime.tv.tv_sec);
8287 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8288 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8289 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8290 }
8291
8292 // for [l]utime(), invoke the timeval variants since the timespec
8293 // variants are not yet implemented for paths. for futime[s](),
8294 // invoke the timespec variant.
8295 int Client::utime(const char *relpath, struct utimbuf *buf,
8296 const UserPerm& perms)
8297 {
8298 struct timeval tv[2];
8299 tv[0].tv_sec = buf->actime;
8300 tv[0].tv_usec = 0;
8301 tv[1].tv_sec = buf->modtime;
8302 tv[1].tv_usec = 0;
8303
8304 return utimes(relpath, tv, perms);
8305 }
8306
8307 int Client::lutime(const char *relpath, struct utimbuf *buf,
8308 const UserPerm& perms)
8309 {
8310 struct timeval tv[2];
8311 tv[0].tv_sec = buf->actime;
8312 tv[0].tv_usec = 0;
8313 tv[1].tv_sec = buf->modtime;
8314 tv[1].tv_usec = 0;
8315
8316 return lutimes(relpath, tv, perms);
8317 }
8318
8319 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8320 {
8321 struct timespec ts[2];
8322 ts[0].tv_sec = buf->actime;
8323 ts[0].tv_nsec = 0;
8324 ts[1].tv_sec = buf->modtime;
8325 ts[1].tv_nsec = 0;
8326
8327 return futimens(fd, ts, perms);
8328 }
8329
8330 int Client::utimes(const char *relpath, struct timeval times[2],
8331 const UserPerm& perms)
8332 {
8333 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8334 if (!mref_reader.is_state_satisfied())
8335 return -CEPHFS_ENOTCONN;
8336
8337 tout(cct) << __func__ << std::endl;
8338 tout(cct) << relpath << std::endl;
8339 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8340 << std::endl;
8341 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8342 << std::endl;
8343
8344 filepath path(relpath);
8345 InodeRef in;
8346
8347 std::scoped_lock lock(client_lock);
8348 int r = path_walk(path, &in, perms);
8349 if (r < 0)
8350 return r;
8351 struct stat attr;
8352 utime_t atime(times[0]);
8353 utime_t mtime(times[1]);
8354
8355 attr_set_atime_and_mtime(&attr, atime, mtime);
8356 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8357 }
8358
8359 int Client::lutimes(const char *relpath, struct timeval times[2],
8360 const UserPerm& perms)
8361 {
8362 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8363 if (!mref_reader.is_state_satisfied())
8364 return -CEPHFS_ENOTCONN;
8365
8366 tout(cct) << __func__ << std::endl;
8367 tout(cct) << relpath << std::endl;
8368 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8369 << std::endl;
8370 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8371 << std::endl;
8372
8373 filepath path(relpath);
8374 InodeRef in;
8375
8376 std::scoped_lock lock(client_lock);
8377 int r = path_walk(path, &in, perms, false);
8378 if (r < 0)
8379 return r;
8380 struct stat attr;
8381 utime_t atime(times[0]);
8382 utime_t mtime(times[1]);
8383
8384 attr_set_atime_and_mtime(&attr, atime, mtime);
8385 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8386 }
8387
8388 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8389 {
8390 struct timespec ts[2];
8391 ts[0].tv_sec = times[0].tv_sec;
8392 ts[0].tv_nsec = times[0].tv_usec * 1000;
8393 ts[1].tv_sec = times[1].tv_sec;
8394 ts[1].tv_nsec = times[1].tv_usec * 1000;
8395
8396 return futimens(fd, ts, perms);
8397 }
8398
8399 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8400 {
8401 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8402 if (!mref_reader.is_state_satisfied())
8403 return -CEPHFS_ENOTCONN;
8404
8405 tout(cct) << __func__ << std::endl;
8406 tout(cct) << fd << std::endl;
8407 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8408 << std::endl;
8409 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8410 << std::endl;
8411
8412 std::scoped_lock lock(client_lock);
8413 Fh *f = get_filehandle(fd);
8414 if (!f)
8415 return -CEPHFS_EBADF;
8416 #if defined(__linux__) && defined(O_PATH)
8417 if (f->flags & O_PATH)
8418 return -CEPHFS_EBADF;
8419 #endif
8420 struct stat attr;
8421 utime_t atime(times[0]);
8422 utime_t mtime(times[1]);
8423
8424 attr_set_atime_and_mtime(&attr, atime, mtime);
8425 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8426 }
8427
8428 int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
8429 const UserPerm& perms) {
8430 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8431 if (!mref_reader.is_state_satisfied()) {
8432 return -CEPHFS_ENOTCONN;
8433 }
8434
8435 tout(cct) << __func__ << std::endl;
8436 tout(cct) << dirfd << std::endl;
8437 tout(cct) << relpath << std::endl;
8438 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8439 << std::endl;
8440 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8441 << std::endl;
8442 tout(cct) << flags << std::endl;
8443
8444 filepath path(relpath);
8445 InodeRef in;
8446 InodeRef dirinode;
8447
8448 std::scoped_lock lock(client_lock);
8449 int r = get_fd_inode(dirfd, &dirinode);
8450 if (r < 0) {
8451 return r;
8452 }
8453
8454 #if defined(__linux__) && defined(O_PATH)
8455 if (flags & O_PATH) {
8456 return -CEPHFS_EBADF;
8457 }
8458 #endif
8459
8460 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8461 if (r < 0) {
8462 return r;
8463 }
8464 struct stat attr;
8465 utime_t atime(times[0]);
8466 utime_t mtime(times[1]);
8467
8468 attr_set_atime_and_mtime(&attr, atime, mtime);
8469 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8470 }
8471
8472 int Client::flock(int fd, int operation, uint64_t owner)
8473 {
8474 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8475 if (!mref_reader.is_state_satisfied())
8476 return -CEPHFS_ENOTCONN;
8477
8478 tout(cct) << __func__ << std::endl;
8479 tout(cct) << fd << std::endl;
8480 tout(cct) << operation << std::endl;
8481 tout(cct) << owner << std::endl;
8482
8483 std::scoped_lock lock(client_lock);
8484 Fh *f = get_filehandle(fd);
8485 if (!f)
8486 return -CEPHFS_EBADF;
8487
8488 return _flock(f, operation, owner);
8489 }
8490
8491 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8492 {
8493 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8494 if (!mref_reader.is_state_satisfied())
8495 return -CEPHFS_ENOTCONN;
8496
8497 tout(cct) << __func__ << std::endl;
8498 tout(cct) << relpath << std::endl;
8499
8500 filepath path(relpath);
8501 InodeRef in;
8502
8503 std::scoped_lock lock(client_lock);
8504 int r = path_walk(path, &in, perms, true);
8505 if (r < 0)
8506 return r;
8507 if (cct->_conf->client_permissions) {
8508 int r = may_open(in.get(), O_RDONLY, perms);
8509 if (r < 0)
8510 return r;
8511 }
8512 r = _opendir(in.get(), dirpp, perms);
8513 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
8514 if (r != -CEPHFS_ENOTDIR)
8515 tout(cct) << (uintptr_t)*dirpp << std::endl;
8516 return r;
8517 }
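/*
 * A minimal directory-listing sketch using the handle-based API,
 * assuming a mounted Client* `client` and a process() helper of our
 * own (both hypothetical):
 *
 *   dir_result_t *dirp;
 *   if (client->opendir("/some/dir", &dirp, perms) == 0) {
 *     while (struct dirent *de = client->readdir(dirp))
 *       process(de->d_name);
 *     client->closedir(dirp);
 *   }
 */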
8518
8519 int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8520 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8521 if (!mref_reader.is_state_satisfied()) {
8522 return -CEPHFS_ENOTCONN;
8523 }
8524
8525 tout(cct) << __func__ << std::endl;
8526 tout(cct) << dirfd << std::endl;
8527
8528 InodeRef dirinode;
8529 std::scoped_lock locker(client_lock);
8530 int r = get_fd_inode(dirfd, &dirinode);
8531 if (r < 0) {
8532 return r;
8533 }
8534
8535 if (cct->_conf->client_permissions) {
8536 r = may_open(dirinode.get(), O_RDONLY, perms);
8537 if (r < 0) {
8538 return r;
8539 }
8540 }
8541 r = _opendir(dirinode.get(), dirpp, perms);
8542 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
8543 if (r != -CEPHFS_ENOTDIR) {
8544 tout(cct) << (uintptr_t)*dirpp << std::endl;
8545 }
8546 return r;
8547 }
8548
8549 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8550 {
8551 if (!in->is_dir())
8552 return -CEPHFS_ENOTDIR;
8553 *dirpp = new dir_result_t(in, perms);
8554 opened_dirs.insert(*dirpp);
8555 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8556 return 0;
8557 }
8558
8559
8560 int Client::closedir(dir_result_t *dir)
8561 {
8562 tout(cct) << __func__ << std::endl;
8563 tout(cct) << (uintptr_t)dir << std::endl;
8564
8565 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8566 std::scoped_lock lock(client_lock);
8567 _closedir(dir);
8568 return 0;
8569 }
8570
8571 void Client::_closedir(dir_result_t *dirp)
8572 {
8573 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8574
8575 if (dirp->inode) {
8576 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8577 dirp->inode.reset();
8578 }
8579 _readdir_drop_dirp_buffer(dirp);
8580 opened_dirs.erase(dirp);
8581 delete dirp;
8582 }
8583
8584 void Client::rewinddir(dir_result_t *dirp)
8585 {
8586 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8587
8588 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8589 if (!mref_reader.is_state_satisfied())
8590 return;
8591
8592 std::scoped_lock lock(client_lock);
8593 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8594 _readdir_drop_dirp_buffer(d);
8595 d->reset();
8596 }
8597
8598 loff_t Client::telldir(dir_result_t *dirp)
8599 {
8600 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8601 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8602 return d->offset;
8603 }
8604
8605 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8606 {
8607 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8608
8609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8610 if (!mref_reader.is_state_satisfied())
8611 return;
8612
8613 std::scoped_lock lock(client_lock);
8614
8615 if (offset == dirp->offset)
8616 return;
8617
8618 if (offset > dirp->offset)
8619 dirp->release_count = 0; // a forward seek skips entries; don't mark the dir complete later
8620 else
8621 dirp->ordered_count = 0; // disable filling readdir cache
8622
8623 if (dirp->hash_order()) {
8624 if (dirp->offset > offset) {
8625 _readdir_drop_dirp_buffer(dirp);
8626 dirp->reset();
8627 }
8628 } else {
8629 if (offset == 0 ||
8630 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8631 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8632 _readdir_drop_dirp_buffer(dirp);
8633 dirp->reset();
8634 }
8635 }
8636
8637 dirp->offset = offset;
8638 }
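/*
 * Directory offsets are synthetic: dir_result_t::make_fpos() packs the
 * fragment (or a hash position, in hash_order) into the high bits and
 * the position within it into the low bits. seekdir() can therefore
 * keep the buffered fragment only while the target offset still falls
 * inside it; any other seek drops the buffer and restarts.
 */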
8639
8640
8641 //struct dirent {
8642 // ino_t d_ino; /* inode number */
8643 // off_t d_off; /* offset to the next dirent */
8644 // unsigned short d_reclen; /* length of this record */
8645 // unsigned char d_type; /* type of file */
8646 // char d_name[256]; /* filename */
8647 //};
8648 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8649 {
8650 strncpy(de->d_name, name, 255);
8651 de->d_name[255] = '\0';
8652 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8653 de->d_ino = ino;
8654 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8655 de->d_off = next_off;
8656 #endif
8657 de->d_reclen = 1;
8658 de->d_type = IFTODT(type);
8659 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8660 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8661 #endif
8662 }
8663
8664 void Client::_readdir_next_frag(dir_result_t *dirp)
8665 {
8666 frag_t fg = dirp->buffer_frag;
8667
8668 if (fg.is_rightmost()) {
8669 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8670 dirp->set_end();
8671 return;
8672 }
8673
8674 // advance
8675 fg = fg.next();
8676 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8677
8678 if (dirp->hash_order()) {
8679 // keep last_name
8680 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8681 if (dirp->offset < new_offset) // don't decrease offset
8682 dirp->offset = new_offset;
8683 } else {
8684 dirp->last_name.clear();
8685 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8686 _readdir_rechoose_frag(dirp);
8687 }
8688 }
8689
8690 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8691 {
8692 ceph_assert(dirp->inode);
8693
8694 if (dirp->hash_order())
8695 return;
8696
8697 frag_t cur = frag_t(dirp->offset_high());
8698 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8699 if (fg != cur) {
8700 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8701 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8702 dirp->last_name.clear();
8703 dirp->next_offset = 2;
8704 }
8705 }
8706
8707 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8708 {
8709 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8710 dirp->buffer.clear();
8711 }
8712
8713 int Client::_readdir_get_frag(dir_result_t *dirp)
8714 {
8715 ceph_assert(dirp);
8716 ceph_assert(dirp->inode);
8717
8718 // get the current frag.
8719 frag_t fg;
8720 if (dirp->hash_order())
8721 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8722 else
8723 fg = frag_t(dirp->offset_high());
8724
8725 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8726 << " offset " << hex << dirp->offset << dec << dendl;
8727
8728 int op = CEPH_MDS_OP_READDIR;
8729 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8730 op = CEPH_MDS_OP_LSSNAP;
8731
8732 InodeRef& diri = dirp->inode;
8733
8734 MetaRequest *req = new MetaRequest(op);
8735 filepath path;
8736 diri->make_nosnap_relative_path(path);
8737 req->set_filepath(path);
8738 req->set_inode(diri.get());
8739 req->head.args.readdir.frag = fg;
8740 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8741 if (dirp->last_name.length()) {
8742 req->path2.set_path(dirp->last_name);
8743 } else if (dirp->hash_order()) {
8744 req->head.args.readdir.offset_hash = dirp->offset_high();
8745 }
8746 req->dirp = dirp;
8747
8748 bufferlist dirbl;
8749 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8750
8751 if (res == -CEPHFS_EAGAIN) {
8752 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8753 _readdir_rechoose_frag(dirp);
8754 return _readdir_get_frag(dirp);
8755 }
8756
8757 if (res == 0) {
8758 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8759 << " size " << dirp->buffer.size() << dendl;
8760 } else {
8761 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8762 dirp->set_end();
8763 }
8764
8765 return res;
8766 }
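/*
 * -CEPHFS_EAGAIN from the MDS means the fragment was split or merged
 * under us, so _readdir_get_frag() rechooses the frag from the updated
 * dirfragtree and retries. Large fragments arrive in chunks keyed by
 * `last_name`, which is why a continuation request carries it in path2.
 */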
8767
8768 struct dentry_off_lt {
8769 bool operator()(const Dentry* dn, int64_t off) const {
8770 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8771 }
8772 };
8773
8774 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8775 int caps, bool getref)
8776 {
8777 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8778 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8779 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8780 << dendl;
8781 Dir *dir = dirp->inode->dir;
8782
8783 if (!dir) {
8784 ldout(cct, 10) << " dir is empty" << dendl;
8785 dirp->set_end();
8786 return 0;
8787 }
8788
8789 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8790 dir->readdir_cache.end(),
8791 dirp->offset, dentry_off_lt());
8792
8793 string dn_name;
8794 while (true) {
8795 int mask = caps;
8796 if (!dirp->inode->is_complete_and_ordered())
8797 return -CEPHFS_EAGAIN;
8798 if (pd == dir->readdir_cache.end())
8799 break;
8800 Dentry *dn = *pd;
8801 if (dn->inode == NULL) {
8802 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8803 ++pd;
8804 continue;
8805 }
8806 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8807 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8808 ++pd;
8809 continue;
8810 }
8811
8812 int idx = pd - dir->readdir_cache.begin();
8813 if (dn->inode->is_dir()) {
8814 mask |= CEPH_STAT_RSTAT;
8815 }
8816 int r = _getattr(dn->inode, mask, dirp->perms);
8817 if (r < 0)
8818 return r;
8819
8820 // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8821 pd = dir->readdir_cache.begin() + idx;
8822 if (pd >= dir->readdir_cache.end() || *pd != dn)
8823 return -CEPHFS_EAGAIN;
8824
8825 struct ceph_statx stx;
8826 struct dirent de;
8827 fill_statx(dn->inode, caps, &stx);
8828
8829 uint64_t next_off = dn->offset + 1;
8830 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8831 ++pd;
8832 if (pd == dir->readdir_cache.end())
8833 next_off = dir_result_t::END;
8834
8835 Inode *in = NULL;
8836 if (getref) {
8837 in = dn->inode.get();
8838 _ll_get(in);
8839 }
8840
8841 dn_name = dn->name; // fill in name while we have lock
8842
8843 client_lock.unlock();
8844 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8845 client_lock.lock();
8846 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8847 << " = " << r << dendl;
8848 if (r < 0) {
8849 return r;
8850 }
8851
8852 dirp->offset = next_off;
8853 if (dirp->at_end())
8854 dirp->next_offset = 2;
8855 else
8856 dirp->next_offset = dirp->offset_low();
8857 dirp->last_name = dn_name; // we successfully returned this one; update!
8858 dirp->release_count = 0; // last_name no longer matches the cache index
8859 if (r > 0)
8860 return r;
8861 }
8862
8863 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8864 dirp->set_end();
8865 return 0;
8866 }
8867
8868 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8869 unsigned want, unsigned flags, bool getref)
8870 {
8871 int caps = statx_to_mask(flags, want);
8872
8873 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8874 if (!mref_reader.is_state_satisfied())
8875 return -CEPHFS_ENOTCONN;
8876
8877 std::unique_lock cl(client_lock);
8878
8879 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8880
8881 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8882 << dec << " at_end=" << dirp->at_end()
8883 << " hash_order=" << dirp->hash_order() << dendl;
8884
8885 struct dirent de;
8886 struct ceph_statx stx;
8887 memset(&de, 0, sizeof(de));
8888 memset(&stx, 0, sizeof(stx));
8889
8890 InodeRef& diri = dirp->inode;
8891
8892 if (dirp->at_end())
8893 return 0;
8894
8895 if (dirp->offset == 0) {
8896 ldout(cct, 15) << " including ." << dendl;
8897 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8898 uint64_t next_off = 1;
8899
8900 int r;
8901 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
8902 if (r < 0)
8903 return r;
8904
8905 fill_statx(diri, caps, &stx);
8906 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8907
8908 Inode *inode = NULL;
8909 if (getref) {
8910 inode = diri.get();
8911 _ll_get(inode);
8912 }
8913
8914 cl.unlock();
8915 r = cb(p, &de, &stx, next_off, inode);
8916 cl.lock();
8917 if (r < 0)
8918 return r;
8919
8920 dirp->offset = next_off;
8921 if (r > 0)
8922 return r;
8923 }
8924 if (dirp->offset == 1) {
8925 ldout(cct, 15) << " including .." << dendl;
8926 uint64_t next_off = 2;
8927 InodeRef in;
8928 if (diri->dentries.empty())
8929 in = diri;
8930 else
8931 in = diri->get_first_parent()->dir->parent_inode;
8932
8933 int r;
8934 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
8935 if (r < 0)
8936 return r;
8937
8938 fill_statx(in, caps, &stx);
8939 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8940
8941 Inode *inode = NULL;
8942 if (getref) {
8943 inode = in.get();
8944 _ll_get(inode);
8945 }
8946
8947 cl.unlock();
8948 r = cb(p, &de, &stx, next_off, inode);
8949 cl.lock();
8950 if (r < 0)
8951 return r;
8952
8953 dirp->offset = next_off;
8954 if (r > 0)
8955 return r;
8956 }
8957
8958 // can we read from our cache?
8959 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8960 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8961 << dirp->inode->is_complete_and_ordered()
8962 << " issued " << ccap_string(dirp->inode->caps_issued())
8963 << dendl;
8964 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8965 dirp->inode->is_complete_and_ordered() &&
8966 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8967 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8968 if (err != -CEPHFS_EAGAIN)
8969 return err;
8970 }
8971
8972 while (1) {
8973 if (dirp->at_end())
8974 return 0;
8975
8976 bool check_caps = true;
8977 if (!dirp->is_cached()) {
8978 int r = _readdir_get_frag(dirp);
8979 if (r)
8980 return r;
8981 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8982 // different from the requested one (our dirfragtree was outdated).
8983 check_caps = false;
8984 }
8985 frag_t fg = dirp->buffer_frag;
8986
8987 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8988 << " offset " << hex << dirp->offset << dendl;
8989
8990 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8991 dirp->offset, dir_result_t::dentry_off_lt());
8992 it != dirp->buffer.end();
8993 ++it) {
8994 dir_result_t::dentry &entry = *it;
8995
8996 uint64_t next_off = entry.offset + 1;
8997
8998 int r;
8999 if (check_caps) {
9000 int mask = caps;
9001 if (entry.inode->is_dir()) {
9002 mask |= CEPH_STAT_RSTAT;
9003 }
9004 r = _getattr(entry.inode, mask, dirp->perms);
9005 if (r < 0)
9006 return r;
9007 }
9008
9009 fill_statx(entry.inode, caps, &stx);
9010 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
9011
9012 Inode *inode = NULL;
9013 if (getref) {
9014 inode = entry.inode.get();
9015 _ll_get(inode);
9016 }
9017
9018 cl.unlock();
9019 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
9020 cl.lock();
9021
9022 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
9023 << " = " << r << dendl;
9024 if (r < 0)
9025 return r;
9026
9027 dirp->offset = next_off;
9028 if (r > 0)
9029 return r;
9030 }
9031
9032 if (dirp->next_offset > 2) {
9033 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
9034 _readdir_drop_dirp_buffer(dirp);
9035 continue; // more!
9036 }
9037
9038 if (!fg.is_rightmost()) {
9039 // next frag!
9040 _readdir_next_frag(dirp);
9041 continue;
9042 }
9043
9044 if (diri->shared_gen == dirp->start_shared_gen &&
9045 diri->dir_release_count == dirp->release_count) {
9046 if (diri->dir_ordered_count == dirp->ordered_count) {
9047 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
9048 if (diri->dir) {
9049 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
9050 diri->dir->readdir_cache.resize(dirp->cache_index);
9051 }
9052 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
9053 } else {
9054 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
9055 diri->flags |= I_COMPLETE;
9056 }
9057 }
9058
9059 dirp->set_end();
9060 return 0;
9061 }
9062 ceph_abort();
9063 return 0;
9064 }
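/*
 * Callback contract for readdir_r_cb(): `cb` is invoked once per entry
 * with client_lock dropped; returning 0 continues the walk, >0 stops
 * it successfully (the single-dirent wrappers below rely on this), and
 * <0 aborts with that error. dirp->offset always points at the _next_
 * entry, so iteration can resume where it left off.
 */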
9065
9066
9067 int Client::readdir_r(dir_result_t *d, struct dirent *de)
9068 {
9069 return readdirplus_r(d, de, 0, 0, 0, NULL);
9070 }
9071
9072 /*
9073 * readdirplus_r
9074 *
9075 * returns
9076 * 1 if we got a dirent
9077 * 0 for end of directory
9078 * <0 on error
9079 */
9080
9081 struct single_readdir {
9082 struct dirent *de;
9083 struct ceph_statx *stx;
9084 Inode *inode;
9085 bool full;
9086 };
9087
9088 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9089 struct ceph_statx *stx, off_t off,
9090 Inode *in)
9091 {
9092 single_readdir *c = static_cast<single_readdir *>(p);
9093
9094 if (c->full)
9095 return -1; // already filled this dirent
9096
9097 *c->de = *de;
9098 if (c->stx)
9099 *c->stx = *stx;
9100 c->inode = in;
9101 c->full = true;
9102 return 1;
9103 }
9104
9105 struct dirent *Client::readdir(dir_result_t *d)
9106 {
9107 int ret;
9108 auto& de = d->de;
9109 single_readdir sr;
9110 sr.de = &de;
9111 sr.stx = NULL;
9112 sr.inode = NULL;
9113 sr.full = false;
9114
9115 // our callback fills the dirent and sets sr.full=true on first
9116 // call, and returns -1 the second time around.
9117 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9118 if (ret < -1) {
9119 errno = -ret; // this sucks.
9120 return (dirent *) NULL;
9121 }
9122 if (sr.full) {
9123 return &de;
9124 }
9125 return (dirent *) NULL;
9126 }
9127
9128 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9129 struct ceph_statx *stx, unsigned want,
9130 unsigned flags, Inode **out)
9131 {
9132 single_readdir sr;
9133 sr.de = de;
9134 sr.stx = stx;
9135 sr.inode = NULL;
9136 sr.full = false;
9137
9138 // our callback fills the dirent and sets sr.full=true on first
9139 // call, and returns -1 the second time around.
9140 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9141 if (r < -1)
9142 return r;
9143 if (out)
9144 *out = sr.inode;
9145 if (sr.full)
9146 return 1;
9147 return 0;
9148 }
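/*
 * A hedged iteration sketch for readdirplus_r(); `want` picks the
 * statx fields to fetch, and note() is a placeholder of our own:
 *
 *   struct dirent de;
 *   struct ceph_statx stx;
 *   int r;
 *   while ((r = client->readdirplus_r(d, &de, &stx,
 *                                     CEPH_STATX_INO, 0, nullptr)) == 1)
 *     note(de.d_name, stx.stx_ino);
 *   // r == 0: end of directory; r < 0: error
 */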
9149
9150
9151 /* getdents */
9152 struct getdents_result {
9153 char *buf;
9154 int buflen;
9155 int pos;
9156 bool fullent;
9157 };
9158
9159 static int _readdir_getdent_cb(void *p, struct dirent *de,
9160 struct ceph_statx *stx, off_t off, Inode *in)
9161 {
9162 struct getdents_result *c = static_cast<getdents_result *>(p);
9163
9164 int dlen;
9165 if (c->fullent)
9166 dlen = sizeof(*de);
9167 else
9168 dlen = strlen(de->d_name) + 1;
9169
9170 if (c->pos + dlen > c->buflen)
9171 return -1; // doesn't fit
9172
9173 if (c->fullent) {
9174 memcpy(c->buf + c->pos, de, sizeof(*de));
9175 } else {
9176 memcpy(c->buf + c->pos, de->d_name, dlen);
9177 }
9178 c->pos += dlen;
9179 return 0;
9180 }
9181
9182 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9183 {
9184 getdents_result gr;
9185 gr.buf = buf;
9186 gr.buflen = buflen;
9187 gr.fullent = fullent;
9188 gr.pos = 0;
9189
9190 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9191
9192 if (r < 0) { // some error
9193 if (r == -1) { // buffer ran out of space
9194 if (gr.pos) { // but we got some entries already!
9195 return gr.pos;
9196 } // or we need a larger buffer
9197 return -CEPHFS_ERANGE;
9198 } else { // actual error, return it
9199 return r;
9200 }
9201 }
9202 return gr.pos;
9203 }
9204
9205
9206 /* getdir */
9207 struct getdir_result {
9208 list<string> *contents;
9209 int num;
9210 };
9211
9212 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9213 {
9214 getdir_result *r = static_cast<getdir_result *>(p);
9215
9216 r->contents->push_back(de->d_name);
9217 r->num++;
9218 return 0;
9219 }
9220
9221 int Client::getdir(const char *relpath, list<string>& contents,
9222 const UserPerm& perms)
9223 {
9224 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9225 tout(cct) << "getdir" << std::endl;
9226 tout(cct) << relpath << std::endl;
9227
9228 dir_result_t *d;
9229 int r = opendir(relpath, &d, perms);
9230 if (r < 0)
9231 return r;
9232
9233 getdir_result gr;
9234 gr.contents = &contents;
9235 gr.num = 0;
9236 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9237
9238 closedir(d);
9239
9240 if (r < 0)
9241 return r;
9242 return gr.num;
9243 }
9244
9245
9246 /****** file i/o **********/
9247
9248 // common parts for open and openat. call with client_lock locked.
9249 int Client::create_and_open(int dirfd, const char *relpath, int flags,
9250 const UserPerm& perms, mode_t mode, int stripe_unit,
9251 int stripe_count, int object_size, const char *data_pool,
9252 std::string alternate_name) {
9253 ceph_assert(ceph_mutex_is_locked(client_lock));
9254 int cflags = ceph_flags_sys2wire(flags);
9255 tout(cct) << cflags << std::endl;
9256
9257 Fh *fh = NULL;
9258
9259 #if defined(__linux__) && defined(O_PATH)
9260 /* When O_PATH is specified, flags other than O_DIRECTORY
9261 * and O_NOFOLLOW are ignored. Refer to the do_entry_open() function
9262 * in the kernel (fs/open.c). */
9263 if (flags & O_PATH)
9264 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9265 #endif
9266
9267 filepath path(relpath);
9268 InodeRef in;
9269 bool created = false;
9270 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9271 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9272 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9273
9274 InodeRef dirinode = nullptr;
9275 int r = get_fd_inode(dirfd, &dirinode);
9276 if (r < 0) {
9277 return r;
9278 }
9279
9280 r = path_walk(path, &in, perms, followsym, mask, dirinode);
9281 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9282 return -CEPHFS_EEXIST;
9283
9284 #if defined(__linux__) && defined(O_PATH)
9285 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9286 #else
9287 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9288 #endif
9289 return -CEPHFS_ELOOP;
9290
9291 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9292 filepath dirpath = path;
9293 string dname = dirpath.last_dentry();
9294 dirpath.pop_dentry();
9295 InodeRef dir;
9296 r = path_walk(dirpath, &dir, perms, true,
9297 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9298 if (r < 0) {
9299 goto out;
9300 }
9301 if (cct->_conf->client_permissions) {
9302 r = may_create(dir.get(), perms);
9303 if (r < 0)
9304 goto out;
9305 }
9306 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9307 stripe_count, object_size, data_pool, &created, perms,
9308 std::move(alternate_name));
9309 }
9310 if (r < 0)
9311 goto out;
9312
9313 if (!created) {
9314 // POSIX says we can only check permissions of existing files
9315 if (cct->_conf->client_permissions) {
9316 r = may_open(in.get(), flags, perms);
9317 if (r < 0)
9318 goto out;
9319 }
9320 }
9321
9322 if (!fh)
9323 r = _open(in.get(), flags, mode, &fh, perms);
9324 if (r >= 0) {
9325 // allocate an integer file descriptor
9326 ceph_assert(fh);
9327 r = get_fd();
9328 ceph_assert(fd_map.count(r) == 0);
9329 fd_map[r] = fh;
9330 }
9331
9332 out:
9333 return r;
9334 }
9335
9336 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9337 mode_t mode, int stripe_unit, int stripe_count,
9338 int object_size, const char *data_pool, std::string alternate_name)
9339 {
9340 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9341 stripe_count, object_size, data_pool, alternate_name);
9342 }
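// Editorial usage sketch (hypothetical caller; assumes the declaration
// provides defaults for the trailing layout arguments):
//
//   int fd = client->open("/some/file", O_RDWR | O_CREAT, perms, 0644);
//   if (fd >= 0) {
//     // ... I/O via read()/write() ...
//     client->close(fd);
//   }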
9343
9344 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9345 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9346 const char *data_pool, std::string alternate_name) {
9347 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9348 if (!mref_reader.is_state_satisfied()) {
9349 return -CEPHFS_ENOTCONN;
9350 }
9351
9352 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9353 tout(cct) << dirfd << std::endl;
9354 tout(cct) << relpath << std::endl;
9355 tout(cct) << flags << std::endl;
9356 tout(cct) << mode << std::endl;
9357
9358 std::scoped_lock locker(client_lock);
9359 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9360 object_size, data_pool, alternate_name);
9361
9362 tout(cct) << r << std::endl;
9363 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9364 return r;
9365 }
9366
9367 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9368 const UserPerm& perms)
9369 {
9370 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9371
9372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9373 if (!mref_reader.is_state_satisfied())
9374 return -CEPHFS_ENOTCONN;
9375
9376 std::scoped_lock lock(client_lock);
9377 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9378 filepath path(ino);
9379 req->set_filepath(path);
9380
9381 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9382 char f[30];
9383 sprintf(f, "%u", h);
9384 filepath path2(dirino);
9385 path2.push_dentry(string(f));
9386 req->set_filepath2(path2);
9387
9388 int r = make_request(req, perms, NULL, NULL,
9389 rand() % mdsmap->get_num_in_mds());
9390 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9391 return r;
9392 }
9393
9394
9395 /**
9396 * Load inode into local cache.
9397 *
9398 * If the inode pointer is non-NULL, also take a reference on
9399 * the resulting Inode object in the same operation, so that the caller
9400 * can safely assume the inode will still be there after return.
9401 */
9402 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9403 {
9404 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9405
9406 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9407 if (!mref_reader.is_state_satisfied())
9408 return -CEPHFS_ENOTCONN;
9409
9410 if (is_reserved_vino(vino))
9411 return -CEPHFS_ESTALE;
9412
9413 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9414 filepath path(vino.ino);
9415 req->set_filepath(path);
9416
9417 /*
9418 * The MDS expects either a "real" snapid here or 0. The special value
9419 * carveouts for the snapid are all at the end of the range so we can
9420 * just look for any snapid below this value.
9421 */
9422 if (vino.snapid < CEPH_NOSNAP)
9423 req->head.args.lookupino.snapid = vino.snapid;
9424
9425 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9426 if (r == 0 && inode != NULL) {
9427 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9428 ceph_assert(p != inode_map.end());
9429 *inode = p->second;
9430 _ll_get(*inode);
9431 }
9432 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9433 return r;
9434 }
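// Editorial note: on success with a non-NULL `inode`, _lookup_vino() has
// taken a reference via _ll_get(); the caller is expected to drop it with
// the matching ll-level put (e.g. ll_put()) once done with the Inode*.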
9435
9436 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9437 {
9438 vinodeno_t vino(ino, CEPH_NOSNAP);
9439 std::scoped_lock lock(client_lock);
9440 return _lookup_vino(vino, perms, inode);
9441 }
9442
9443 /**
9444 * Find the parent inode of `ino` and insert it into
9445 * our cache. Conditionally also set `parent` to a referenced
9446 * Inode* if the caller provides a non-NULL value.
9447 */
9448 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9449 {
9450 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9451
9452 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9453 filepath path(ino->ino);
9454 req->set_filepath(path);
9455
9456 InodeRef target;
9457 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9458 // Give caller a reference to the parent ino if they provided a pointer.
9459 if (parent != NULL) {
9460 if (r == 0) {
9461 *parent = target.get();
9462 _ll_get(*parent);
9463 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9464 } else {
9465 *parent = NULL;
9466 }
9467 }
9468 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9469 return r;
9470 }
9471
9472 /**
9473 * Populate the parent dentry for `ino`, provided it is
9474 * a child of `parent`.
9475 */
9476 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9477 {
9478 ceph_assert(parent->is_dir());
9479 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9480
9481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9482 if (!mref_reader.is_state_satisfied())
9483 return -CEPHFS_ENOTCONN;
9484
9485 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9486 req->set_filepath2(filepath(parent->ino));
9487 req->set_filepath(filepath(ino->ino));
9488 req->set_inode(ino);
9489
9490 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9491 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9492 return r;
9493 }
9494
9495 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9496 {
9497 std::scoped_lock lock(client_lock);
9498 return _lookup_name(ino, parent, perms);
9499 }
9500
9501 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9502 {
9503 ceph_assert(in);
9504 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9505
9506 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9507
9508 if (in->snapid != CEPH_NOSNAP) {
9509 in->snap_cap_refs++;
9510 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9511 << ccap_string(in->caps_issued()) << dendl;
9512 }
9513
9514 const auto& conf = cct->_conf;
9515 f->readahead.set_trigger_requests(1);
9516 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9517 uint64_t max_readahead = Readahead::NO_LIMIT;
9518 if (conf->client_readahead_max_bytes) {
9519 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9520 }
9521 if (conf->client_readahead_max_periods) {
9522 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9523 }
9524 f->readahead.set_max_readahead_size(max_readahead);
9525 vector<uint64_t> alignments;
9526 alignments.push_back(in->layout.get_period());
9527 alignments.push_back(in->layout.stripe_unit);
9528 f->readahead.set_alignments(alignments);
9529
9530 return f;
9531 }
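// Worked example (editorial): with client_readahead_max_bytes = 8 MiB,
// client_readahead_max_periods = 4 and a 4 MiB layout period, the clamps
// above yield max_readahead = min(NO_LIMIT, 8 MiB, 4 * 4 MiB) = 8 MiB;
// the alignments then bias readahead toward whole periods / stripe units.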
9532
9533 int Client::_release_fh(Fh *f)
9534 {
9535 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9536 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9537 Inode *in = f->inode.get();
9538 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9539
9540 in->unset_deleg(f);
9541
9542 if (in->snapid == CEPH_NOSNAP) {
9543 if (in->put_open_ref(f->mode)) {
9544 _flush(in, new C_Client_FlushComplete(this, in));
9545 check_caps(in, 0);
9546 }
9547 } else {
9548 ceph_assert(in->snap_cap_refs > 0);
9549 in->snap_cap_refs--;
9550 }
9551
9552 _release_filelocks(f);
9553
9554 // Finally, read any async err (i.e. from flushes)
9555 int err = f->take_async_err();
9556 if (err != 0) {
9557 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9558 << cpp_strerror(err) << dendl;
9559 } else {
9560 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9561 }
9562
9563 _put_fh(f);
9564
9565 return err;
9566 }
9567
9568 void Client::_put_fh(Fh *f)
9569 {
9570 int left = f->put();
9571 if (!left) {
9572 delete f;
9573 }
9574 }
9575
9576 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9577 const UserPerm& perms)
9578 {
9579 if (in->snapid != CEPH_NOSNAP &&
9580 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9581 return -CEPHFS_EROFS;
9582 }
9583
9584 // use normalized flags to generate cmode
9585 int cflags = ceph_flags_sys2wire(flags);
9586 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9587 cflags |= CEPH_O_LAZY;
9588
9589 int cmode = ceph_flags_to_mode(cflags);
9590 int want = ceph_caps_for_mode(cmode);
9591 int result = 0;
9592
9593 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9594
9595 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9596 // update wanted?
9597 check_caps(in, CHECK_CAPS_NODELAY);
9598 } else {
9599
9600 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9601 filepath path;
9602 in->make_nosnap_relative_path(path);
9603 req->set_filepath(path);
9604 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9605 req->head.args.open.mode = mode;
9606 req->head.args.open.pool = -1;
9607 if (cct->_conf->client_debug_getattr_caps)
9608 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9609 else
9610 req->head.args.open.mask = 0;
9611 req->head.args.open.old_size = in->size; // for O_TRUNC
9612 req->set_inode(in);
9613 result = make_request(req, perms);
9614
9615 /*
9616 * NFS expects that delegations will be broken on a conflicting open,
9617 * not just when there is actual conflicting access to the file. SMB leases
9618 * and oplocks also have similar semantics.
9619 *
9620 * Ensure that clients that have delegations enabled will wait on minimal
9621 * caps during open, just to ensure that other clients holding delegations
9622 * return theirs first.
9623 */
9624 if (deleg_timeout && result == 0) {
9625 int need = 0, have;
9626
9627 if (cmode & CEPH_FILE_MODE_WR)
9628 need |= CEPH_CAP_FILE_WR;
9629 if (cmode & CEPH_FILE_MODE_RD)
9630 need |= CEPH_CAP_FILE_RD;
9631
9632 Fh fh(in, flags, cmode, fd_gen, perms);
9633 result = get_caps(&fh, need, want, &have, -1);
9634 if (result < 0) {
9635 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9636 " . Denying open: " <<
9637 cpp_strerror(result) << dendl;
9638 } else {
9639 put_cap_ref(in, need);
9640 }
9641 }
9642 }
9643
9644 // success?
9645 if (result >= 0) {
9646 if (fhp)
9647 *fhp = _create_fh(in, flags, cmode, perms);
9648 } else {
9649 in->put_open_ref(cmode);
9650 }
9651
9652 trim_cache();
9653
9654 return result;
9655 }
9656
9657 int Client::_renew_caps(Inode *in)
9658 {
9659 int wanted = in->caps_file_wanted();
9660 if (in->is_any_caps() &&
9661 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9662 check_caps(in, CHECK_CAPS_NODELAY);
9663 return 0;
9664 }
9665
9666 int flags = 0;
9667 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9668 flags = O_RDWR;
9669 else if (wanted & CEPH_CAP_FILE_RD)
9670 flags = O_RDONLY;
9671 else if (wanted & CEPH_CAP_FILE_WR)
9672 flags = O_WRONLY;
9673
9674 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9675 filepath path;
9676 in->make_nosnap_relative_path(path);
9677 req->set_filepath(path);
9678 req->head.args.open.flags = flags;
9679 req->head.args.open.pool = -1;
9680 if (cct->_conf->client_debug_getattr_caps)
9681 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9682 else
9683 req->head.args.open.mask = 0;
9684 req->set_inode(in);
9685
9686 // duplicate in case Cap goes away; not sure if that race is a concern?
9687 const UserPerm *pperm = in->get_best_perms();
9688 UserPerm perms;
9689 if (pperm != NULL)
9690 perms = *pperm;
9691 int ret = make_request(req, perms);
9692 return ret;
9693 }
9694
9695 int Client::_close(int fd)
9696 {
9697 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9698 tout(cct) << "close" << std::endl;
9699 tout(cct) << fd << std::endl;
9700
9701 Fh *fh = get_filehandle(fd);
9702 if (!fh)
9703 return -CEPHFS_EBADF;
9704 int err = _release_fh(fh);
9705 fd_map.erase(fd);
9706 put_fd(fd);
9707 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9708 return err;
9709 }
9710
9711 int Client::close(int fd) {
9712 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9713 if (!mref_reader.is_state_satisfied())
9714 return -CEPHFS_ENOTCONN;
9715
9716 std::scoped_lock lock(client_lock);
9717 return _close(fd);
9718 }
9719
9720 // ------------
9721 // read, write
9722
9723 loff_t Client::lseek(int fd, loff_t offset, int whence)
9724 {
9725 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9726 if (!mref_reader.is_state_satisfied())
9727 return -CEPHFS_ENOTCONN;
9728
9729 tout(cct) << "lseek" << std::endl;
9730 tout(cct) << fd << std::endl;
9731 tout(cct) << offset << std::endl;
9732 tout(cct) << whence << std::endl;
9733
9734 std::scoped_lock lock(client_lock);
9735 Fh *f = get_filehandle(fd);
9736 if (!f)
9737 return -CEPHFS_EBADF;
9738 #if defined(__linux__) && defined(O_PATH)
9739 if (f->flags & O_PATH)
9740 return -CEPHFS_EBADF;
9741 #endif
9742 return _lseek(f, offset, whence);
9743 }
9744
9745 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9746 {
9747 Inode *in = f->inode.get();
9748 bool whence_check = false;
9749 loff_t pos = -1;
9750
9751 switch (whence) {
9752 case SEEK_END:
9753 whence_check = true;
9754 break;
9755
9756 #ifdef SEEK_DATA
9757 case SEEK_DATA:
9758 whence_check = true;
9759 break;
9760 #endif
9761
9762 #ifdef SEEK_HOLE
9763 case SEEK_HOLE:
9764 whence_check = true;
9765 break;
9766 #endif
9767 }
9768
9769 if (whence_check) {
9770 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9771 if (r < 0)
9772 return r;
9773 }
9774
9775 switch (whence) {
9776 case SEEK_SET:
9777 pos = offset;
9778 break;
9779
9780 case SEEK_CUR:
9781 pos = f->pos + offset;
9782 break;
9783
9784 case SEEK_END:
9785 pos = in->size + offset;
9786 break;
9787
9788 #ifdef SEEK_DATA
9789 case SEEK_DATA:
9790 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9791 return -CEPHFS_ENXIO;
9792 pos = offset;
9793 break;
9794 #endif
9795
9796 #ifdef SEEK_HOLE
9797 case SEEK_HOLE:
9798 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9799 return -CEPHFS_ENXIO;
9800 pos = in->size;
9801 break;
9802 #endif
9803
9804 default:
9805 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9806 return -CEPHFS_EINVAL;
9807 }
9808
9809 if (pos < 0) {
9810 return -CEPHFS_EINVAL;
9811 } else {
9812 f->pos = pos;
9813 }
9814
9815 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9816 return f->pos;
9817 }
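// Editorial note: this implementation treats the file as one contiguous
// data region: SEEK_DATA returns the requested offset itself and SEEK_HOLE
// returns the current EOF. E.g. on a 100-byte file, _lseek(f, 10, SEEK_DATA)
// yields 10 and _lseek(f, 10, SEEK_HOLE) yields 100, while offsets at or
// beyond EOF yield -CEPHFS_ENXIO for both.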
9818
9819
9820 void Client::lock_fh_pos(Fh *f)
9821 {
9822 ldout(cct, 10) << __func__ << " " << f << dendl;
9823
9824 if (f->pos_locked || !f->pos_waiters.empty()) {
9825 ceph::condition_variable cond;
9826 f->pos_waiters.push_back(&cond);
9827 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9828 std::unique_lock l{client_lock, std::adopt_lock};
9829 cond.wait(l, [f, me=&cond] {
9830 return !f->pos_locked && f->pos_waiters.front() == me;
9831 });
9832 l.release();
9833 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9834 ceph_assert(f->pos_waiters.front() == &cond);
9835 f->pos_waiters.pop_front();
9836 }
9837
9838 f->pos_locked = true;
9839 }
9840
9841 void Client::unlock_fh_pos(Fh *f)
9842 {
9843 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9844
9845 ldout(cct, 10) << __func__ << " " << f << dendl;
9846 f->pos_locked = false;
9847 if (!f->pos_waiters.empty()) {
9848 // only wake up the oldest waiter
9849 auto cond = f->pos_waiters.front();
9850 cond->notify_one();
9851 }
9852 }
9853
9854 int Client::uninline_data(Inode *in, Context *onfinish)
9855 {
9856 if (!in->inline_data.length()) {
9857 onfinish->complete(0);
9858 return 0;
9859 }
9860
9861 char oid_buf[32];
9862 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9863 object_t oid = oid_buf;
9864
9865 ObjectOperation create_ops;
9866 create_ops.create(false);
9867
9868 objecter->mutate(oid,
9869 OSDMap::file_to_object_locator(in->layout),
9870 create_ops,
9871 in->snaprealm->get_snap_context(),
9872 ceph::real_clock::now(),
9873 0,
9874 NULL);
9875
9876 bufferlist inline_version_bl;
9877 encode(in->inline_version, inline_version_bl);
9878
9879 ObjectOperation uninline_ops;
9880 uninline_ops.cmpxattr("inline_version",
9881 CEPH_OSD_CMPXATTR_OP_GT,
9882 CEPH_OSD_CMPXATTR_MODE_U64,
9883 inline_version_bl);
9884 bufferlist inline_data = in->inline_data;
9885 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9886 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9887
9888 objecter->mutate(oid,
9889 OSDMap::file_to_object_locator(in->layout),
9890 uninline_ops,
9891 in->snaprealm->get_snap_context(),
9892 ceph::real_clock::now(),
9893 0,
9894 onfinish);
9895
9896 return 0;
9897 }
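// Editorial note: the second mutate above is made conditional on the
// object's "inline_version" xattr via cmpxattr(..., OP_GT, MODE_U64, ...),
// so a stale or racing uninline attempt fails the version guard rather
// than overwriting data already written at a newer inline_version; the
// trailing setxattr then records the version that was uninlined.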
9898
9899 //
9900
9901 // blocking osd interface
9902
9903 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9904 {
9905 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9906 if (!mref_reader.is_state_satisfied())
9907 return -CEPHFS_ENOTCONN;
9908
9909 tout(cct) << "read" << std::endl;
9910 tout(cct) << fd << std::endl;
9911 tout(cct) << size << std::endl;
9912 tout(cct) << offset << std::endl;
9913
9914 std::unique_lock lock(client_lock);
9915 Fh *f = get_filehandle(fd);
9916 if (!f)
9917 return -CEPHFS_EBADF;
9918 #if defined(__linux__) && defined(O_PATH)
9919 if (f->flags & O_PATH)
9920 return -CEPHFS_EBADF;
9921 #endif
9922 bufferlist bl;
9923 /* We can't return a byte count larger than INT_MAX, so clamp size to that */
9924 size = std::min(size, (loff_t)INT_MAX);
9925 int r = _read(f, offset, size, &bl);
9926 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9927 if (r >= 0) {
9928 lock.unlock();
9929 bl.begin().copy(bl.length(), buf);
9930 r = bl.length();
9931 }
9932 return r;
9933 }
9934
9935 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9936 {
9937 if (iovcnt < 0)
9938 return -CEPHFS_EINVAL;
9939 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9940 }
9941
9942 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9943 {
9944 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9945
9946 int want, have = 0;
9947 bool movepos = false;
9948 std::unique_ptr<C_SaferCond> onuninline;
9949 int64_t rc = 0;
9950 const auto& conf = cct->_conf;
9951 Inode *in = f->inode.get();
9952 utime_t lat;
9953 utime_t start = ceph_clock_now();
9954
9955 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9956 return -CEPHFS_EBADF;
9957 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9958
9959 if (offset < 0) {
9960 lock_fh_pos(f);
9961 offset = f->pos;
9962 movepos = true;
9963 }
9964 loff_t start_pos = offset;
9965
9966 if (in->inline_version == 0) {
9967 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9968 if (r < 0) {
9969 rc = r;
9970 goto done;
9971 }
9972 ceph_assert(in->inline_version > 0);
9973 }
9974
9975 retry:
9976 if (f->mode & CEPH_FILE_MODE_LAZY)
9977 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9978 else
9979 want = CEPH_CAP_FILE_CACHE;
9980 {
9981 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
9982 if (r < 0) {
9983 rc = r;
9984 goto done;
9985 }
9986 }
9987 if (f->flags & O_DIRECT)
9988 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9989
9990 if (in->inline_version < CEPH_INLINE_NONE) {
9991 if (!(have & CEPH_CAP_FILE_CACHE)) {
9992 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9993 uninline_data(in, onuninline.get());
9994 } else {
9995 uint32_t len = in->inline_data.length();
9996 uint64_t endoff = offset + size;
9997 if (endoff > in->size)
9998 endoff = in->size;
9999
10000 if (offset < len) {
10001 if (endoff <= len) {
10002 bl->substr_of(in->inline_data, offset, endoff - offset);
10003 } else {
10004 bl->substr_of(in->inline_data, offset, len - offset);
10005 bl->append_zero(endoff - len);
10006 }
10007 rc = endoff - offset;
10008 } else if ((uint64_t)offset < endoff) {
10009 bl->append_zero(endoff - offset);
10010 rc = endoff - offset;
10011 } else {
10012 rc = 0;
10013 }
10014 goto success;
10015 }
10016 }
10017
10018 if (!conf->client_debug_force_sync_read &&
10019 conf->client_oc &&
10020 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
10021
10022 if (f->flags & O_RSYNC) {
10023 _flush_range(in, offset, size);
10024 }
10025 rc = _read_async(f, offset, size, bl);
10026 if (rc < 0)
10027 goto done;
10028 } else {
10029 if (f->flags & O_DIRECT)
10030 _flush_range(in, offset, size);
10031
10032 bool checkeof = false;
10033 rc = _read_sync(f, offset, size, bl, &checkeof);
10034 if (rc < 0)
10035 goto done;
10036 if (checkeof) {
10037 offset += rc;
10038 size -= rc;
10039
10040 put_cap_ref(in, CEPH_CAP_FILE_RD);
10041 have = 0;
10042 // reverify size
10043 {
10044 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10045 if (r < 0) {
10046 rc = r;
10047 goto done;
10048 }
10049 }
10050
10051 // eof? short read.
10052 if ((uint64_t)offset < in->size)
10053 goto retry;
10054 }
10055 }
10056
10057 success:
10058 ceph_assert(rc >= 0);
10059 update_read_io_size(bl->length());
10060 if (movepos) {
10061 // adjust fd pos
10062 f->pos = start_pos + rc;
10063 }
10064
10065 lat = ceph_clock_now();
10066 lat -= start;
10067 logger->tinc(l_c_read, lat);
10068
10069 done:
10070 // done!
10071
10072 if (onuninline) {
10073 client_lock.unlock();
10074 int ret = onuninline->wait();
10075 client_lock.lock();
10076 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
10077 in->inline_data.clear();
10078 in->inline_version = CEPH_INLINE_NONE;
10079 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10080 check_caps(in, 0);
10081 } else
10082 rc = ret;
10083 }
10084 if (have) {
10085 put_cap_ref(in, CEPH_CAP_FILE_RD);
10086 }
10087 if (movepos) {
10088 unlock_fh_pos(f);
10089 }
10090 return rc;
10091 }
10092
10093 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
10094 client(c), f(f) {
10095 f->get();
10096 f->readahead.inc_pending();
10097 }
10098
10099 Client::C_Readahead::~C_Readahead() {
10100 f->readahead.dec_pending();
10101 client->_put_fh(f);
10102 }
10103
10104 void Client::C_Readahead::finish(int r) {
10105 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
10106 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10107 if (r > 0) {
10108 client->update_read_io_size(r);
10109 }
10110 }
10111
10112 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
10113 {
10114 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10115
10116 const auto& conf = cct->_conf;
10117 Inode *in = f->inode.get();
10118
10119 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10120
10121 // trim read based on file size?
10122 if (off >= in->size)
10123 return 0;
10124 if (len == 0)
10125 return 0;
10126 if (off + len > in->size) {
10127 len = in->size - off;
10128 }
10129
10130 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
10131 << " max_bytes=" << f->readahead.get_max_readahead_size()
10132 << " max_periods=" << conf->client_readahead_max_periods << dendl;
10133
10134 // read (and possibly block)
10135 int r = 0;
10136 C_SaferCond onfinish("Client::_read_async flock");
10137 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10138 off, len, bl, 0, &onfinish);
10139 if (r == 0) {
10140 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
10141 client_lock.unlock();
10142 r = onfinish.wait();
10143 client_lock.lock();
10144 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
10145 update_read_io_size(bl->length());
10146 }
10147
10148 if(f->readahead.get_min_readahead_size() > 0) {
10149 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
10150 if (readahead_extent.second > 0) {
10151 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
10152 << " (caller wants " << off << "~" << len << ")" << dendl;
10153 Context *onfinish2 = new C_Readahead(this, f);
10154 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10155 readahead_extent.first, readahead_extent.second,
10156 NULL, 0, onfinish2);
10157 if (r2 == 0) {
10158 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
10159 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10160 } else {
10161 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
10162 delete onfinish2;
10163 }
10164 }
10165 }
10166
10167 return r;
10168 }
10169
10170 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
10171 bool *checkeof)
10172 {
10173 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10174
10175 Inode *in = f->inode.get();
10176 uint64_t pos = off;
10177 int left = len;
10178 int read = 0;
10179
10180 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10181
10182 // Returns 0 on success (done), 1 to continue, and < 0 on error.
10183 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
10184 int r = onfinish.wait();
10185
10186 // if we get ENOENT from OSD, assume 0 bytes returned
10187 if (r == -CEPHFS_ENOENT)
10188 r = 0;
10189 if (r < 0)
10190 return r;
10191
10192 if (tbl.length()) {
10193 r = tbl.length();
10194
10195 read += r;
10196 pos += r;
10197 left -= r;
10198 bl->claim_append(tbl);
10199 }
10200 // short read?
10201 if (r >= 0 && r < wanted) {
10202 if (pos < in->size) {
10203 // zero up to known EOF
10204 int64_t some = in->size - pos;
10205 if (some > left)
10206 some = left;
10207 auto z = buffer::ptr_node::create(some);
10208 z->zero();
10209 bl->push_back(std::move(z));
10210 read += some;
10211 pos += some;
10212 left -= some;
10213 if (left == 0)
10214 return 0;
10215 }
10216
10217 *checkeof = true;
10218 return 0;
10219 }
10220 return 1;
10221 };
10222
10223 while (left > 0) {
10224 C_SaferCond onfinish("Client::_read_sync flock");
10225 bufferlist tbl;
10226
10227 int wanted = left;
10228 filer->read_trunc(in->ino, &in->layout, in->snapid,
10229 pos, left, &tbl, 0,
10230 in->truncate_size, in->truncate_seq,
10231 &onfinish);
10232 client_lock.unlock();
10233 int r = wait_and_copy(onfinish, tbl, wanted);
10234 client_lock.lock();
10235 if (!r)
10236 return read;
10237 if (r < 0)
10238 return r;
10239 }
10240 return read;
10241 }
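// Worked example (editorial): with in->size = 100, a request of 50 bytes
// at pos = 60 that gets only 20 bytes back from the OSD leaves pos = 80,
// left = 30; wait_and_copy() then zero-fills up to the known EOF
// (100 - 80 = 20 bytes), and since left is still 10 it sets *checkeof so
// the caller re-verifies the file size before settling for a short read.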
10242
10243 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10244 {
10245 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10246 if (!mref_reader.is_state_satisfied())
10247 return -CEPHFS_ENOTCONN;
10248
10249 tout(cct) << "write" << std::endl;
10250 tout(cct) << fd << std::endl;
10251 tout(cct) << size << std::endl;
10252 tout(cct) << offset << std::endl;
10253
10254 std::scoped_lock lock(client_lock);
10255 Fh *fh = get_filehandle(fd);
10256 if (!fh)
10257 return -CEPHFS_EBADF;
10258 #if defined(__linux__) && defined(O_PATH)
10259 if (fh->flags & O_PATH)
10260 return -CEPHFS_EBADF;
10261 #endif
10262 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10263 size = std::min(size, (loff_t)INT_MAX);
10264 int r = _write(fh, offset, size, buf, NULL, false);
10265 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10266 return r;
10267 }
10268
10269 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10270 {
10271 if (iovcnt < 0)
10272 return -CEPHFS_EINVAL;
10273 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10274 }
10275
10276 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10277 unsigned iovcnt, int64_t offset,
10278 bool write, bool clamp_to_int)
10279 {
10280 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10281
10282 #if defined(__linux__) && defined(O_PATH)
10283 if (fh->flags & O_PATH)
10284 return -CEPHFS_EBADF;
10285 #endif
10286 loff_t totallen = 0;
10287 for (unsigned i = 0; i < iovcnt; i++) {
10288 totallen += iov[i].iov_len;
10289 }
10290
10291 /*
10292 * Some of the API functions take 64-bit size values, but only return
10293 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10294 * we don't do I/Os larger than the values we can return.
10295 */
10296 if (clamp_to_int) {
10297 totallen = std::min(totallen, (loff_t)INT_MAX);
10298 }
10299 if (write) {
10300 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10301 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10302 return w;
10303 } else {
10304 bufferlist bl;
10305 int64_t r = _read(fh, offset, totallen, &bl);
10306 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10307 if (r <= 0)
10308 return r;
10309
10310 client_lock.unlock();
10311 auto iter = bl.cbegin();
10312 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10313 /*
10314 * This handles the case where the bufferlist does not have
10315 * enough data to fill every iov entry completely
10316 */
10317 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10318 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10319 resid -= round_size;
10320 /* iter is self-updating */
10321 }
10322 client_lock.lock();
10323 return r;
10324 }
10325 }
10326
10327 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10328 {
10329 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10330 if (!mref_reader.is_state_satisfied())
10331 return -CEPHFS_ENOTCONN;
10332
10333 tout(cct) << fd << std::endl;
10334 tout(cct) << offset << std::endl;
10335
10336 std::scoped_lock cl(client_lock);
10337 Fh *fh = get_filehandle(fd);
10338 if (!fh)
10339 return -CEPHFS_EBADF;
10340 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10341 }
10342
10343 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10344 const struct iovec *iov, int iovcnt)
10345 {
10346 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10347
10348 uint64_t fpos = 0;
10349
10350 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
10351 return -CEPHFS_EFBIG;
10352
10353 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10354 Inode *in = f->inode.get();
10355
10356 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10357 return -CEPHFS_ENOSPC;
10358 }
10359
10360 ceph_assert(in->snapid == CEPH_NOSNAP);
10361
10362 // was Fh opened as writeable?
10363 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10364 return -CEPHFS_EBADF;
10365
10366 // use/adjust fd pos?
10367 if (offset < 0) {
10368 lock_fh_pos(f);
10369 /*
10370 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10371 * change out from under us.
10372 */
10373 if (f->flags & O_APPEND) {
10374 auto r = _lseek(f, 0, SEEK_END);
10375 if (r < 0) {
10376 unlock_fh_pos(f);
10377 return r;
10378 }
10379 }
10380 offset = f->pos;
10381 fpos = offset+size;
10382 unlock_fh_pos(f);
10383 }
10384
10385 // check quota
10386 uint64_t endoff = offset + size;
10387 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10388 f->actor_perms)) {
10389 return -CEPHFS_EDQUOT;
10390 }
10391
10392 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10393
10394 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10395
10396 // time it.
10397 utime_t start = ceph_clock_now();
10398
10399 if (in->inline_version == 0) {
10400 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10401 if (r < 0)
10402 return r;
10403 ceph_assert(in->inline_version > 0);
10404 }
10405
10406 // copy into a fresh buffer (since our write may be resubmitted asynchronously)
10407 bufferlist bl;
10408 if (buf) {
10409 if (size > 0)
10410 bl.append(buf, size);
10411 } else if (iov){
10412 for (int i = 0; i < iovcnt; i++) {
10413 if (iov[i].iov_len > 0) {
10414 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10415 }
10416 }
10417 }
10418
10419 utime_t lat;
10420 uint64_t totalwritten;
10421 int want, have;
10422 if (f->mode & CEPH_FILE_MODE_LAZY)
10423 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10424 else
10425 want = CEPH_CAP_FILE_BUFFER;
10426 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10427 if (r < 0)
10428 return r;
10429
10430 /* clear the setuid/setgid bits, if any */
10431 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10432 struct ceph_statx stx = { 0 };
10433
10434 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10435 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10436 if (r < 0)
10437 return r;
10438 } else {
10439 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10440 }
10441
10442 if (f->flags & O_DIRECT)
10443 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10444
10445 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10446
10447 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10448
10449 if (in->inline_version < CEPH_INLINE_NONE) {
10450 if (endoff > cct->_conf->client_max_inline_size ||
10451 endoff > CEPH_INLINE_MAX_SIZE ||
10452 !(have & CEPH_CAP_FILE_BUFFER)) {
10453 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10454 uninline_data(in, onuninline.get());
10455 } else {
10456 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10457
10458 uint32_t len = in->inline_data.length();
10459
10460 if (endoff < len)
10461 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10462
10463 if (offset < len)
10464 in->inline_data.splice(offset, len - offset);
10465 else if (offset > len)
10466 in->inline_data.append_zero(offset - len);
10467
10468 in->inline_data.append(bl);
10469 in->inline_version++;
10470
10471 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10472
10473 goto success;
10474 }
10475 }
10476
10477 if (cct->_conf->client_oc &&
10478 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10479 // do buffered write
10480 if (!in->oset.dirty_or_tx)
10481 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10482
10483 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10484
10485 // async, caching, non-blocking.
10486 r = objectcacher->file_write(&in->oset, &in->layout,
10487 in->snaprealm->get_snap_context(),
10488 offset, size, bl, ceph::real_clock::now(),
10489 0);
10490 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10491
10492 if (r < 0)
10493 goto done;
10494
10495 // flush cached write if O_SYNC is set on file fh
10496 // O_DSYNC == O_SYNC on linux < 2.6.33
10497 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10498 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10499 _flush_range(in, offset, size);
10500 }
10501 } else {
10502 if (f->flags & O_DIRECT)
10503 _flush_range(in, offset, size);
10504
10505 // simple, non-atomic sync write
10506 C_SaferCond onfinish("Client::_write flock");
10507 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10508
10509 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10510 offset, size, bl, ceph::real_clock::now(), 0,
10511 in->truncate_size, in->truncate_seq,
10512 &onfinish);
10513 client_lock.unlock();
10514 r = onfinish.wait();
10515 client_lock.lock();
10516 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10517 if (r < 0)
10518 goto done;
10519 }
10520
10521 // if we get here, write was successful, update client metadata
10522 success:
10523 update_write_io_size(size);
10524 // time
10525 lat = ceph_clock_now();
10526 lat -= start;
10527 logger->tinc(l_c_wrlat, lat);
10528
10529 if (fpos) {
10530 lock_fh_pos(f);
10531 f->pos = fpos;
10532 unlock_fh_pos(f);
10533 }
10534 totalwritten = size;
10535 r = (int64_t)totalwritten;
10536
10537 // extend file?
10538 if (totalwritten + offset > in->size) {
10539 in->size = totalwritten + offset;
10540 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10541
10542 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10543 check_caps(in, CHECK_CAPS_NODELAY);
10544 } else if (is_max_size_approaching(in)) {
10545 check_caps(in, 0);
10546 }
10547
10548 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10549 } else {
10550 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10551 }
10552
10553 // mtime
10554 in->mtime = in->ctime = ceph_clock_now();
10555 in->change_attr++;
10556 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10557
10558 done:
10559
10560 if (nullptr != onuninline) {
10561 client_lock.unlock();
10562 int uninline_ret = onuninline->wait();
10563 client_lock.lock();
10564
10565 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10566 in->inline_data.clear();
10567 in->inline_version = CEPH_INLINE_NONE;
10568 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10569 check_caps(in, 0);
10570 } else
10571 r = uninline_ret;
10572 }
10573
10574 put_cap_ref(in, CEPH_CAP_FILE_WR);
10575 return r;
10576 }
10577
10578 int Client::_flush(Fh *f)
10579 {
10580 Inode *in = f->inode.get();
10581 int err = f->take_async_err();
10582 if (err != 0) {
10583 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10584 << cpp_strerror(err) << dendl;
10585 } else {
10586 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10587 }
10588
10589 return err;
10590 }
10591
10592 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10593 {
10594 struct ceph_statx stx;
10595 stx.stx_size = length;
10596 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10597 }
10598
10599 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10600 {
10601 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10602 if (!mref_reader.is_state_satisfied())
10603 return -CEPHFS_ENOTCONN;
10604
10605 tout(cct) << __func__ << std::endl;
10606 tout(cct) << fd << std::endl;
10607 tout(cct) << length << std::endl;
10608
10609 std::scoped_lock lock(client_lock);
10610 Fh *f = get_filehandle(fd);
10611 if (!f)
10612 return -CEPHFS_EBADF;
10613 #if defined(__linux__) && defined(O_PATH)
10614 if (f->flags & O_PATH)
10615 return -CEPHFS_EBADF;
10616 #endif
10617 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10618 return -CEPHFS_EBADF;
10619 struct stat attr;
10620 attr.st_size = length;
10621 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10622 }
10623
10624 int Client::fsync(int fd, bool syncdataonly)
10625 {
10626 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10627 if (!mref_reader.is_state_satisfied())
10628 return -CEPHFS_ENOTCONN;
10629
10630 tout(cct) << "fsync" << std::endl;
10631 tout(cct) << fd << std::endl;
10632 tout(cct) << syncdataonly << std::endl;
10633
10634 std::scoped_lock lock(client_lock);
10635 Fh *f = get_filehandle(fd);
10636 if (!f)
10637 return -CEPHFS_EBADF;
10638 #if defined(__linux__) && defined(O_PATH)
10639 if (f->flags & O_PATH)
10640 return -CEPHFS_EBADF;
10641 #endif
10642 int r = _fsync(f, syncdataonly);
10643 if (r == 0) {
10644 // The IOs in this fsync were okay, but maybe something happened
10645 // in the background that we should be reporting?
10646 r = f->take_async_err();
10647 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10648 << ") = 0, async_err = " << r << dendl;
10649 } else {
10650 // Assume that an error we encountered during fsync, even reported
10651 // synchronously, would also have applied the error to the Fh, and we
10652 // should clear it here to avoid returning the same error again on next
10653 // call.
10654 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10655 << r << dendl;
10656 f->take_async_err();
10657 }
10658 return r;
10659 }
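// Editorial usage sketch (hypothetical caller): errors from earlier
// buffered writebacks surface through fsync() even when this call's own
// I/O succeeded, so the return value should always be checked:
//
//   int r = client->fsync(fd, false /* data + metadata */);
//   if (r < 0) {
//     // writeback or commit failure; report it to the application
//   }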
10660
10661 int Client::_fsync(Inode *in, bool syncdataonly)
10662 {
10663 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10664
10665 int r = 0;
10666 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10667 ceph_tid_t flush_tid = 0;
10668 InodeRef tmp_ref;
10669 utime_t lat;
10670 utime_t start = ceph_clock_now();
10671
10672 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10673
10674 if (cct->_conf->client_oc) {
10675 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10676 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10677 _flush(in, object_cacher_completion.get());
10678 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10679 }
10680
10681 if (!syncdataonly && in->dirty_caps) {
10682 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10683 if (in->flushing_caps)
10684 flush_tid = last_flush_tid;
10685 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10686
10687 if (!syncdataonly && !in->unsafe_ops.empty()) {
10688 flush_mdlog_sync(in);
10689
10690 MetaRequest *req = in->unsafe_ops.back();
10691 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10692
10693 req->get();
10694 wait_on_list(req->waitfor_safe);
10695 put_request(req);
10696 }
10697
10698 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10699 client_lock.unlock();
10700 ldout(cct, 15) << "waiting on data to flush" << dendl;
10701 r = object_cacher_completion->wait();
10702 client_lock.lock();
10703 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10704 } else {
10705 // FIXME: this can starve
10706 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10707 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10708 << " uncommitted, waiting" << dendl;
10709 wait_on_list(in->waitfor_commit);
10710 }
10711 }
10712
10713 if (!r) {
10714 if (flush_tid > 0)
10715 wait_sync_caps(in, flush_tid);
10716
10717 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10718 } else {
10719 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10720 << cpp_strerror(-r) << dendl;
10721 }
10722
10723 lat = ceph_clock_now();
10724 lat -= start;
10725 logger->tinc(l_c_fsync, lat);
10726
10727 return r;
10728 }
10729
10730 int Client::_fsync(Fh *f, bool syncdataonly)
10731 {
10732 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10733 return _fsync(f->inode.get(), syncdataonly);
10734 }
10735
10736 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10737 {
10738 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10739 if (!mref_reader.is_state_satisfied())
10740 return -CEPHFS_ENOTCONN;
10741
10742 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10743 tout(cct) << fd << std::endl;
10744
10745 std::scoped_lock lock(client_lock);
10746 Fh *f = get_filehandle(fd);
10747 if (!f)
10748 return -CEPHFS_EBADF;
10749 int r = _getattr(f->inode, mask, perms);
10750 if (r < 0)
10751 return r;
10752 fill_stat(f->inode, stbuf, NULL);
10753 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10754 return r;
10755 }
10756
10757 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10758 unsigned int want, unsigned int flags)
10759 {
10760 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10761 if (!mref_reader.is_state_satisfied())
10762 return -CEPHFS_ENOTCONN;
10763
10764 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10765 tout(cct) << fd << std::endl;
10766
10767 std::scoped_lock lock(client_lock);
10768 Fh *f = get_filehandle(fd);
10769 if (!f)
10770 return -CEPHFS_EBADF;
10771
10772 unsigned mask = statx_to_mask(flags, want);
10773
10774 int r = 0;
10775 if (mask) {
10776 r = _getattr(f->inode, mask, perms);
10777 if (r < 0) {
10778 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10779 return r;
10780 }
10781 }
10782
10783 fill_statx(f->inode, mask, stx);
10784 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10785 return r;
10786 }
10787
10788 int Client::statxat(int dirfd, const char *relpath,
10789 struct ceph_statx *stx, const UserPerm& perms,
10790 unsigned int want, unsigned int flags) {
10791 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10792 if (!mref_reader.is_state_satisfied()) {
10793 return -CEPHFS_ENOTCONN;
10794 }
10795
10796 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10797 tout(cct) << dirfd << std::endl;
10798 tout(cct) << relpath << std::endl;
10799
10800 unsigned mask = statx_to_mask(flags, want);
10801
10802 InodeRef dirinode;
10803 std::scoped_lock lock(client_lock);
10804 int r = get_fd_inode(dirfd, &dirinode);
10805 if (r < 0) {
10806 return r;
10807 }
10808
10809 InodeRef in;
10810 filepath path(relpath);
10811 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10812 if (r < 0) {
10813 return r;
10814 }
10815 r = _getattr(in, mask, perms);
10816 if (r < 0) {
10817 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10818 return r;
10819 }
10820
10821 fill_statx(in, mask, stx);
10822 ldout(cct, 3) << __func__ << " dirfd " << dirfd << ", r = " << r << dendl;
10823 return r;
10824 }
10825
10826 // not written yet, but i want to link!
10827
10828 int Client::chdir(const char *relpath, std::string &new_cwd,
10829 const UserPerm& perms)
10830 {
10831 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10832 if (!mref_reader.is_state_satisfied())
10833 return -CEPHFS_ENOTCONN;
10834
10835 tout(cct) << "chdir" << std::endl;
10836 tout(cct) << relpath << std::endl;
10837
10838 filepath path(relpath);
10839 InodeRef in;
10840
10841 std::scoped_lock lock(client_lock);
10842 int r = path_walk(path, &in, perms);
10843 if (r < 0)
10844 return r;
10845
10846 if (!(in.get()->is_dir()))
10847 return -CEPHFS_ENOTDIR;
10848
10849 if (cwd != in)
10850 cwd.swap(in);
10851 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10852
10853 _getcwd(new_cwd, perms);
10854 return 0;
10855 }
10856
10857 void Client::_getcwd(string& dir, const UserPerm& perms)
10858 {
10859 filepath path;
10860 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10861
10862 Inode *in = cwd.get();
10863 while (in != root.get()) {
10864 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10865
10866 // The cwd or an ancestor is unlinked
10867 if (in->dentries.empty()) {
10868 return;
10869 }
10870
10871 Dentry *dn = in->get_first_parent();
10872
10873
10874 if (!dn) {
10875 // look it up
10876 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10877 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10878 filepath path(in->ino);
10879 req->set_filepath(path);
10880 req->set_inode(in);
10881 int res = make_request(req, perms);
10882 if (res < 0)
10883 break;
10884
10885 // start over
10886 path = filepath();
10887 in = cwd.get();
10888 continue;
10889 }
10890 path.push_front_dentry(dn->name);
10891 in = dn->dir->parent_inode;
10892 }
10893 dir = "/";
10894 dir += path.get_path();
10895 }
10896
10897 void Client::getcwd(string& dir, const UserPerm& perms)
10898 {
10899 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10900 if (!mref_reader.is_state_satisfied())
10901 return;
10902
10903 std::scoped_lock l(client_lock);
10904
10905 _getcwd(dir, perms);
10906 }
10907
10908 int Client::statfs(const char *path, struct statvfs *stbuf,
10909 const UserPerm& perms)
10910 {
10911 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10912 if (!mref_reader.is_state_satisfied())
10913 return -CEPHFS_ENOTCONN;
10914
10915 tout(cct) << __func__ << std::endl;
10916 unsigned long int total_files_on_fs;
10917
10918 ceph_statfs stats;
10919 C_SaferCond cond;
10920
10921 std::unique_lock lock(client_lock);
10922 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10923 if (data_pools.size() == 1) {
10924 objecter->get_fs_stats(stats, data_pools[0], &cond);
10925 } else {
10926 objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
10927 }
10928
10929 lock.unlock();
10930 int rval = cond.wait();
10931 lock.lock();
10932
10933 ceph_assert(root);
10934 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10935
10936 if (rval < 0) {
10937 ldout(cct, 1) << "underlying call to statfs returned error: "
10938 << cpp_strerror(rval)
10939 << dendl;
10940 return rval;
10941 }
10942
10943 memset(stbuf, 0, sizeof(*stbuf));
10944
10945 /*
10946 * we're going to set a block size of 4MB so we can represent larger
10947 * FSes without overflowing. Additionally convert the space
10948 * measurements from KB to bytes while making them in terms of
10949 * blocks. We use 4MB only because it is big enough, and because it
10950 * actually *is* the (ceph) default block size.
10951 */
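// Worked example (editorial): with CEPH_BLOCK_SHIFT = 22 the block size
// is 4 MiB, and stats.kb >> (CEPH_BLOCK_SHIFT - 10) converts KiB counts
// straight into 4 MiB blocks: 8388608 KiB (8 GiB) >> 12 = 2048 blocks.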
10952 const int CEPH_BLOCK_SHIFT = 22;
10953 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10954 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10955 stbuf->f_files = total_files_on_fs;
10956 stbuf->f_ffree = -1;
10957 stbuf->f_favail = -1;
10958 stbuf->f_fsid = -1; // ??
10959 stbuf->f_flag = 0; // ??
10960 stbuf->f_namemax = NAME_MAX;
10961
10962 // Usually quota_root will == root_ancestor, but if the mount root has no
10963 // quota but we can see a parent of it that does have a quota, we'll
10964 // respect that one instead.
10965 ceph_assert(root != nullptr);
10966 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
10967
10968 // get_quota_root should always give us something
10969 // because client quotas are always enabled
10970 ceph_assert(quota_root != nullptr);
10971
10972 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10973
10974 // Skip the getattr if any sessions are stale, as we don't want to
10975 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10976 // is unhealthy.
10977 if (!_any_stale_sessions()) {
10978 int r = _getattr(quota_root, 0, perms, true);
10979 if (r != 0) {
10980 // Ignore return value: error getting latest inode metadata is not a good
10981 // reason to break "df".
10982 lderr(cct) << "Error in getattr on quota root 0x"
10983 << std::hex << quota_root->ino << std::dec
10984 << " statfs result may be outdated" << dendl;
10985 }
10986 }
10987
10988 // Special case: if there is a size quota set on the Inode acting
10989 // as the root for this client mount, then report the quota status
10990 // as the filesystem statistics.
10991 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10992 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10993 // It is possible for a quota to be exceeded: arithmetic here must
10994 // handle case where used > total.
10995 const fsblkcnt_t free = total > used ? total - used : 0;
10996
10997 stbuf->f_blocks = total;
10998 stbuf->f_bfree = free;
10999 stbuf->f_bavail = free;
11000 } else {
11001 // General case: report the cluster statistics returned from RADOS. Because
11002 // multiple pools may be used within one filesystem namespace via
11003 // layouts, this is the most correct thing we can do.
11004 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
11005 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11006 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11007 }
11008
11009 return rval;
11010 }
11011
11012 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
11013 struct flock *fl, uint64_t owner, bool removing)
11014 {
11015 ldout(cct, 10) << __func__ << " ino " << in->ino
11016 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
11017 << " type " << fl->l_type << " owner " << owner
11018 << " " << fl->l_start << "~" << fl->l_len << dendl;
11019
11020 if (in->flags & I_ERROR_FILELOCK)
11021 return -CEPHFS_EIO;
11022
11023 int lock_cmd;
11024 if (F_RDLCK == fl->l_type)
11025 lock_cmd = CEPH_LOCK_SHARED;
11026 else if (F_WRLCK == fl->l_type)
11027 lock_cmd = CEPH_LOCK_EXCL;
11028 else if (F_UNLCK == fl->l_type)
11029 lock_cmd = CEPH_LOCK_UNLOCK;
11030 else
11031 return -CEPHFS_EIO;
11032
11033 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
11034 sleep = 0;
11035
11036 /*
11037 * Set the most significant bit, so that MDS knows the 'owner'
11038 * is sufficient to identify the owner of lock. (old code uses
11039 * both 'owner' and 'pid')
11040 */
11041 owner |= (1ULL << 63);
11042
11043 MetaRequest *req = new MetaRequest(op);
11044 filepath path;
11045 in->make_nosnap_relative_path(path);
11046 req->set_filepath(path);
11047 req->set_inode(in);
11048
11049 req->head.args.filelock_change.rule = lock_type;
11050 req->head.args.filelock_change.type = lock_cmd;
11051 req->head.args.filelock_change.owner = owner;
11052 req->head.args.filelock_change.pid = fl->l_pid;
11053 req->head.args.filelock_change.start = fl->l_start;
11054 req->head.args.filelock_change.length = fl->l_len;
11055 req->head.args.filelock_change.wait = sleep;
11056
11057 int ret;
11058 bufferlist bl;
11059
11060 if (sleep && switch_interrupt_cb) {
11061 // enable interrupt
11062 switch_interrupt_cb(callback_handle, req->get());
11063 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11064 // disable interrupt
11065 switch_interrupt_cb(callback_handle, NULL);
11066 if (ret == 0 && req->aborted()) {
11067 // effect of this lock request has been revoked by the 'lock intr' request
11068 ret = req->get_abort_code();
11069 }
11070 put_request(req);
11071 } else {
11072 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11073 }
11074
11075 if (ret == 0) {
11076 if (op == CEPH_MDS_OP_GETFILELOCK) {
11077 ceph_filelock filelock;
11078 auto p = bl.cbegin();
11079 decode(filelock, p);
11080
11081 if (CEPH_LOCK_SHARED == filelock.type)
11082 fl->l_type = F_RDLCK;
11083 else if (CEPH_LOCK_EXCL == filelock.type)
11084 fl->l_type = F_WRLCK;
11085 else
11086 fl->l_type = F_UNLCK;
11087
11088 fl->l_whence = SEEK_SET;
11089 fl->l_start = filelock.start;
11090 fl->l_len = filelock.length;
11091 fl->l_pid = filelock.pid;
11092 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
11093 ceph_lock_state_t *lock_state;
11094 if (lock_type == CEPH_LOCK_FCNTL) {
11095 if (!in->fcntl_locks)
11096 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11097 lock_state = in->fcntl_locks.get();
11098 } else if (lock_type == CEPH_LOCK_FLOCK) {
11099 if (!in->flock_locks)
11100 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11101 lock_state = in->flock_locks.get();
11102 } else {
11103 ceph_abort();
11104 return -CEPHFS_EINVAL;
11105 }
11106 _update_lock_state(fl, owner, lock_state);
11107
11108 if (!removing) {
11109 if (lock_type == CEPH_LOCK_FCNTL) {
11110 if (!fh->fcntl_locks)
11111 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11112 lock_state = fh->fcntl_locks.get();
11113 } else {
11114 if (!fh->flock_locks)
11115 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11116 lock_state = fh->flock_locks.get();
11117 }
11118 _update_lock_state(fl, owner, lock_state);
11119 }
11120 } else
11121 ceph_abort();
11122 }
11123 return ret;
11124 }
11125
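// _interrupt_filelock: abort a blocked SETFILELOCK request. If the request
// has already been sent to an MDS, issue a companion request with the
// matching *_INTR rule so the MDS abandons the pending lock attempt.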
11126 int Client::_interrupt_filelock(MetaRequest *req)
11127 {
11128 // Set abort code, but do not kick. The abort code prevents the request
11129 // from being re-sent.
11130 req->abort(-CEPHFS_EINTR);
11131 if (req->mds < 0)
11132 return 0; // haven't sent the request
11133
11134 Inode *in = req->inode();
11135
11136 int lock_type;
11137 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11138 lock_type = CEPH_LOCK_FLOCK_INTR;
11139 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11140 lock_type = CEPH_LOCK_FCNTL_INTR;
11141 else {
11142 ceph_abort();
11143 return -CEPHFS_EINVAL;
11144 }
11145
11146 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11147 filepath path;
11148 in->make_nosnap_relative_path(path);
11149 intr_req->set_filepath(path);
11150 intr_req->set_inode(in);
11151 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11152 intr_req->head.args.filelock_change.rule = lock_type;
11153 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11154
11155 UserPerm perms(req->get_uid(), req->get_gid());
11156 return make_request(intr_req, perms, NULL, NULL, -1);
11157 }
11158
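// _encode_filelocks: serialize the locks held on this inode for MDS
// reconnect. The wire format is a count followed by that many
// ceph_filelock records, first for fcntl locks and then for flock locks.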
11159 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11160 {
11161 if (!in->fcntl_locks && !in->flock_locks)
11162 return;
11163
11164 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11165 encode(nr_fcntl_locks, bl);
11166 if (nr_fcntl_locks) {
11167 auto &lock_state = in->fcntl_locks;
11168 for(auto p = lock_state->held_locks.begin();
11169 p != lock_state->held_locks.end();
11170 ++p)
11171 encode(p->second, bl);
11172 }
11173
11174 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11175 encode(nr_flock_locks, bl);
11176 if (nr_flock_locks) {
11177 auto &lock_state = in->flock_locks;
11178 for(auto p = lock_state->held_locks.begin();
11179 p != lock_state->held_locks.end();
11180 ++p)
11181 encode(p->second, bl);
11182 }
11183
11184 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11185 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11186 }
11187
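// _release_filelocks: called when an Fh is closed. Locks recorded on the
// handle are either dropped locally (if the inode is in the
// I_ERROR_FILELOCK state, i.e. the lock state may already have been lost)
// or explicitly unlocked on the MDS via _do_filelock() with removing=true.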
11188 void Client::_release_filelocks(Fh *fh)
11189 {
11190 if (!fh->fcntl_locks && !fh->flock_locks)
11191 return;
11192
11193 Inode *in = fh->inode.get();
11194 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
11195
11196 list<ceph_filelock> activated_locks;
11197
11198 list<pair<int, ceph_filelock> > to_release;
11199
11200 if (fh->fcntl_locks) {
11201 auto &lock_state = fh->fcntl_locks;
11202 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11203 auto q = p++;
11204 if (in->flags & I_ERROR_FILELOCK) {
11205 lock_state->remove_lock(q->second, activated_locks);
11206 } else {
11207 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11208 }
11209 }
11210 lock_state.reset();
11211 }
11212 if (fh->flock_locks) {
11213 auto &lock_state = fh->flock_locks;
11214 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11215 auto q = p++;
11216 if (in->flags & I_ERROR_FILELOCK) {
11217 lock_state->remove_lock(q->second, activated_locks);
11218 } else {
11219 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11220 }
11221 }
11222 lock_state.reset();
11223 }
11224
11225 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11226 in->flags &= ~I_ERROR_FILELOCK;
11227
11228 if (to_release.empty())
11229 return;
11230
11231 struct flock fl;
11232 memset(&fl, 0, sizeof(fl));
11233 fl.l_whence = SEEK_SET;
11234 fl.l_type = F_UNLCK;
11235
11236 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11237 p != to_release.end();
11238 ++p) {
11239 fl.l_start = p->second.start;
11240 fl.l_len = p->second.length;
11241 fl.l_pid = p->second.pid;
11242 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11243 p->second.owner, true);
11244 }
11245 }
11246
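// _update_lock_state: mirror a granted lock (or unlock) into a local
// ceph_lock_state_t. The owner field gets the same high-bit tag as in
// _do_filelock() so local and MDS-side records agree.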
11247 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11248 ceph_lock_state_t *lock_state)
11249 {
11250 int lock_cmd;
11251 if (F_RDLCK == fl->l_type)
11252 lock_cmd = CEPH_LOCK_SHARED;
11253 else if (F_WRLCK == fl->l_type)
11254 lock_cmd = CEPH_LOCK_EXCL;
11255 else
11256 lock_cmd = CEPH_LOCK_UNLOCK;
11257
11258 ceph_filelock filelock;
11259 filelock.start = fl->l_start;
11260 filelock.length = fl->l_len;
11261 filelock.client = 0;
11262 // see comment in _do_filelock()
11263 filelock.owner = owner | (1ULL << 63);
11264 filelock.pid = fl->l_pid;
11265 filelock.type = lock_cmd;
11266
11267 if (filelock.type == CEPH_LOCK_UNLOCK) {
11268 list<ceph_filelock> activated_locks;
11269 lock_state->remove_lock(filelock, activated_locks);
11270 } else {
11271 bool r = lock_state->add_lock(filelock, false, false, NULL);
11272 ceph_assert(r);
11273 }
11274 }
11275
11276 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11277 {
11278 Inode *in = fh->inode.get();
11279 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11280 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11281 return ret;
11282 }
11283
11284 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11285 {
11286 Inode *in = fh->inode.get();
11287 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11288 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11289 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11290 return ret;
11291 }
11292
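// _flock: implement BSD flock() semantics on top of _do_filelock().
// LOCK_NB clears the 'sleep' flag; LOCK_SH/LOCK_EX/LOCK_UN map to
// F_RDLCK/F_WRLCK/F_UNLCK over the whole file (l_start = l_len = 0).
// Illustrative mapping (hypothetical caller, not part of this file):
//   _flock(fh, LOCK_EX | LOCK_NB, owner);
//   // -> _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
//   //                 /*sleep=*/0, &fl /* F_WRLCK, SEEK_SET */, owner)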
11293 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11294 {
11295 Inode *in = fh->inode.get();
11296 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11297
11298 int sleep = !(cmd & LOCK_NB);
11299 cmd &= ~LOCK_NB;
11300
11301 int type;
11302 switch (cmd) {
11303 case LOCK_SH:
11304 type = F_RDLCK;
11305 break;
11306 case LOCK_EX:
11307 type = F_WRLCK;
11308 break;
11309 case LOCK_UN:
11310 type = F_UNLCK;
11311 break;
11312 default:
11313 return -CEPHFS_EINVAL;
11314 }
11315
11316 struct flock fl;
11317 memset(&fl, 0, sizeof(fl));
11318 fl.l_type = type;
11319 fl.l_whence = SEEK_SET;
11320
11321 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11322 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11323 return ret;
11324 }
11325
11326 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11327 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11328 if (!mref_reader.is_state_satisfied()) {
11329 return -CEPHFS_ENOTCONN;
11330 }
11331
11332 std::scoped_lock lock(client_lock);
11333 InodeRef in;
11334 int r = Client::path_walk(path, &in, perms, true);
11335 if (r < 0) {
11336 return r;
11337 }
11338
11339 if (in->snapid == CEPH_NOSNAP) {
11340 return -CEPHFS_EINVAL;
11341 }
11342
11343 snap_info->id = in->snapid;
11344 snap_info->metadata = in->snap_metadata;
11345 return 0;
11346 }
11347
11348 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11349 {
11350 /* Since the only thing this does is wrap a call to statfs, and
11351 statfs takes a lock, there seems to be no need to split it
11352 out. */
11353 return statfs(0, stbuf, perms);
11354 }
11355
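// _ll_register_callbacks: record the callbacks supplied by the low-level
// (FUSE/libcephfs) layer. Each asynchronous callback gets its own finisher
// thread, started lazily here only if that callback was actually provided.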
11356 void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
11357 {
11358 if (!args)
11359 return;
11360
11361 ldout(cct, 10) << __func__ << " cb " << args->handle
11362 << " invalidate_ino_cb " << args->ino_cb
11363 << " invalidate_dentry_cb " << args->dentry_cb
11364 << " switch_interrupt_cb " << args->switch_intr_cb
11365 << " remount_cb " << args->remount_cb
11366 << dendl;
11367 callback_handle = args->handle;
11368 if (args->ino_cb) {
11369 ino_invalidate_cb = args->ino_cb;
11370 async_ino_invalidator.start();
11371 }
11372 if (args->dentry_cb) {
11373 dentry_invalidate_cb = args->dentry_cb;
11374 async_dentry_invalidator.start();
11375 }
11376 if (args->switch_intr_cb) {
11377 switch_interrupt_cb = args->switch_intr_cb;
11378 interrupt_finisher.start();
11379 }
11380 if (args->remount_cb) {
11381 remount_cb = args->remount_cb;
11382 remount_finisher.start();
11383 }
11384 if (args->ino_release_cb) {
11385 ino_release_cb = args->ino_release_cb;
11386 async_ino_releasor.start();
11387 }
11388 if (args->umask_cb)
11389 umask_cb = args->umask_cb;
11390 }
11391
11392 // This is deprecated, use ll_register_callbacks2() instead.
11393 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11394 {
11395 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11396
11397 _ll_register_callbacks(args);
11398 }
11399
11400 int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11401 {
11402 if (is_mounting() || is_mounted() || is_unmounting())
11403 return -CEPHFS_EBUSY;
11404
11405 _ll_register_callbacks(args);
11406 return 0;
11407 }
11408
11409 std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
11410 {
11411 std::pair <int, bool> r(0, false);
11412
11413 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11414 if (!iref_reader.is_state_satisfied())
11415 return std::make_pair(-CEPHFS_ENOTCONN, false);
11416
11417 can_invalidate_dentries = can_invalidate;
11418
11419 if (can_invalidate_dentries) {
11420 ceph_assert(dentry_invalidate_cb);
11421 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11422 } else {
11423 ceph_assert(remount_cb);
11424 ldout(cct, 1) << "using remount_cb" << dendl;
11425 r = _do_remount(false);
11426 }
11427
11428 return r;
11429 }
11430
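// _sync_fs: flush everything, in order: (1) start flushing all dirty data
// in the object cacher, (2) flush dirty caps and note the flush tid,
// (3) wait for unsafe MDS requests and for the MDS to ack the cap flushes,
// then (4) drop client_lock and wait for the data flush from step 1 to
// complete.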
11431 int Client::_sync_fs()
11432 {
11433 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11434
11435 ldout(cct, 10) << __func__ << dendl;
11436
11437 // flush file data
11438 std::unique_ptr<C_SaferCond> cond = nullptr;
11439 if (cct->_conf->client_oc) {
11440 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11441 objectcacher->flush_all(cond.get());
11442 }
11443
11444 // flush caps
11445 flush_caps_sync();
11446 ceph_tid_t flush_tid = last_flush_tid;
11447
11448 // wait for unsafe mds requests
11449 wait_unsafe_requests();
11450
11451 wait_sync_caps(flush_tid);
11452
11453 if (nullptr != cond) {
11454 client_lock.unlock();
11455 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11456 cond->wait();
11457 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11458 client_lock.lock();
11459 }
11460
11461 return 0;
11462 }
11463
11464 int Client::sync_fs()
11465 {
11466 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11467 if (!mref_reader.is_state_satisfied())
11468 return -CEPHFS_ENOTCONN;
11469
11470 std::scoped_lock l(client_lock);
11471
11472 return _sync_fs();
11473 }
11474
11475 int64_t Client::drop_caches()
11476 {
11477 std::scoped_lock l(client_lock);
11478 return objectcacher->release_all();
11479 }
11480
11481 int Client::_lazyio(Fh *fh, int enable)
11482 {
11483 Inode *in = fh->inode.get();
11484 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11485
11486 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11487 return 0;
11488
11489 int orig_mode = fh->mode;
11490 if (enable) {
11491 fh->mode |= CEPH_FILE_MODE_LAZY;
11492 in->get_open_ref(fh->mode);
11493 in->put_open_ref(orig_mode);
11494 check_caps(in, CHECK_CAPS_NODELAY);
11495 } else {
11496 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11497 in->get_open_ref(fh->mode);
11498 in->put_open_ref(orig_mode);
11499 check_caps(in, 0);
11500 }
11501
11502 return 0;
11503 }
11504
11505 int Client::lazyio(int fd, int enable)
11506 {
11507 std::scoped_lock l(client_lock);
11508 Fh *f = get_filehandle(fd);
11509 if (!f)
11510 return -CEPHFS_EBADF;
11511
11512 return _lazyio(f, enable);
11513 }
11514
11515 int Client::ll_lazyio(Fh *fh, int enable)
11516 {
11517 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11518 tout(cct) << __func__ << std::endl;
11519
11520 std::scoped_lock lock(client_lock);
11521 return _lazyio(fh, enable);
11522 }
11523
11524 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11525 {
11526 std::scoped_lock l(client_lock);
11527 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11528 << ", " << offset << ", " << count << ")" << dendl;
11529
11530 Fh *f = get_filehandle(fd);
11531 if (!f)
11532 return -CEPHFS_EBADF;
11533
11534 // for now
11535 _fsync(f, true);
11536
11537 return 0;
11538 }
11539
11540 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11541 {
11542 std::scoped_lock l(client_lock);
11543 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11544 << ", " << offset << ", " << count << ")" << dendl;
11545
11546 Fh *f = get_filehandle(fd);
11547 if (!f)
11548 return -CEPHFS_EBADF;
11549 Inode *in = f->inode.get();
11550
11551 _fsync(f, true);
11552 if (_release(in)) {
11553 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11554 if (r < 0)
11555 return r;
11556 }
11557 return 0;
11558 }
11559
11560
11561 // =============================
11562 // snaps
11563
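// CephFS exposes snapshots as subdirectories of a virtual ".snap"
// directory, so creating or removing a snapshot is just a mkdir/rmdir on
// the snapdir inode obtained from open_snapdir(). Illustrative shell
// equivalent (hypothetical mount point, not part of this file):
//   mkdir /mnt/cephfs/some/dir/.snap/mysnap   # ~ mksnap("some/dir", "mysnap", ...)
//   rmdir /mnt/cephfs/some/dir/.snap/mysnap   # ~ rmsnap("some/dir", "mysnap", ...)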
11564 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11565 mode_t mode, const std::map<std::string, std::string> &metadata)
11566 {
11567 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11568 if (!mref_reader.is_state_satisfied())
11569 return -CEPHFS_ENOTCONN;
11570
11571 std::scoped_lock l(client_lock);
11572
11573 filepath path(relpath);
11574 InodeRef in;
11575 int r = path_walk(path, &in, perm);
11576 if (r < 0)
11577 return r;
11578 if (cct->_conf->client_permissions) {
11579 r = may_create(in.get(), perm);
11580 if (r < 0)
11581 return r;
11582 }
11583 Inode *snapdir = open_snapdir(in.get());
11584 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11585 }
11586
11587 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11588 {
11589 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11590 if (!mref_reader.is_state_satisfied())
11591 return -CEPHFS_ENOTCONN;
11592
11593 std::scoped_lock l(client_lock);
11594
11595 filepath path(relpath);
11596 InodeRef in;
11597 int r = path_walk(path, &in, perms);
11598 if (r < 0)
11599 return r;
11600 Inode *snapdir = open_snapdir(in.get());
11601 if (cct->_conf->client_permissions) {
11602 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11603 if (r < 0)
11604 return r;
11605 }
11606 return _rmdir(snapdir, name, perms);
11607 }
11608
11609 // =============================
11610 // expose caps
11611
11612 int Client::get_caps_issued(int fd)
11613 {
11614 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11615 if (!mref_reader.is_state_satisfied())
11616 return -CEPHFS_ENOTCONN;
11617
11618 std::scoped_lock lock(client_lock);
11619
11620 Fh *f = get_filehandle(fd);
11621 if (!f)
11622 return -CEPHFS_EBADF;
11623
11624 return f->inode->caps_issued();
11625 }
11626
11627 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11628 {
11629 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11630 if (!mref_reader.is_state_satisfied())
11631 return -CEPHFS_ENOTCONN;
11632
11633 std::scoped_lock lock(client_lock);
11634
11635 filepath p(path);
11636 InodeRef in;
11637 int r = path_walk(p, &in, perms, true);
11638 if (r < 0)
11639 return r;
11640 return in->caps_issued();
11641 }
11642
11643 // =========================================
11644 // low level
11645
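// open_snapdir: return (creating on first use) the in-memory inode
// representing a directory's virtual ".snap" dir. It shares the
// directory's ino but uses snapid CEPH_SNAPDIR, and is synthesized purely
// client-side from the parent directory's attributes.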
11646 Inode *Client::open_snapdir(Inode *diri)
11647 {
11648 Inode *in;
11649 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11650 if (!inode_map.count(vino)) {
11651 in = new Inode(this, vino, &diri->layout);
11652
11653 in->ino = diri->ino;
11654 in->snapid = CEPH_SNAPDIR;
11655 in->mode = diri->mode;
11656 in->uid = diri->uid;
11657 in->gid = diri->gid;
11658 in->nlink = 1;
11659 in->mtime = diri->mtime;
11660 in->ctime = diri->ctime;
11661 in->btime = diri->btime;
11662 in->atime = diri->atime;
11663 in->size = diri->size;
11664 in->change_attr = diri->change_attr;
11665
11666 in->dirfragtree.clear();
11667 in->snapdir_parent = diri;
11668 diri->flags |= I_SNAPDIR_OPEN;
11669 inode_map[vino] = in;
11670 if (use_faked_inos())
11671 _assign_faked_ino(in);
11672 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11673 } else {
11674 in = inode_map[vino];
11675 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11676 }
11677 return in;
11678 }
11679
11680 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11681 Inode **out, const UserPerm& perms)
11682 {
11683 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11684 if (!mref_reader.is_state_satisfied())
11685 return -CEPHFS_ENOTCONN;
11686
11687 vinodeno_t vparent = _get_vino(parent);
11688 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11689 tout(cct) << __func__ << std::endl;
11690 tout(cct) << name << std::endl;
11691
11692 std::scoped_lock lock(client_lock);
11693
11694 int r = 0;
11695 if (!fuse_default_permissions) {
11696 if (strcmp(name, ".") && strcmp(name, "..")) {
11697 r = may_lookup(parent, perms);
11698 if (r < 0)
11699 return r;
11700 }
11701 }
11702
11703 string dname(name);
11704 InodeRef in;
11705
11706 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11707 if (r < 0) {
11708 attr->st_ino = 0;
11709 goto out;
11710 }
11711
11712 ceph_assert(in);
11713 fill_stat(in, attr);
11714 _ll_get(in.get());
11715
11716 out:
11717 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11718 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11719 tout(cct) << attr->st_ino << std::endl;
11720 *out = in.get();
11721 return r;
11722 }
11723
11724 int Client::ll_lookup_vino(
11725 vinodeno_t vino,
11726 const UserPerm& perms,
11727 Inode **inode)
11728 {
11729 ceph_assert(inode != NULL);
11730 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11731 if (!mref_reader.is_state_satisfied())
11732 return -CEPHFS_ENOTCONN;
11733
11734 if (is_reserved_vino(vino))
11735 return -CEPHFS_ESTALE;
11736
11737 std::scoped_lock lock(client_lock);
11738 ldout(cct, 3) << __func__ << " " << vino << dendl;
11739
11740 // Check the cache first
11741 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11742 if (p != inode_map.end()) {
11743 *inode = p->second;
11744 _ll_get(*inode);
11745 return 0;
11746 }
11747
11748 uint64_t snapid = vino.snapid;
11749
11750 // for snapdir, find the non-snapped dir inode
11751 if (snapid == CEPH_SNAPDIR)
11752 vino.snapid = CEPH_NOSNAP;
11753
11754 int r = _lookup_vino(vino, perms, inode);
11755 if (r)
11756 return r;
11757 ceph_assert(*inode != NULL);
11758
11759 if (snapid == CEPH_SNAPDIR) {
11760 Inode *tmp = *inode;
11761
11762 // open the snapdir and put the inode ref
11763 *inode = open_snapdir(tmp);
11764 _ll_forget(tmp, 1);
11765 _ll_get(*inode);
11766 }
11767 return 0;
11768 }
11769
11770 int Client::ll_lookup_inode(
11771 struct inodeno_t ino,
11772 const UserPerm& perms,
11773 Inode **inode)
11774 {
11775 vinodeno_t vino(ino, CEPH_NOSNAP);
11776 return ll_lookup_vino(vino, perms, inode);
11777 }
11778
11779 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11780 struct ceph_statx *stx, unsigned want, unsigned flags,
11781 const UserPerm& perms)
11782 {
11783 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11784 if (!mref_reader.is_state_satisfied())
11785 return -CEPHFS_ENOTCONN;
11786
11787 vinodeno_t vparent = _get_vino(parent);
11788 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11789 tout(cct) << "ll_lookupx" << std::endl;
11790 tout(cct) << name << std::endl;
11791
11792 std::scoped_lock lock(client_lock);
11793
11794 int r = 0;
11795 if (!fuse_default_permissions) {
11796 r = may_lookup(parent, perms);
11797 if (r < 0)
11798 return r;
11799 }
11800
11801 string dname(name);
11802 InodeRef in;
11803
11804 unsigned mask = statx_to_mask(flags, want);
11805 r = _lookup(parent, dname, mask, &in, perms);
11806 if (r < 0) {
11807 stx->stx_ino = 0;
11808 stx->stx_mask = 0;
11809 } else {
11810 ceph_assert(in);
11811 fill_statx(in, mask, stx);
11812 _ll_get(in.get());
11813 }
11814
11815 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11816 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11817 tout(cct) << stx->stx_ino << std::endl;
11818 *out = in.get();
11819 return r;
11820 }
11821
11822 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11823 unsigned int want, unsigned int flags, const UserPerm& perms)
11824 {
11825 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11826 if (!mref_reader.is_state_satisfied())
11827 return -CEPHFS_ENOTCONN;
11828
11829 filepath fp(name, 0);
11830 InodeRef in;
11831 int rc;
11832 unsigned mask = statx_to_mask(flags, want);
11833
11834 ldout(cct, 3) << __func__ << " " << name << dendl;
11835 tout(cct) << __func__ << std::endl;
11836 tout(cct) << name << std::endl;
11837
11838 std::scoped_lock lock(client_lock);
11839 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11840 if (rc < 0) {
11841 /* zero out mask, just in case... */
11842 stx->stx_mask = 0;
11843 stx->stx_ino = 0;
11844 *out = NULL;
11845 return rc;
11846 } else {
11847 ceph_assert(in);
11848 fill_statx(in, mask, stx);
11849 _ll_get(in.get());
11850 *out = in.get();
11851 return 0;
11852 }
11853 }
11854
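// _ll_get/_ll_put manage 'll_ref', the reference count held on behalf of
// the low-level (libcephfs/FUSE) API. The first ll reference pins the
// inode and its parent dentry; dropping the last one unpins them and
// releases any per-snapshot count in ll_snap_ref. _ll_forget implements
// the FUSE FORGET operation, dropping 'count' references at once.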
11855 void Client::_ll_get(Inode *in)
11856 {
11857 if (in->ll_ref == 0) {
11858 in->iget();
11859 if (in->is_dir() && !in->dentries.empty()) {
11860 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11861 in->get_first_parent()->get(); // pin dentry
11862 }
11863 if (in->snapid != CEPH_NOSNAP)
11864 ll_snap_ref[in->snapid]++;
11865 }
11866 in->ll_get();
11867 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
11868 }
11869
11870 int Client::_ll_put(Inode *in, uint64_t num)
11871 {
11872 in->ll_put(num);
11873 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
11874 if (in->ll_ref == 0) {
11875 if (in->is_dir() && !in->dentries.empty()) {
11876 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11877 in->get_first_parent()->put(); // unpin dentry
11878 }
11879 if (in->snapid != CEPH_NOSNAP) {
11880 auto p = ll_snap_ref.find(in->snapid);
11881 ceph_assert(p != ll_snap_ref.end());
11882 ceph_assert(p->second > 0);
11883 if (--p->second == 0)
11884 ll_snap_ref.erase(p);
11885 }
11886 put_inode(in);
11887 return 0;
11888 } else {
11889 return in->ll_ref;
11890 }
11891 }
11892
11893 void Client::_ll_drop_pins()
11894 {
11895 ldout(cct, 10) << __func__ << dendl;
11896 std::set<InodeRef> to_be_put; // this set is destructed item by item on scope exit, dropping the refs
11897 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11898 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11899 it != inode_map.end();
11900 it = next) {
11901 Inode *in = it->second;
11902 next = it;
11903 ++next;
11904 if (in->ll_ref){
11905 to_be_put.insert(in);
11906 _ll_put(in, in->ll_ref);
11907 }
11908 }
11909 }
11910
11911 bool Client::_ll_forget(Inode *in, uint64_t count)
11912 {
11913 inodeno_t ino = in->ino;
11914
11915 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11916 tout(cct) << __func__ << std::endl;
11917 tout(cct) << ino.val << std::endl;
11918 tout(cct) << count << std::endl;
11919
11920 // Ignore forget if we're no longer mounted
11921 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11922 if (!mref_reader.is_state_satisfied())
11923 return true;
11924
11925 if (ino == 1) return true; // ignore forget on root.
11926
11927 bool last = false;
11928 if (in->ll_ref < count) {
11929 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11930 << ", which only has ll_ref=" << in->ll_ref << dendl;
11931 _ll_put(in, in->ll_ref);
11932 last = true;
11933 } else {
11934 if (_ll_put(in, count) == 0)
11935 last = true;
11936 }
11937
11938 return last;
11939 }
11940
11941 bool Client::ll_forget(Inode *in, uint64_t count)
11942 {
11943 std::scoped_lock lock(client_lock);
11944 return _ll_forget(in, count);
11945 }
11946
11947 bool Client::ll_put(Inode *in)
11948 {
11949 /* ll_forget already takes the lock */
11950 return ll_forget(in, 1);
11951 }
11952
11953 int Client::ll_get_snap_ref(snapid_t snap)
11954 {
11955 std::scoped_lock lock(client_lock);
11956 auto p = ll_snap_ref.find(snap);
11957 if (p != ll_snap_ref.end())
11958 return p->second;
11959 return 0;
11960 }
11961
11962 snapid_t Client::ll_get_snapid(Inode *in)
11963 {
11964 std::scoped_lock lock(client_lock);
11965 return in->snapid;
11966 }
11967
11968 Inode *Client::ll_get_inode(ino_t ino)
11969 {
11970 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11971 if (!mref_reader.is_state_satisfied())
11972 return NULL;
11973
11974 std::scoped_lock lock(client_lock);
11975
11976 vinodeno_t vino = _map_faked_ino(ino);
11977 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11978 if (p == inode_map.end())
11979 return NULL;
11980 Inode *in = p->second;
11981 _ll_get(in);
11982 return in;
11983 }
11984
11985 Inode *Client::ll_get_inode(vinodeno_t vino)
11986 {
11987 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11988 if (!mref_reader.is_state_satisfied())
11989 return NULL;
11990
11991 if (is_reserved_vino(vino))
11992 return NULL;
11993
11994 std::scoped_lock lock(client_lock);
11995
11996 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11997 if (p == inode_map.end())
11998 return NULL;
11999 Inode *in = p->second;
12000 _ll_get(in);
12001 return in;
12002 }
12003
12004 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12005 {
12006 vinodeno_t vino = _get_vino(in);
12007
12008 ldout(cct, 8) << __func__ << " " << vino << dendl;
12009 tout(cct) << __func__ << std::endl;
12010 tout(cct) << vino.ino.val << std::endl;
12011
12012 if (vino.snapid < CEPH_NOSNAP)
12013 return 0;
12014 else
12015 return _getattr(in, caps, perms);
12016 }
12017
12018 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12019 {
12020 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12021 if (!mref_reader.is_state_satisfied())
12022 return -CEPHFS_ENOTCONN;
12023
12024 std::scoped_lock lock(client_lock);
12025
12026 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12027
12028 if (res == 0)
12029 fill_stat(in, attr);
12030 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12031 return res;
12032 }
12033
12034 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12035 unsigned int flags, const UserPerm& perms)
12036 {
12037 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12038 if (!mref_reader.is_state_satisfied())
12039 return -CEPHFS_ENOTCONN;
12040
12041 std::scoped_lock lock(client_lock);
12042
12043 int res = 0;
12044 unsigned mask = statx_to_mask(flags, want);
12045
12046 if (mask && !in->caps_issued_mask(mask, true))
12047 res = _ll_getattr(in, mask, perms);
12048
12049 if (res == 0)
12050 fill_statx(in, mask, stx);
12051 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12052 return res;
12053 }
12054
12055 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12056 const UserPerm& perms, InodeRef *inp)
12057 {
12058 vinodeno_t vino = _get_vino(in);
12059
12060 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
12061 << dendl;
12062 tout(cct) << __func__ << std::endl;
12063 tout(cct) << vino.ino.val << std::endl;
12064 tout(cct) << stx->stx_mode << std::endl;
12065 tout(cct) << stx->stx_uid << std::endl;
12066 tout(cct) << stx->stx_gid << std::endl;
12067 tout(cct) << stx->stx_size << std::endl;
12068 tout(cct) << stx->stx_mtime << std::endl;
12069 tout(cct) << stx->stx_atime << std::endl;
12070 tout(cct) << stx->stx_btime << std::endl;
12071 tout(cct) << mask << std::endl;
12072
12073 if (!fuse_default_permissions) {
12074 int res = may_setattr(in, stx, mask, perms);
12075 if (res < 0)
12076 return res;
12077 }
12078
12079 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
12080
12081 return __setattrx(in, stx, mask, perms, inp);
12082 }
12083
12084 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12085 const UserPerm& perms)
12086 {
12087 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12088 if (!mref_reader.is_state_satisfied())
12089 return -CEPHFS_ENOTCONN;
12090
12091 std::scoped_lock lock(client_lock);
12092
12093 InodeRef target(in);
12094 int res = _ll_setattrx(in, stx, mask, perms, &target);
12095 if (res == 0) {
12096 ceph_assert(in == target.get());
12097 fill_statx(in, in->caps_issued(), stx);
12098 }
12099
12100 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12101 return res;
12102 }
12103
12104 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12105 const UserPerm& perms)
12106 {
12107 struct ceph_statx stx;
12108 stat_to_statx(attr, &stx);
12109
12110 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12111 if (!mref_reader.is_state_satisfied())
12112 return -CEPHFS_ENOTCONN;
12113
12114 std::scoped_lock lock(client_lock);
12115
12116 InodeRef target(in);
12117 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12118 if (res == 0) {
12119 ceph_assert(in == target.get());
12120 fill_stat(in, attr);
12121 }
12122
12123 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12124 return res;
12125 }
12126
12127
12128 // ----------
12129 // xattrs
12130
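// Each xattr operation comes in three flavors mirroring the POSIX API:
// path-based following symlinks (getxattr), path-based without following
// them (lgetxattr), and by file descriptor (fgetxattr). All of them
// resolve to the same _getxattr/_setxattr/_removexattr/_listxattr workers
// under client_lock.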
12131 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12132 const UserPerm& perms)
12133 {
12134 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12135 if (!mref_reader.is_state_satisfied())
12136 return -CEPHFS_ENOTCONN;
12137
12138 std::scoped_lock lock(client_lock);
12139
12140 InodeRef in;
12141 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12142 if (r < 0)
12143 return r;
12144 return _getxattr(in, name, value, size, perms);
12145 }
12146
12147 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12148 const UserPerm& perms)
12149 {
12150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12151 if (!mref_reader.is_state_satisfied())
12152 return -CEPHFS_ENOTCONN;
12153
12154 std::scoped_lock lock(client_lock);
12155
12156 InodeRef in;
12157 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12158 if (r < 0)
12159 return r;
12160 return _getxattr(in, name, value, size, perms);
12161 }
12162
12163 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12164 const UserPerm& perms)
12165 {
12166 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12167 if (!mref_reader.is_state_satisfied())
12168 return -CEPHFS_ENOTCONN;
12169
12170 std::scoped_lock lock(client_lock);
12171
12172 Fh *f = get_filehandle(fd);
12173 if (!f)
12174 return -CEPHFS_EBADF;
12175 return _getxattr(f->inode, name, value, size, perms);
12176 }
12177
12178 int Client::listxattr(const char *path, char *list, size_t size,
12179 const UserPerm& perms)
12180 {
12181 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12182 if (!mref_reader.is_state_satisfied())
12183 return -CEPHFS_ENOTCONN;
12184
12185 std::scoped_lock lock(client_lock);
12186
12187 InodeRef in;
12188 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12189 if (r < 0)
12190 return r;
12191 return Client::_listxattr(in.get(), list, size, perms);
12192 }
12193
12194 int Client::llistxattr(const char *path, char *list, size_t size,
12195 const UserPerm& perms)
12196 {
12197 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12198 if (!mref_reader.is_state_satisfied())
12199 return -CEPHFS_ENOTCONN;
12200
12201 std::scoped_lock lock(client_lock);
12202
12203 InodeRef in;
12204 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12205 if (r < 0)
12206 return r;
12207 return Client::_listxattr(in.get(), list, size, perms);
12208 }
12209
12210 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12211 {
12212 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12213 if (!mref_reader.is_state_satisfied())
12214 return -CEPHFS_ENOTCONN;
12215
12216 std::scoped_lock lock(client_lock);
12217
12218 Fh *f = get_filehandle(fd);
12219 if (!f)
12220 return -CEPHFS_EBADF;
12221 return Client::_listxattr(f->inode.get(), list, size, perms);
12222 }
12223
12224 int Client::removexattr(const char *path, const char *name,
12225 const UserPerm& perms)
12226 {
12227 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12228 if (!mref_reader.is_state_satisfied())
12229 return -CEPHFS_ENOTCONN;
12230
12231 std::scoped_lock lock(client_lock);
12232
12233 InodeRef in;
12234 int r = Client::path_walk(path, &in, perms, true);
12235 if (r < 0)
12236 return r;
12237 return _removexattr(in, name, perms);
12238 }
12239
12240 int Client::lremovexattr(const char *path, const char *name,
12241 const UserPerm& perms)
12242 {
12243 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12244 if (!mref_reader.is_state_satisfied())
12245 return -CEPHFS_ENOTCONN;
12246
12247 std::scoped_lock lock(client_lock);
12248
12249 InodeRef in;
12250 int r = Client::path_walk(path, &in, perms, false);
12251 if (r < 0)
12252 return r;
12253 return _removexattr(in, name, perms);
12254 }
12255
12256 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12257 {
12258 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12259 if (!mref_reader.is_state_satisfied())
12260 return -CEPHFS_ENOTCONN;
12261
12262 std::scoped_lock lock(client_lock);
12263
12264 Fh *f = get_filehandle(fd);
12265 if (!f)
12266 return -CEPHFS_EBADF;
12267 return _removexattr(f->inode, name, perms);
12268 }
12269
12270 int Client::setxattr(const char *path, const char *name, const void *value,
12271 size_t size, int flags, const UserPerm& perms)
12272 {
12273 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12274 if (!mref_reader.is_state_satisfied())
12275 return -CEPHFS_ENOTCONN;
12276
12277 _setxattr_maybe_wait_for_osdmap(name, value, size);
12278
12279 std::scoped_lock lock(client_lock);
12280
12281 InodeRef in;
12282 int r = Client::path_walk(path, &in, perms, true);
12283 if (r < 0)
12284 return r;
12285 return _setxattr(in, name, value, size, flags, perms);
12286 }
12287
12288 int Client::lsetxattr(const char *path, const char *name, const void *value,
12289 size_t size, int flags, const UserPerm& perms)
12290 {
12291 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12292 if (!mref_reader.is_state_satisfied())
12293 return -CEPHFS_ENOTCONN;
12294
12295 _setxattr_maybe_wait_for_osdmap(name, value, size);
12296
12297 std::scoped_lock lock(client_lock);
12298
12299 InodeRef in;
12300 int r = Client::path_walk(path, &in, perms, false);
12301 if (r < 0)
12302 return r;
12303 return _setxattr(in, name, value, size, flags, perms);
12304 }
12305
12306 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12307 int flags, const UserPerm& perms)
12308 {
12309 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12310 if (!mref_reader.is_state_satisfied())
12311 return -CEPHFS_ENOTCONN;
12312
12313 _setxattr_maybe_wait_for_osdmap(name, value, size);
12314
12315 std::scoped_lock lock(client_lock);
12316
12317 Fh *f = get_filehandle(fd);
12318 if (!f)
12319 return -CEPHFS_EBADF;
12320 return _setxattr(f->inode, name, value, size, flags, perms);
12321 }
12322
12323 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12324 const UserPerm& perms)
12325 {
12326 int r;
12327 const VXattr *vxattr = nullptr;
12328
12329 vxattr = _match_vxattr(in, name);
12330 if (vxattr) {
12331 r = -CEPHFS_ENODATA;
12332
12333 // Do a forced getattr to fetch the latest backing values (e.g. quota,
12334 // rstats) before returning a vxattr value to userspace.
12335 int flags = 0;
12336 if (vxattr->flags & VXATTR_RSTAT) {
12337 flags |= CEPH_STAT_RSTAT;
12338 }
12339 if (vxattr->flags & VXATTR_DIRSTAT) {
12340 flags |= CEPH_CAP_FILE_SHARED;
12341 }
12342 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12343 if (r != 0) {
12344 // Error from getattr!
12345 return r;
12346 }
12347
12348 // call pointer-to-member function
12349 char buf[256];
12350 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12351 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12352 } else {
12353 r = -CEPHFS_ENODATA;
12354 }
12355
12356 if (size != 0) {
12357 if (r > (int)size) {
12358 r = -CEPHFS_ERANGE;
12359 } else if (r > 0) {
12360 memcpy(value, buf, r);
12361 }
12362 }
12363 goto out;
12364 }
12365
12366 if (!strncmp(name, "ceph.", 5)) {
12367 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12368 goto out;
12369 }
12370
12371 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12372 r = -CEPHFS_EOPNOTSUPP;
12373 goto out;
12374 }
12375
12376 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12377 if (r == 0) {
12378 string n(name);
12379 r = -CEPHFS_ENODATA;
12380 if (in->xattrs.count(n)) {
12381 r = in->xattrs[n].length();
12382 if (r > 0 && size != 0) {
12383 if (size >= (unsigned)r)
12384 memcpy(value, in->xattrs[n].c_str(), r);
12385 else
12386 r = -CEPHFS_ERANGE;
12387 }
12388 }
12389 }
12390 out:
12391 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12392 return r;
12393 }
12394
12395 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12396 const UserPerm& perms)
12397 {
12398 if (cct->_conf->client_permissions) {
12399 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12400 if (r < 0)
12401 return r;
12402 }
12403 return _getxattr(in.get(), name, value, size, perms);
12404 }
12405
12406 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12407 size_t size, const UserPerm& perms)
12408 {
12409 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12410 if (!mref_reader.is_state_satisfied())
12411 return -CEPHFS_ENOTCONN;
12412
12413 vinodeno_t vino = _get_vino(in);
12414
12415 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12416 tout(cct) << __func__ << std::endl;
12417 tout(cct) << vino.ino.val << std::endl;
12418 tout(cct) << name << std::endl;
12419
12420 std::scoped_lock lock(client_lock);
12421 if (!fuse_default_permissions) {
12422 int r = xattr_permission(in, name, MAY_READ, perms);
12423 if (r < 0)
12424 return r;
12425 }
12426
12427 return _getxattr(in, name, value, size, perms);
12428 }
12429
12430 int Client::_listxattr(Inode *in, char *name, size_t size,
12431 const UserPerm& perms)
12432 {
12433 bool len_only = (size == 0);
12434 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12435 if (r != 0) {
12436 goto out;
12437 }
12438
12439 r = 0;
12440 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12441 if (xattr_name.rfind("ceph.", 0) == 0) {
12442 continue;
12443 }
12444
12445 size_t this_len = xattr_name.length() + 1;
12446 r += this_len;
12447 if (len_only)
12448 continue;
12449
12450 if (this_len > size) {
12451 r = -CEPHFS_ERANGE;
12452 goto out;
12453 }
12454
12455 memcpy(name, xattr_name.c_str(), this_len);
12456 name += this_len;
12457 size -= this_len;
12458 }
12459 out:
12460 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12461 return r;
12462 }
12463
12464 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12465 const UserPerm& perms)
12466 {
12467 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12468 if (!mref_reader.is_state_satisfied())
12469 return -CEPHFS_ENOTCONN;
12470
12471 vinodeno_t vino = _get_vino(in);
12472
12473 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12474 tout(cct) << __func__ << std::endl;
12475 tout(cct) << vino.ino.val << std::endl;
12476 tout(cct) << size << std::endl;
12477
12478 std::scoped_lock lock(client_lock);
12479 return _listxattr(in, names, size, perms);
12480 }
12481
12482 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12483 size_t size, int flags, const UserPerm& perms)
12484 {
12485
12486 int xattr_flags = 0;
12487 if (!value)
12488 xattr_flags |= CEPH_XATTR_REMOVE;
12489 if (flags & XATTR_CREATE)
12490 xattr_flags |= CEPH_XATTR_CREATE;
12491 if (flags & XATTR_REPLACE)
12492 xattr_flags |= CEPH_XATTR_REPLACE;
12493
12494 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12495 filepath path;
12496 in->make_nosnap_relative_path(path);
12497 req->set_filepath(path);
12498 req->set_string2(name);
12499 req->set_inode(in);
12500 req->head.args.setxattr.flags = xattr_flags;
12501
12502 bufferlist bl;
12503 ceph_assert(value || size == 0);
12504 bl.append((const char*)value, size);
12505 req->set_data(bl);
12506
12507 int res = make_request(req, perms);
12508
12509 trim_cache();
12510 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12511 res << dendl;
12512 return res;
12513 }
12514
12515 int Client::_setxattr(Inode *in, const char *name, const void *value,
12516 size_t size, int flags, const UserPerm& perms)
12517 {
12518 if (in->snapid != CEPH_NOSNAP) {
12519 return -CEPHFS_EROFS;
12520 }
12521
12522 if (size == 0) {
12523 value = "";
12524 } else if (value == NULL) {
12525 return -CEPHFS_EINVAL;
12526 }
12527
12528 bool posix_acl_xattr = false;
12529 if (acl_type == POSIX_ACL)
12530 posix_acl_xattr = !strncmp(name, "system.", 7);
12531
12532 if (strncmp(name, "user.", 5) &&
12533 strncmp(name, "security.", 9) &&
12534 strncmp(name, "trusted.", 8) &&
12535 strncmp(name, "ceph.", 5) &&
12536 !posix_acl_xattr)
12537 return -CEPHFS_EOPNOTSUPP;
12538
12539 bool check_realm = false;
12540
12541 if (posix_acl_xattr) {
12542 if (!strcmp(name, ACL_EA_ACCESS)) {
12543 mode_t new_mode = in->mode;
12544 if (value) {
12545 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12546 if (ret < 0)
12547 return ret;
12548 if (ret == 0) {
12549 value = NULL;
12550 size = 0;
12551 }
12552 if (new_mode != in->mode) {
12553 struct ceph_statx stx;
12554 stx.stx_mode = new_mode;
12555 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12556 if (ret < 0)
12557 return ret;
12558 }
12559 }
12560 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12561 if (value) {
12562 if (!S_ISDIR(in->mode))
12563 return -CEPHFS_EACCES;
12564 int ret = posix_acl_check(value, size);
12565 if (ret < 0)
12566 return -CEPHFS_EINVAL;
12567 if (ret == 0) {
12568 value = NULL;
12569 size = 0;
12570 }
12571 }
12572 } else {
12573 return -CEPHFS_EOPNOTSUPP;
12574 }
12575 } else {
12576 const VXattr *vxattr = _match_vxattr(in, name);
12577 if (vxattr) {
12578 if (vxattr->readonly)
12579 return -CEPHFS_EOPNOTSUPP;
12580 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12581 check_realm = true;
12582 }
12583 }
12584
12585 int ret = _do_setxattr(in, name, value, size, flags, perms);
12586 if (ret >= 0 && check_realm) {
12587 // check if snaprealm was created for quota inode
12588 if (in->quota.is_enable() &&
12589 !(in->snaprealm && in->snaprealm->ino == in->ino))
12590 ret = -CEPHFS_EOPNOTSUPP;
12591 }
12592
12593 return ret;
12594 }
12595
12596 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12597 size_t size, int flags, const UserPerm& perms)
12598 {
12599 if (cct->_conf->client_permissions) {
12600 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12601 if (r < 0)
12602 return r;
12603 }
12604 return _setxattr(in.get(), name, value, size, flags, perms);
12605 }
12606
12607 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12608 {
12609 string tmp;
12610 if (name == "layout") {
12611 string::iterator begin = value.begin();
12612 string::iterator end = value.end();
12613 keys_and_values<string::iterator> p; // create instance of parser
12614 std::map<string, string> m; // map to receive results
12615 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12616 return -CEPHFS_EINVAL;
12617 }
12618 if (begin != end)
12619 return -CEPHFS_EINVAL;
12620 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12621 if (q->first == "pool") {
12622 tmp = q->second;
12623 break;
12624 }
12625 }
12626 } else if (name == "layout.pool") {
12627 tmp = value;
12628 }
12629
12630 if (tmp.length()) {
12631 int64_t pool;
12632 try {
12633 pool = boost::lexical_cast<unsigned>(tmp);
12634 if (!osdmap->have_pg_pool(pool))
12635 return -CEPHFS_ENOENT;
12636 } catch (boost::bad_lexical_cast const&) {
12637 pool = osdmap->lookup_pg_pool_name(tmp);
12638 if (pool < 0) {
12639 return -CEPHFS_ENOENT;
12640 }
12641 }
12642 }
12643
12644 return 0;
12645 }
12646
12647 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12648 {
12649 // Setting the pool in a layout xattr requires the MetaRequest to carry an osdmap epoch.
12650 // There is a race where a newly created data pool is known to neither the client nor the MDS.
12651 // Fetch the latest osdmap on the client so the MDS can quickly judge whether it needs a newer one.
12652 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12653 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12654 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12655 string rest(strstr(name, "layout"));
12656 string v((const char*)value, size);
12657 int r = objecter->with_osdmap([&](const OSDMap& o) {
12658 return _setxattr_check_data_pool(rest, v, &o);
12659 });
12660
12661 if (r == -CEPHFS_ENOENT) {
12662 bs::error_code ec;
12663 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12664 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12665 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12666 }
12667 }
12668 }
12669
12670 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12671 size_t size, int flags, const UserPerm& perms)
12672 {
12673 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12674 if (!mref_reader.is_state_satisfied())
12675 return -CEPHFS_ENOTCONN;
12676
12677 _setxattr_maybe_wait_for_osdmap(name, value, size);
12678
12679 vinodeno_t vino = _get_vino(in);
12680
12681 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12682 tout(cct) << __func__ << std::endl;
12683 tout(cct) << vino.ino.val << std::endl;
12684 tout(cct) << name << std::endl;
12685
12686 std::scoped_lock lock(client_lock);
12687 if (!fuse_default_permissions) {
12688 int r = xattr_permission(in, name, MAY_WRITE, perms);
12689 if (r < 0)
12690 return r;
12691 }
12692 return _setxattr(in, name, value, size, flags, perms);
12693 }
12694
12695 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12696 {
12697 if (in->snapid != CEPH_NOSNAP) {
12698 return -CEPHFS_EROFS;
12699 }
12700
12701 // same set of xattrs as supported by the kernel client
12702 if (strncmp(name, "user.", 5) &&
12703 strncmp(name, "system.", 7) &&
12704 strncmp(name, "security.", 9) &&
12705 strncmp(name, "trusted.", 8) &&
12706 strncmp(name, "ceph.", 5))
12707 return -CEPHFS_EOPNOTSUPP;
12708
12709 const VXattr *vxattr = _match_vxattr(in, name);
12710 if (vxattr && vxattr->readonly)
12711 return -CEPHFS_EOPNOTSUPP;
12712
12713 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12714 filepath path;
12715 in->make_nosnap_relative_path(path);
12716 req->set_filepath(path);
12717 req->set_filepath2(name);
12718 req->set_inode(in);
12719
12720 int res = make_request(req, perms);
12721
12722 trim_cache();
12723 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12724 return res;
12725 }
12726
12727 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12728 {
12729 if (cct->_conf->client_permissions) {
12730 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12731 if (r < 0)
12732 return r;
12733 }
12734 return _removexattr(in.get(), name, perms);
12735 }
12736
12737 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12738 {
12739 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12740 if (!mref_reader.is_state_satisfied())
12741 return -CEPHFS_ENOTCONN;
12742
12743 vinodeno_t vino = _get_vino(in);
12744
12745 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12746 tout(cct) << "ll_removexattr" << std::endl;
12747 tout(cct) << vino.ino.val << std::endl;
12748 tout(cct) << name << std::endl;
12749
12750 std::scoped_lock lock(client_lock);
12751 if (!fuse_default_permissions) {
12752 int r = xattr_permission(in, name, MAY_WRITE, perms);
12753 if (r < 0)
12754 return r;
12755 }
12756
12757 return _removexattr(in, name, perms);
12758 }
12759
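// The _vxattrcb_* helpers below render virtual "ceph.*" xattr values into
// the caller's buffer with snprintf and return the formatted length. As
// with snprintf, the returned length may exceed 'size', which lets
// _getxattr() probe for the required buffer size and report
// -CEPHFS_ERANGE when the caller's buffer is too small.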
12760 bool Client::_vxattrcb_quota_exists(Inode *in)
12761 {
12762 return in->quota.is_enable() &&
12763 (in->snapid != CEPH_NOSNAP ||
12764 (in->snaprealm && in->snaprealm->ino == in->ino));
12765 }
12766 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12767 {
12768 return snprintf(val, size,
12769 "max_bytes=%lld max_files=%lld",
12770 (long long int)in->quota.max_bytes,
12771 (long long int)in->quota.max_files);
12772 }
12773 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12774 {
12775 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12776 }
12777 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12778 {
12779 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12780 }
12781
12782 bool Client::_vxattrcb_layout_exists(Inode *in)
12783 {
12784 return in->layout != file_layout_t();
12785 }
12786 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12787 {
12788 int r = snprintf(val, size,
12789 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12790 (unsigned long long)in->layout.stripe_unit,
12791 (unsigned long long)in->layout.stripe_count,
12792 (unsigned long long)in->layout.object_size);
12793 objecter->with_osdmap([&](const OSDMap& o) {
12794 if (o.have_pg_pool(in->layout.pool_id))
12795 r += snprintf(val + r, size - r, "%s",
12796 o.get_pool_name(in->layout.pool_id).c_str());
12797 else
12798 r += snprintf(val + r, size - r, "%" PRIu64,
12799 (uint64_t)in->layout.pool_id);
12800 });
12801 if (in->layout.pool_ns.length())
12802 r += snprintf(val + r, size - r, " pool_namespace=%s",
12803 in->layout.pool_ns.c_str());
12804 return r;
12805 }
12806 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12807 {
12808 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12809 }
12810 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12811 {
12812 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12813 }
12814 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12815 {
12816 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12817 }
12818 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12819 {
12820 size_t r;
12821 objecter->with_osdmap([&](const OSDMap& o) {
12822 if (o.have_pg_pool(in->layout.pool_id))
12823 r = snprintf(val, size, "%s", o.get_pool_name(
12824 in->layout.pool_id).c_str());
12825 else
12826 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12827 });
12828 return r;
12829 }
12830 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12831 {
12832 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12833 }
12834 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12835 {
12836 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12837 }
12838 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12839 {
12840 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12841 }
12842 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12843 {
12844 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12845 }
12846 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
12847 {
12848 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
12849 }
12850 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
12851 {
12852 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
12853 }
12854 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
12855 {
12856 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
12857 }
12858 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
12859 {
12860 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
12861 }
12862 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
12863 {
12864 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
12865 }
12866 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
12867 {
12868 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
12869 (long)in->rstat.rctime.nsec());
12870 }
12871 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
12872 {
12873 return in->dir_pin != -CEPHFS_ENODATA;
12874 }
12875 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
12876 {
12877 return snprintf(val, size, "%ld", (long)in->dir_pin);
12878 }
12879
12880 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
12881 {
12882 return !in->snap_btime.is_zero();
12883 }
12884
12885 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
12886 {
12887 return snprintf(val, size, "%llu.%09lu",
12888 (long long unsigned)in->snap_btime.sec(),
12889 (long unsigned)in->snap_btime.nsec());
12890 }
12891
12892 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
12893 {
12894 int issued;
12895
12896 in->caps_issued(&issued);
12897 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
12898 }
12899
12900 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
12901 {
12902 // checking one of the xattrs would suffice
12903 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
12904 }
12905
12906 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
12907 {
12908 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
12909 in->xattrs["ceph.mirror.info.cluster_id"].length(),
12910 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
12911 in->xattrs["ceph.mirror.info.fs_id"].length(),
12912 in->xattrs["ceph.mirror.info.fs_id"].c_str());
12913 }
12914
12915 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
12916 {
12917 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
12918 }
12919
12920 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
12921 {
12922 auto name = messenger->get_myname();
12923 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
12924 }
12925
12926 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12927 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12928
12929 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12930 { \
12931 name: CEPH_XATTR_NAME(_type, _name), \
12932 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12933 readonly: true, \
12934 exists_cb: NULL, \
12935 flags: _flags, \
12936 }
12937 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12938 { \
12939 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12940 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12941 readonly: false, \
12942 exists_cb: &Client::_vxattrcb_layout_exists, \
12943 flags: 0, \
12944 }
12945 #define XATTR_QUOTA_FIELD(_type, _name) \
12946 { \
12947 name: CEPH_XATTR_NAME(_type, _name), \
12948 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12949 readonly: false, \
12950 exists_cb: &Client::_vxattrcb_quota_exists, \
12951 flags: 0, \
12952 }
12953
12954 const Client::VXattr Client::_dir_vxattrs[] = {
12955 {
12956 name: "ceph.dir.layout",
12957 getxattr_cb: &Client::_vxattrcb_layout,
12958 readonly: false,
12959 exists_cb: &Client::_vxattrcb_layout_exists,
12960 flags: 0,
12961 },
12962 // FIXME
12963 // Delete the following dir layout field definitions for release "S"
12964 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
12965 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
12966 XATTR_LAYOUT_FIELD(dir, layout, object_size),
12967 XATTR_LAYOUT_FIELD(dir, layout, pool),
12968 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
12969 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
12970 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
12971 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
12972 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
12973 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
12974 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
12975 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
12976 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
12977 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
12978 {
12979 name: "ceph.quota",
12980 getxattr_cb: &Client::_vxattrcb_quota,
12981 readonly: false,
12982 exists_cb: &Client::_vxattrcb_quota_exists,
12983 flags: 0,
12984 },
12985 XATTR_QUOTA_FIELD(quota, max_bytes),
12986 XATTR_QUOTA_FIELD(quota, max_files),
12987 // FIXME
12988 // Delete the following dir pin field definitions for release "S"
12989 {
12990 name: "ceph.dir.pin",
12991 getxattr_cb: &Client::_vxattrcb_dir_pin,
12992 readonly: false,
12993 exists_cb: &Client::_vxattrcb_dir_pin_exists,
12994 flags: 0,
12995 },
12996 {
12997 name: "ceph.snap.btime",
12998 getxattr_cb: &Client::_vxattrcb_snap_btime,
12999 readonly: true,
13000 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13001 flags: 0,
13002 },
13003 {
13004 name: "ceph.mirror.info",
13005 getxattr_cb: &Client::_vxattrcb_mirror_info,
13006 readonly: false,
13007 exists_cb: &Client::_vxattrcb_mirror_info_exists,
13008 flags: 0,
13009 },
13010 {
13011 name: "ceph.caps",
13012 getxattr_cb: &Client::_vxattrcb_caps,
13013 readonly: true,
13014 exists_cb: NULL,
13015 flags: 0,
13016 },
13017 { name: "" } /* Required table terminator */
13018 };
13019
13020 const Client::VXattr Client::_file_vxattrs[] = {
13021 {
13022 name: "ceph.file.layout",
13023 getxattr_cb: &Client::_vxattrcb_layout,
13024 readonly: false,
13025 exists_cb: &Client::_vxattrcb_layout_exists,
13026 flags: 0,
13027 },
13028 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
13029 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
13030 XATTR_LAYOUT_FIELD(file, layout, object_size),
13031 XATTR_LAYOUT_FIELD(file, layout, pool),
13032 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
13033 {
13034 name: "ceph.snap.btime",
13035 getxattr_cb: &Client::_vxattrcb_snap_btime,
13036 readonly: true,
13037 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13038 flags: 0,
13039 },
13040 {
13041 name: "ceph.caps",
13042 getxattr_cb: &Client::_vxattrcb_caps,
13043 readonly: true,
13044 exists_cb: NULL,
13045 flags: 0,
13046 },
13047 { name: "" } /* Required table terminator */
13048 };
13049
13050 const Client::VXattr Client::_common_vxattrs[] = {
13051 {
13052 name: "ceph.cluster_fsid",
13053 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
13054 readonly: true,
13055 exists_cb: nullptr,
13056 flags: 0,
13057 },
13058 {
13059 name: "ceph.client_id",
13060 getxattr_cb: &Client::_vxattrcb_client_id,
13061 readonly: true,
13062 exists_cb: nullptr,
13063 flags: 0,
13064 },
13065 { name: "" } /* Required table terminator */
13066 };
13067
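// Lookup path for virtual xattrs: _get_vxattrs() picks the per-type
// table (directories vs. regular files) and _match_vxattr() falls back
// to _common_vxattrs for any inode. Tables are scanned linearly and are
// terminated by the empty-name sentinel entry.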
13068 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13069 {
13070 if (in->is_dir())
13071 return _dir_vxattrs;
13072 else if (in->is_file())
13073 return _file_vxattrs;
13074 return NULL;
13075 }
13076
13077 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13078 {
13079 if (strncmp(name, "ceph.", 5) == 0) {
13080 const VXattr *vxattr = _get_vxattrs(in);
13081 if (vxattr) {
13082 while (!vxattr->name.empty()) {
13083 if (vxattr->name == name)
13084 return vxattr;
13085 vxattr++;
13086 }
13087 }
13088
13089 // for common vxattrs
13090 vxattr = _common_vxattrs;
13091 while (!vxattr->name.empty()) {
13092 if (vxattr->name == name)
13093 return vxattr;
13094 vxattr++;
13095 }
13096 }
13097
13098 return NULL;
13099 }
13100
13101 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
13102 {
13103 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13104 if (!mref_reader.is_state_satisfied())
13105 return -CEPHFS_ENOTCONN;
13106
13107 vinodeno_t vino = _get_vino(in);
13108
13109 ldout(cct, 3) << "ll_readlink " << vino << dendl;
13110 tout(cct) << "ll_readlink" << std::endl;
13111 tout(cct) << vino.ino.val << std::endl;
13112
13113 std::scoped_lock lock(client_lock);
13114 for (auto dn : in->dentries) {
13115 touch_dn(dn);
13116 }
13117
13118 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
13119 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
13120 return r;
13121 }
13122
13123 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
13124 const UserPerm& perms, InodeRef *inp)
13125 {
13126 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
13127 << mode << dec << ", " << rdev << ", uid " << perms.uid()
13128 << ", gid " << perms.gid() << ")" << dendl;
13129
13130 if (strlen(name) > NAME_MAX)
13131 return -CEPHFS_ENAMETOOLONG;
13132
13133 if (dir->snapid != CEPH_NOSNAP) {
13134 return -CEPHFS_EROFS;
13135 }
13136 if (is_quota_files_exceeded(dir, perms)) {
13137 return -CEPHFS_EDQUOT;
13138 }
13139
13140 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
13141
13142 filepath path;
13143 dir->make_nosnap_relative_path(path);
13144 path.push_dentry(name);
13145 req->set_filepath(path);
13146 req->set_inode(dir);
13147 req->head.args.mknod.rdev = rdev;
13148 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13149 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13150
13151 bufferlist xattrs_bl;
13152 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13153 if (res < 0)
13154 goto fail;
13155 req->head.args.mknod.mode = mode;
13156 if (xattrs_bl.length() > 0)
13157 req->set_data(xattrs_bl);
13158
13159 Dentry *de;
13160 res = get_or_create(dir, name, &de);
13161 if (res < 0)
13162 goto fail;
13163 req->set_dentry(de);
13164
13165 res = make_request(req, perms, inp);
13166
13167 trim_cache();
13168
13169 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13170 return res;
13171
13172 fail:
13173 put_request(req);
13174 return res;
13175 }
13176
13177 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13178 dev_t rdev, struct stat *attr, Inode **out,
13179 const UserPerm& perms)
13180 {
13181 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13182 if (!mref_reader.is_state_satisfied())
13183 return -CEPHFS_ENOTCONN;
13184
13185 vinodeno_t vparent = _get_vino(parent);
13186
13187 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13188 tout(cct) << "ll_mknod" << std::endl;
13189 tout(cct) << vparent.ino.val << std::endl;
13190 tout(cct) << name << std::endl;
13191 tout(cct) << mode << std::endl;
13192 tout(cct) << rdev << std::endl;
13193
13194 std::scoped_lock lock(client_lock);
13195 if (!fuse_default_permissions) {
13196 int r = may_create(parent, perms);
13197 if (r < 0)
13198 return r;
13199 }
13200
13201 InodeRef in;
13202 int r = _mknod(parent, name, mode, rdev, perms, &in);
13203 if (r == 0) {
13204 fill_stat(in, attr);
13205 _ll_get(in.get());
13206 }
13207 tout(cct) << attr->st_ino << std::endl;
13208 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13209 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13210 *out = in.get();
13211 return r;
13212 }
13213
13214 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13215 dev_t rdev, Inode **out,
13216 struct ceph_statx *stx, unsigned want, unsigned flags,
13217 const UserPerm& perms)
13218 {
13219 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13220 if (!mref_reader.is_state_satisfied())
13221 return -CEPHFS_ENOTCONN;
13222
13223 unsigned caps = statx_to_mask(flags, want);
13224
13225 vinodeno_t vparent = _get_vino(parent);
13226
13227 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13228 tout(cct) << "ll_mknodx" << std::endl;
13229 tout(cct) << vparent.ino.val << std::endl;
13230 tout(cct) << name << std::endl;
13231 tout(cct) << mode << std::endl;
13232 tout(cct) << rdev << std::endl;
13233
13234 std::scoped_lock lock(client_lock);
13235
13236 if (!fuse_default_permissions) {
13237 int r = may_create(parent, perms);
13238 if (r < 0)
13239 return r;
13240 }
13241
13242 InodeRef in;
13243 int r = _mknod(parent, name, mode, rdev, perms, &in);
13244 if (r == 0) {
13245 fill_statx(in, caps, stx);
13246 _ll_get(in.get());
13247 }
13248 tout(cct) << stx->stx_ino << std::endl;
13249 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13250 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13251 *out = in.get();
13252 return r;
13253 }
13254
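// Note: layout parameters are passed through to the MDS as-is; an
// explicit data_pool name, however, is resolved against the osdmap
// first and rejected with -CEPHFS_EINVAL if unknown, or -CEPHFS_ERANGE
// if the pool id does not fit the 32-bit wire field.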
13255 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
13256 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
13257 int object_size, const char *data_pool, bool *created,
13258 const UserPerm& perms, std::string alternate_name)
13259 {
13260 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
13261 mode << dec << ")" << dendl;
13262
13263 if (strlen(name) > NAME_MAX)
13264 return -CEPHFS_ENAMETOOLONG;
13265 if (dir->snapid != CEPH_NOSNAP) {
13266 return -CEPHFS_EROFS;
13267 }
13268 if (is_quota_files_exceeded(dir, perms)) {
13269 return -CEPHFS_EDQUOT;
13270 }
13271
13272 // use normalized flags to generate cmode
13273 int cflags = ceph_flags_sys2wire(flags);
13274 if (cct->_conf.get_val<bool>("client_force_lazyio"))
13275 cflags |= CEPH_O_LAZY;
13276
13277 int cmode = ceph_flags_to_mode(cflags);
13278
13279 int64_t pool_id = -1;
13280 if (data_pool && *data_pool) {
13281 pool_id = objecter->with_osdmap(
13282 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
13283 if (pool_id < 0)
13284 return -CEPHFS_EINVAL;
13285 if (pool_id > 0xffffffffll)
13286 return -CEPHFS_ERANGE; // bummer!
13287 }
13288
13289 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
13290
13291 filepath path;
13292 dir->make_nosnap_relative_path(path);
13293 path.push_dentry(name);
13294 req->set_filepath(path);
13295 req->set_alternate_name(std::move(alternate_name));
13296 req->set_inode(dir);
13297 req->head.args.open.flags = cflags | CEPH_O_CREAT;
13298
13299 req->head.args.open.stripe_unit = stripe_unit;
13300 req->head.args.open.stripe_count = stripe_count;
13301 req->head.args.open.object_size = object_size;
13302 if (cct->_conf->client_debug_getattr_caps)
13303 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
13304 else
13305 req->head.args.open.mask = 0;
13306 req->head.args.open.pool = pool_id;
13307 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13308 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13309
13310 mode |= S_IFREG;
13311 bufferlist xattrs_bl;
13312 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13313 if (res < 0)
13314 goto fail;
13315 req->head.args.open.mode = mode;
13316 if (xattrs_bl.length() > 0)
13317 req->set_data(xattrs_bl);
13318
13319 Dentry *de;
13320 res = get_or_create(dir, name, &de);
13321 if (res < 0)
13322 goto fail;
13323 req->set_dentry(de);
13324
13325 res = make_request(req, perms, inp, created);
13326 if (res < 0) {
13327 goto reply_error;
13328 }
13329
13330 /* If the caller passed a value in fhp, do the open */
13331 if (fhp) {
13332 (*inp)->get_open_ref(cmode);
13333 *fhp = _create_fh(inp->get(), flags, cmode, perms);
13334 }
13335
13336 reply_error:
13337 trim_cache();
13338
13339 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
13340 << " layout " << stripe_unit
13341 << ' ' << stripe_count
13342 << ' ' << object_size
13343 <<") = " << res << dendl;
13344 return res;
13345
13346 fail:
13347 put_request(req);
13348 return res;
13349 }
13350
13351 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
13352 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13353 std::string alternate_name)
13354 {
13355 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
13356 << mode << dec << ", uid " << perm.uid()
13357 << ", gid " << perm.gid() << ")" << dendl;
13358
13359 if (strlen(name) > NAME_MAX)
13360 return -CEPHFS_ENAMETOOLONG;
13361
13362 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13363 return -CEPHFS_EROFS;
13364 }
13365 if (is_quota_files_exceeded(dir, perm)) {
13366 return -CEPHFS_EDQUOT;
13367 }
13368
13369 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13370 MetaRequest *req = new MetaRequest(is_snap_op ?
13371 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13372
13373 filepath path;
13374 dir->make_nosnap_relative_path(path);
13375 path.push_dentry(name);
13376 req->set_filepath(path);
13377 req->set_inode(dir);
13378 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13379 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13380 req->set_alternate_name(std::move(alternate_name));
13381
13382 mode |= S_IFDIR;
13383 bufferlist bl;
13384 int res = _posix_acl_create(dir, &mode, bl, perm);
13385 if (res < 0)
13386 goto fail;
13387 req->head.args.mkdir.mode = mode;
13388 if (is_snap_op) {
13389 SnapPayload payload;
13390 // clear the bufferlist that may have been populated by the call
13391 // to _posix_acl_create(); MDS mksnap does not make use of it,
13392 // so reuse it to carry the snapshot metadata payload.
13393 bl.clear();
13394 payload.metadata = metadata;
13395 encode(payload, bl);
13396 }
13397 if (bl.length() > 0) {
13398 req->set_data(bl);
13399 }
13400
13401 Dentry *de;
13402 res = get_or_create(dir, name, &de);
13403 if (res < 0)
13404 goto fail;
13405 req->set_dentry(de);
13406
13407 ldout(cct, 10) << "_mkdir: making request" << dendl;
13408 res = make_request(req, perm, inp);
13409 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13410
13411 trim_cache();
13412
13413 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13414 return res;
13415
13416 fail:
13417 put_request(req);
13418 return res;
13419 }
13420
13421 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13422 struct stat *attr, Inode **out, const UserPerm& perm)
13423 {
13424 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13425 if (!mref_reader.is_state_satisfied())
13426 return -CEPHFS_ENOTCONN;
13427
13428 vinodeno_t vparent = _get_vino(parent);
13429
13430 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13431 tout(cct) << "ll_mkdir" << std::endl;
13432 tout(cct) << vparent.ino.val << std::endl;
13433 tout(cct) << name << std::endl;
13434 tout(cct) << mode << std::endl;
13435
13436 std::scoped_lock lock(client_lock);
13437
13438 if (!fuse_default_permissions) {
13439 int r = may_create(parent, perm);
13440 if (r < 0)
13441 return r;
13442 }
13443
13444 InodeRef in;
13445 int r = _mkdir(parent, name, mode, perm, &in);
13446 if (r == 0) {
13447 fill_stat(in, attr);
13448 _ll_get(in.get());
13449 }
13450 tout(cct) << attr->st_ino << std::endl;
13451 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13452 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13453 *out = in.get();
13454 return r;
13455 }
13456
13457 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13458 struct ceph_statx *stx, unsigned want, unsigned flags,
13459 const UserPerm& perms)
13460 {
13461 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13462 if (!mref_reader.is_state_satisfied())
13463 return -CEPHFS_ENOTCONN;
13464
13465 vinodeno_t vparent = _get_vino(parent);
13466
13467 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13468 tout(cct) << "ll_mkdirx" << std::endl;
13469 tout(cct) << vparent.ino.val << std::endl;
13470 tout(cct) << name << std::endl;
13471 tout(cct) << mode << std::endl;
13472
13473 std::scoped_lock lock(client_lock);
13474
13475 if (!fuse_default_permissions) {
13476 int r = may_create(parent, perms);
13477 if (r < 0)
13478 return r;
13479 }
13480
13481 InodeRef in;
13482 int r = _mkdir(parent, name, mode, perms, &in);
13483 if (r == 0) {
13484 fill_statx(in, statx_to_mask(flags, want), stx);
13485 _ll_get(in.get());
13486 } else {
13487 stx->stx_ino = 0;
13488 stx->stx_mask = 0;
13489 }
13490 tout(cct) << stx->stx_ino << std::endl;
13491 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13492 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13493 *out = in.get();
13494 return r;
13495 }
13496
13497 int Client::_symlink(Inode *dir, const char *name, const char *target,
13498 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13499 {
13500 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13501 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13502 << dendl;
13503
13504 if (strlen(name) > NAME_MAX)
13505 return -CEPHFS_ENAMETOOLONG;
13506
13507 if (dir->snapid != CEPH_NOSNAP) {
13508 return -CEPHFS_EROFS;
13509 }
13510 if (is_quota_files_exceeded(dir, perms)) {
13511 return -CEPHFS_EDQUOT;
13512 }
13513
13514 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13515
13516 filepath path;
13517 dir->make_nosnap_relative_path(path);
13518 path.push_dentry(name);
13519 req->set_filepath(path);
13520 req->set_alternate_name(std::move(alternate_name));
13521 req->set_inode(dir);
13522 req->set_string2(target);
13523 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13524 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13525
13526 Dentry *de;
13527 int res = get_or_create(dir, name, &de);
13528 if (res < 0)
13529 goto fail;
13530 req->set_dentry(de);
13531
13532 res = make_request(req, perms, inp);
13533
13534 trim_cache();
13535 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13536 res << dendl;
13537 return res;
13538
13539 fail:
13540 put_request(req);
13541 return res;
13542 }
13543
13544 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13545 struct stat *attr, Inode **out, const UserPerm& perms)
13546 {
13547 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13548 if (!mref_reader.is_state_satisfied())
13549 return -CEPHFS_ENOTCONN;
13550
13551 vinodeno_t vparent = _get_vino(parent);
13552
13553 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13554 << dendl;
13555 tout(cct) << "ll_symlink" << std::endl;
13556 tout(cct) << vparent.ino.val << std::endl;
13557 tout(cct) << name << std::endl;
13558 tout(cct) << value << std::endl;
13559
13560 std::scoped_lock lock(client_lock);
13561
13562 if (!fuse_default_permissions) {
13563 int r = may_create(parent, perms);
13564 if (r < 0)
13565 return r;
13566 }
13567
13568 InodeRef in;
13569 int r = _symlink(parent, name, value, perms, "", &in);
13570 if (r == 0) {
13571 fill_stat(in, attr);
13572 _ll_get(in.get());
13573 }
13574 tout(cct) << attr->st_ino << std::endl;
13575 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13576 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13577 *out = in.get();
13578 return r;
13579 }
13580
13581 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13582 Inode **out, struct ceph_statx *stx, unsigned want,
13583 unsigned flags, const UserPerm& perms)
13584 {
13585 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13586 if (!mref_reader.is_state_satisfied())
13587 return -CEPHFS_ENOTCONN;
13588
13589 vinodeno_t vparent = _get_vino(parent);
13590
13591 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13592 << dendl;
13593 tout(cct) << "ll_symlinkx" << std::endl;
13594 tout(cct) << vparent.ino.val << std::endl;
13595 tout(cct) << name << std::endl;
13596 tout(cct) << value << std::endl;
13597
13598 std::scoped_lock lock(client_lock);
13599
13600 if (!fuse_default_permissions) {
13601 int r = may_create(parent, perms);
13602 if (r < 0)
13603 return r;
13604 }
13605
13606 InodeRef in;
13607 int r = _symlink(parent, name, value, perms, "", &in);
13608 if (r == 0) {
13609 fill_statx(in, statx_to_mask(flags, want), stx);
13610 _ll_get(in.get());
13611 }
13612 tout(cct) << stx->stx_ino << std::endl;
13613 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13614 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13615 *out = in.get();
13616 return r;
13617 }
13618
13619 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13620 {
13621 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13622 << " uid " << perm.uid() << " gid " << perm.gid()
13623 << ")" << dendl;
13624
13625 if (dir->snapid != CEPH_NOSNAP) {
13626 return -CEPHFS_EROFS;
13627 }
13628
13629 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13630
13631 filepath path;
13632 dir->make_nosnap_relative_path(path);
13633 path.push_dentry(name);
13634 req->set_filepath(path);
13635
13636 InodeRef otherin;
13637 Inode *in;
13638 Dentry *de;
13639
13640 int res = get_or_create(dir, name, &de);
13641 if (res < 0)
13642 goto fail;
13643 req->set_dentry(de);
13644 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13645 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13646
13647 res = _lookup(dir, name, 0, &otherin, perm);
13648 if (res < 0)
13649 goto fail;
13650
13651 in = otherin.get();
13652 req->set_other_inode(in);
13653 in->break_all_delegs();
13654 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13655
13656 req->set_inode(dir);
13657
13658 res = make_request(req, perm);
13659
13660 trim_cache();
13661 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13662 return res;
13663
13664 fail:
13665 put_request(req);
13666 return res;
13667 }
13668
13669 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13670 {
13671 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13672 if (!mref_reader.is_state_satisfied())
13673 return -CEPHFS_ENOTCONN;
13674
13675 vinodeno_t vino = _get_vino(in);
13676
13677 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13678 tout(cct) << "ll_unlink" << std::endl;
13679 tout(cct) << vino.ino.val << std::endl;
13680 tout(cct) << name << std::endl;
13681
13682 std::scoped_lock lock(client_lock);
13683
13684 if (!fuse_default_permissions) {
13685 int r = may_delete(in, name, perm);
13686 if (r < 0)
13687 return r;
13688 }
13689 return _unlink(in, name, perm);
13690 }
13691
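// _rmdir doubles as snapshot removal: under the CEPH_SNAPDIR inode the
// request is sent as CEPH_MDS_OP_RMSNAP and the snap dentry is
// invalidated by hand, mirroring the RENAMESNAP handling below (whose
// reply carries no trace dentry).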
13692 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13693 {
13694 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13695 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13696
13697 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13698 return -CEPHFS_EROFS;
13699 }
13700
13701 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13702 MetaRequest *req = new MetaRequest(op);
13703 filepath path;
13704 dir->make_nosnap_relative_path(path);
13705 path.push_dentry(name);
13706 req->set_filepath(path);
13707 req->set_inode(dir);
13708
13709 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13710 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13711 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13712
13713 InodeRef in;
13714
13715 Dentry *de;
13716 int res = get_or_create(dir, name, &de);
13717 if (res < 0)
13718 goto fail;
13719 if (op == CEPH_MDS_OP_RMDIR)
13720 req->set_dentry(de);
13721 else
13722 de->get();
13723
13724 res = _lookup(dir, name, 0, &in, perms);
13725 if (res < 0)
13726 goto fail;
13727
13728 if (op == CEPH_MDS_OP_RMSNAP) {
13729 unlink(de, true, true);
13730 de->put();
13731 }
13732 req->set_other_inode(in.get());
13733
13734 res = make_request(req, perms);
13735
13736 trim_cache();
13737 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13738 return res;
13739
13740 fail:
13741 put_request(req);
13742 return res;
13743 }
13744
13745 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13746 {
13747 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13748 if (!mref_reader.is_state_satisfied())
13749 return -CEPHFS_ENOTCONN;
13750
13751 vinodeno_t vino = _get_vino(in);
13752
13753 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13754 tout(cct) << "ll_rmdir" << std::endl;
13755 tout(cct) << vino.ino.val << std::endl;
13756 tout(cct) << name << std::endl;
13757
13758 std::scoped_lock lock(client_lock);
13759
13760 if (!fuse_default_permissions) {
13761 int r = may_delete(in, name, perms);
13762 if (r < 0)
13763 return r;
13764 }
13765
13766 return _rmdir(in, name, perms);
13767 }
13768
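// Renames may not cross snapshot contexts or quota roots; either case
// fails with -CEPHFS_EXDEV, as a cross-device rename(2) would. A rename
// inside the snapdir is translated to CEPH_MDS_OP_RENAMESNAP.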
13769 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13770 {
13771 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13772 << todir->ino << " " << toname
13773 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13774 << dendl;
13775
13776 if (fromdir->snapid != todir->snapid)
13777 return -CEPHFS_EXDEV;
13778
13779 int op = CEPH_MDS_OP_RENAME;
13780 if (fromdir->snapid != CEPH_NOSNAP) {
13781 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13782 op = CEPH_MDS_OP_RENAMESNAP;
13783 else
13784 return -CEPHFS_EROFS;
13785 }
13786 if (fromdir != todir) {
13787 Inode *fromdir_root =
13788 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13789 Inode *todir_root =
13790 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13791 if (fromdir_root != todir_root) {
13792 return -CEPHFS_EXDEV;
13793 }
13794 }
13795
13796 InodeRef target;
13797 MetaRequest *req = new MetaRequest(op);
13798
13799 filepath from;
13800 fromdir->make_nosnap_relative_path(from);
13801 from.push_dentry(fromname);
13802 filepath to;
13803 todir->make_nosnap_relative_path(to);
13804 to.push_dentry(toname);
13805 req->set_filepath(to);
13806 req->set_filepath2(from);
13807 req->set_alternate_name(std::move(alternate_name));
13808
13809 Dentry *oldde;
13810 int res = get_or_create(fromdir, fromname, &oldde);
13811 if (res < 0)
13812 goto fail;
13813 Dentry *de;
13814 res = get_or_create(todir, toname, &de);
13815 if (res < 0)
13816 goto fail;
13817
13818 if (op == CEPH_MDS_OP_RENAME) {
13819 req->set_old_dentry(oldde);
13820 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13821 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13822
13823 req->set_dentry(de);
13824 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13825 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13826
13827 InodeRef oldin, otherin;
13828 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13829 if (res < 0)
13830 goto fail;
13831
13832 Inode *oldinode = oldin.get();
13833 oldinode->break_all_delegs();
13834 req->set_old_inode(oldinode);
13835 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13836
13837 res = _lookup(todir, toname, 0, &otherin, perm);
13838 switch (res) {
13839 case 0:
13840 {
13841 Inode *in = otherin.get();
13842 req->set_other_inode(in);
13843 in->break_all_delegs();
13844 }
13845 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13846 break;
13847 case -CEPHFS_ENOENT:
13848 break;
13849 default:
13850 goto fail;
13851 }
13852
13853 req->set_inode(todir);
13854 } else {
13855 // renamesnap reply contains no tracedn, so we need to invalidate
13856 // dentry manually
13857 unlink(oldde, true, true);
13858 unlink(de, true, true);
13859
13860 req->set_inode(todir);
13861 }
13862
13863 res = make_request(req, perm, &target);
13864 ldout(cct, 10) << "rename result is " << res << dendl;
13865
13866 // trim the renamed item from our cache
13867
13868 trim_cache();
13869 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
13870 return res;
13871
13872 fail:
13873 put_request(req);
13874 return res;
13875 }
13876
13877 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13878 const char *newname, const UserPerm& perm)
13879 {
13880 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13881 if (!mref_reader.is_state_satisfied())
13882 return -CEPHFS_ENOTCONN;
13883
13884 vinodeno_t vparent = _get_vino(parent);
13885 vinodeno_t vnewparent = _get_vino(newparent);
13886
13887 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13888 << vnewparent << " " << newname << dendl;
13889 tout(cct) << "ll_rename" << std::endl;
13890 tout(cct) << vparent.ino.val << std::endl;
13891 tout(cct) << name << std::endl;
13892 tout(cct) << vnewparent.ino.val << std::endl;
13893 tout(cct) << newname << std::endl;
13894
13895 std::scoped_lock lock(client_lock);
13896
13897 if (!fuse_default_permissions) {
13898 int r = may_delete(parent, name, perm);
13899 if (r < 0)
13900 return r;
13901 r = may_delete(newparent, newname, perm);
13902 if (r < 0 && r != -CEPHFS_ENOENT)
13903 return r;
13904 }
13905
13906 return _rename(parent, name, newparent, newname, perm, "");
13907 }
13908
13909 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
13910 {
13911 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
13912 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13913
13914 if (strlen(newname) > NAME_MAX)
13915 return -CEPHFS_ENAMETOOLONG;
13916
13917 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
13918 return -CEPHFS_EROFS;
13919 }
13920 if (is_quota_files_exceeded(dir, perm)) {
13921 return -CEPHFS_EDQUOT;
13922 }
13923
13924 in->break_all_delegs();
13925 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13926
13927 filepath path(newname, dir->ino);
13928 req->set_filepath(path);
13929 req->set_alternate_name(std::move(alternate_name));
13930 filepath existing(in->ino);
13931 req->set_filepath2(existing);
13932
13933 req->set_inode(dir);
13934 req->inode_drop = CEPH_CAP_FILE_SHARED;
13935 req->inode_unless = CEPH_CAP_FILE_EXCL;
13936
13937 Dentry *de;
13938 int res = get_or_create(dir, newname, &de);
13939 if (res < 0)
13940 goto fail;
13941 req->set_dentry(de);
13942
13943 res = make_request(req, perm, inp);
13944 ldout(cct, 10) << "link result is " << res << dendl;
13945
13946 trim_cache();
13947 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
13948 return res;
13949
13950 fail:
13951 put_request(req);
13952 return res;
13953 }
13954
13955 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13956 const UserPerm& perm)
13957 {
13958 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13959 if (!mref_reader.is_state_satisfied())
13960 return -CEPHFS_ENOTCONN;
13961
13962 vinodeno_t vino = _get_vino(in);
13963 vinodeno_t vnewparent = _get_vino(newparent);
13964
13965 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13966 newname << dendl;
13967 tout(cct) << "ll_link" << std::endl;
13968 tout(cct) << vino.ino.val << std::endl;
13969 tout(cct) << vnewparent << std::endl;
13970 tout(cct) << newname << std::endl;
13971
13972 InodeRef target;
13973
13974 std::scoped_lock lock(client_lock);
13975
13976 if (!fuse_default_permissions) {
13977 if (S_ISDIR(in->mode))
13978 return -CEPHFS_EPERM;
13979
13980 int r = may_hardlink(in, perm);
13981 if (r < 0)
13982 return r;
13983
13984 r = may_create(newparent, perm);
13985 if (r < 0)
13986 return r;
13987 }
13988
13989 return _link(in, newparent, newname, perm, "", &target);
13990 }
13991
13992 int Client::ll_num_osds(void)
13993 {
13994 std::scoped_lock lock(client_lock);
13995 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13996 }
13997
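// Return the first address of the given OSD as a host-byte-order IPv4
// value in *addr; returns -1 (note: not a -CEPHFS_* code) if the OSD
// does not exist in the current osdmap.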
13998 int Client::ll_osdaddr(int osd, uint32_t *addr)
13999 {
14000 std::scoped_lock lock(client_lock);
14001
14002 entity_addr_t g;
14003 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14004 if (!o.exists(osd))
14005 return false;
14006 g = o.get_addrs(osd).front();
14007 return true;
14008 });
14009 if (!exists)
14010 return -1;
14011 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14012 *addr = ntohl(nb_addr);
14013 return 0;
14014 }
14015
14016 uint32_t Client::ll_stripe_unit(Inode *in)
14017 {
14018 std::scoped_lock lock(client_lock);
14019 return in->layout.stripe_unit;
14020 }
14021
14022 uint64_t Client::ll_snap_seq(Inode *in)
14023 {
14024 std::scoped_lock lock(client_lock);
14025 return in->snaprealm->seq;
14026 }
14027
14028 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14029 {
14030 std::scoped_lock lock(client_lock);
14031 *layout = in->layout;
14032 return 0;
14033 }
14034
14035 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14036 {
14037 return ll_file_layout(fh->inode.get(), layout);
14038 }
14039
14040 /* Currently we cannot take advantage of redundancy in reads, since we
14041 would have to go through all possible placement groups (a
14042 potentially quite large number determined by a hash), and use CRUSH
14043 to calculate the appropriate set of OSDs for each placement group,
14044 then index into that. An array with one entry per OSD is much more
14045 tractable and works for demonstration purposes. */
14046
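// Worked example with illustrative layout values (not from the source):
// stripe_unit = 1 MiB, stripe_count = 3, object_size = 4 MiB gives
// stripes_per_object = 4. For blockno = 10: stripeno = 10 / 3 = 3,
// stripepos = 10 % 3 = 1, objectsetno = 3 / 4 = 0, and
// objectno = 0 * 3 + 1 = 1, so block 10 lands in file object 1 and the
// function returns that object's primary OSD.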
14047 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
14048 file_layout_t* layout)
14049 {
14050 std::scoped_lock lock(client_lock);
14051
14052 inodeno_t ino = in->ino;
14053 uint32_t object_size = layout->object_size;
14054 uint32_t su = layout->stripe_unit;
14055 uint32_t stripe_count = layout->stripe_count;
14056 uint64_t stripes_per_object = object_size / su;
14057 uint64_t stripeno = 0, stripepos = 0;
14058
14059 if (stripe_count) {
14060 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
14061 stripepos = blockno % stripe_count; // which object in the object set (X)
14062 }
14063 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
14064 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
14065
14066 object_t oid = file_object_t(ino, objectno);
14067 return objecter->with_osdmap([&](const OSDMap& o) {
14068 ceph_object_layout olayout =
14069 o.file_to_object_layout(oid, *layout);
14070 pg_t pg = (pg_t)olayout.ol_pgid;
14071 vector<int> osds;
14072 int primary;
14073 o.pg_to_acting_osds(pg, &osds, &primary);
14074 return primary;
14075 });
14076 }
14077
14078 /* Return the offset of the block, internal to the object */
14079
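// Continuing the example above (stripes_per_object = 4, su = 1 MiB):
// blockno = 10 gives 10 % 4 = 2, i.e. the block starts 2 MiB into its
// object.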
14080 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14081 {
14082 std::scoped_lock lock(client_lock);
14083 file_layout_t *layout=&(in->layout);
14084 uint32_t object_size = layout->object_size;
14085 uint32_t su = layout->stripe_unit;
14086 uint64_t stripes_per_object = object_size / su;
14087
14088 return (blockno % stripes_per_object) * su;
14089 }
14090
14091 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14092 const UserPerm& perms)
14093 {
14094 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14095 if (!mref_reader.is_state_satisfied())
14096 return -CEPHFS_ENOTCONN;
14097
14098 vinodeno_t vino = _get_vino(in);
14099
14100 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14101 tout(cct) << "ll_opendir" << std::endl;
14102 tout(cct) << vino.ino.val << std::endl;
14103
14104 std::scoped_lock lock(client_lock);
14105
14106 if (!fuse_default_permissions) {
14107 int r = may_open(in, flags, perms);
14108 if (r < 0)
14109 return r;
14110 }
14111
14112 int r = _opendir(in, dirpp, perms);
14113 tout(cct) << (uintptr_t)*dirpp << std::endl;
14114
14115 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14116 << dendl;
14117 return r;
14118 }
14119
14120 int Client::ll_releasedir(dir_result_t *dirp)
14121 {
14122 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14123 if (!mref_reader.is_state_satisfied())
14124 return -CEPHFS_ENOTCONN;
14125
14126 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14127 tout(cct) << "ll_releasedir" << std::endl;
14128 tout(cct) << (uintptr_t)dirp << std::endl;
14129
14130 std::scoped_lock lock(client_lock);
14131
14132 _closedir(dirp);
14133 return 0;
14134 }
14135
14136 int Client::ll_fsyncdir(dir_result_t *dirp)
14137 {
14138 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14139 if (!mref_reader.is_state_satisfied())
14140 return -CEPHFS_ENOTCONN;
14141
14142 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14143 tout(cct) << "ll_fsyncdir" << std::endl;
14144 tout(cct) << (uintptr_t)dirp << std::endl;
14145
14146 std::scoped_lock lock(client_lock);
14147 return _fsync(dirp->inode.get(), false);
14148 }
14149
14150 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14151 {
14152 ceph_assert(!(flags & O_CREAT));
14153
14154 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14155 if (!mref_reader.is_state_satisfied())
14156 return -CEPHFS_ENOTCONN;
14157
14158 vinodeno_t vino = _get_vino(in);
14159
14160 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14161 tout(cct) << "ll_open" << std::endl;
14162 tout(cct) << vino.ino.val << std::endl;
14163 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14164
14165 std::scoped_lock lock(client_lock);
14166
14167 int r;
14168 if (!fuse_default_permissions) {
14169 r = may_open(in, flags, perms);
14170 if (r < 0)
14171 goto out;
14172 }
14173
14174 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14175
14176 out:
14177 Fh *fhptr = fhp ? *fhp : NULL;
14178 if (fhptr) {
14179 ll_unclosed_fh_set.insert(fhptr);
14180 }
14181 tout(cct) << (uintptr_t)fhptr << std::endl;
14182 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14183 " = " << r << " (" << fhptr << ")" << dendl;
14184 return r;
14185 }
14186
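// Common body for ll_create()/ll_createx(): look the name up first so
// that O_CREAT|O_EXCL can fail with -CEPHFS_EEXIST, create the file on
// -CEPHFS_ENOENT when O_CREAT is set, and otherwise open the
// pre-existing inode (with a permission check unless FUSE handles it).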
14187 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14188 int flags, InodeRef *in, int caps, Fh **fhp,
14189 const UserPerm& perms)
14190 {
14191 *fhp = NULL;
14192
14193 vinodeno_t vparent = _get_vino(parent);
14194
14195 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14196 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14197 << ", gid " << perms.gid() << dendl;
14198 tout(cct) << "ll_create" << std::endl;
14199 tout(cct) << vparent.ino.val << std::endl;
14200 tout(cct) << name << std::endl;
14201 tout(cct) << mode << std::endl;
14202 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14203
14204 bool created = false;
14205 int r = _lookup(parent, name, caps, in, perms);
14206
14207 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
14208 return -CEPHFS_EEXIST;
14209
14210 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
14211 if (!fuse_default_permissions) {
14212 r = may_create(parent, perms);
14213 if (r < 0)
14214 goto out;
14215 }
14216 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
14217 perms, "");
14218 if (r < 0)
14219 goto out;
14220 }
14221
14222 if (r < 0)
14223 goto out;
14224
14225 ceph_assert(*in);
14226
14227 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14228 if (!created) {
14229 if (!fuse_default_permissions) {
14230 r = may_open(in->get(), flags, perms);
14231 if (r < 0) {
14232 if (*fhp) {
14233 int release_r = _release_fh(*fhp);
14234 ceph_assert(release_r == 0); // during create, no async data ops should have happened
14235 }
14236 goto out;
14237 }
14238 }
14239 if (*fhp == NULL) {
14240 r = _open(in->get(), flags, mode, fhp, perms);
14241 if (r < 0)
14242 goto out;
14243 }
14244 }
14245
14246 out:
14247 if (*fhp) {
14248 ll_unclosed_fh_set.insert(*fhp);
14249 }
14250
14251 ino_t ino = 0;
14252 if (r >= 0) {
14253 Inode *inode = in->get();
14254 if (use_faked_inos())
14255 ino = inode->faked_ino;
14256 else
14257 ino = inode->ino;
14258 }
14259
14260 tout(cct) << (uintptr_t)*fhp << std::endl;
14261 tout(cct) << ino << std::endl;
14262 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14263 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14264 *fhp << " " << hex << ino << dec << ")" << dendl;
14265
14266 return r;
14267 }
14268
14269 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14270 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14271 const UserPerm& perms)
14272 {
14273 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14274 if (!mref_reader.is_state_satisfied())
14275 return -CEPHFS_ENOTCONN;
14276
14277 std::scoped_lock lock(client_lock);
14278 InodeRef in;
14279
14280 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14281 fhp, perms);
14282 if (r >= 0) {
14283 ceph_assert(in);
14284
14285 // passing an Inode in outp requires an additional ref
14286 if (outp) {
14287 _ll_get(in.get());
14288 *outp = in.get();
14289 }
14290 fill_stat(in, attr);
14291 } else {
14292 attr->st_ino = 0;
14293 }
14294
14295 return r;
14296 }
14297
14298 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14299 int oflags, Inode **outp, Fh **fhp,
14300 struct ceph_statx *stx, unsigned want, unsigned lflags,
14301 const UserPerm& perms)
14302 {
14303 unsigned caps = statx_to_mask(lflags, want);
14304 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14305 if (!mref_reader.is_state_satisfied())
14306 return -CEPHFS_ENOTCONN;
14307
14308 std::scoped_lock lock(client_lock);
14309 InodeRef in;
14310
14311 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14312 if (r >= 0) {
14313 ceph_assert(in);
14314
14315 // passing an Inode in outp requires an additional ref
14316 if (outp) {
14317 _ll_get(in.get());
14318 *outp = in.get();
14319 }
14320 fill_statx(in, caps, stx);
14321 } else {
14322 stx->stx_ino = 0;
14323 stx->stx_mask = 0;
14324 }
14325
14326 return r;
14327 }
14328
14329 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14330 {
14331 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14332 if (!mref_reader.is_state_satisfied())
14333 return -CEPHFS_ENOTCONN;
14334
14335 tout(cct) << "ll_lseek" << std::endl;
14336 tout(cct) << offset << std::endl;
14337 tout(cct) << whence << std::endl;
14338
14339 std::scoped_lock lock(client_lock);
14340 return _lseek(fh, offset, whence);
14341 }
14342
14343 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14344 {
14345 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14346 if (!mref_reader.is_state_satisfied())
14347 return -CEPHFS_ENOTCONN;
14348
14349 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14350 tout(cct) << "ll_read" << std::endl;
14351 tout(cct) << (uintptr_t)fh << std::endl;
14352 tout(cct) << off << std::endl;
14353 tout(cct) << len << std::endl;
14354
14355 /* We can't return a read larger than INT_MAX, so clamp len to that */
14356 len = std::min(len, (loff_t)INT_MAX);
14357 std::scoped_lock lock(client_lock);
14358
14359 int r = _read(fh, off, len, bl);
14360 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14361 << dendl;
14362 return r;
14363 }
14364
14365 int Client::ll_read_block(Inode *in, uint64_t blockid,
14366 char *buf,
14367 uint64_t offset,
14368 uint64_t length,
14369 file_layout_t* layout)
14370 {
14371 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14372 if (!mref_reader.is_state_satisfied())
14373 return -CEPHFS_ENOTCONN;
14374
14375 vinodeno_t vino = _get_vino(in);
14376 object_t oid = file_object_t(vino.ino, blockid);
14377 C_SaferCond onfinish;
14378 bufferlist bl;
14379
14380 objecter->read(oid,
14381 object_locator_t(layout->pool_id),
14382 offset,
14383 length,
14384 vino.snapid,
14385 &bl,
14386 CEPH_OSD_FLAG_READ,
14387 &onfinish);
14388
14389 int r = onfinish.wait();
14390 if (r >= 0) {
14391 bl.begin().copy(bl.length(), buf);
14392 r = bl.length();
14393 }
14394
14395 return r;
14396 }
14397
14398 /* It appears that the OSD doesn't return success unless the entire
14399 buffer was written, so return the write length on success. */
14400
14401 int Client::ll_write_block(Inode *in, uint64_t blockid,
14402 char* buf, uint64_t offset,
14403 uint64_t length, file_layout_t* layout,
14404 uint64_t snapseq, uint32_t sync)
14405 {
14406 vinodeno_t vino = ll_get_vino(in);
14407 int r = 0;
14408 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14409
14410 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14411 if (!mref_reader.is_state_satisfied())
14412 return -CEPHFS_ENOTCONN;
14413
14414 if (length == 0) {
14415 return -CEPHFS_EINVAL;
14416 }
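/* Writes are currently always treated as stable: the `true ||` below
 * short-circuits the sync flag, so every call allocates the completion
 * and waits for it in the epilogue. */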
14417 if (true || sync) {
14418 /* if write is stable, the epilogue is waiting on
14419 * flock */
14420 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
14421 }
14422 object_t oid = file_object_t(vino.ino, blockid);
14423 SnapContext fakesnap;
14424 ceph::bufferlist bl;
14425 if (length > 0) {
14426 bl.push_back(buffer::copy(buf, length));
14427 }
14428
14429 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14430 << dendl;
14431
14432 fakesnap.seq = snapseq;
14433
14434 /* lock just in time */
14435 objecter->write(oid,
14436 object_locator_t(layout->pool_id),
14437 offset,
14438 length,
14439 fakesnap,
14440 bl,
14441 ceph::real_clock::now(),
14442 0,
14443 onsafe.get());
14444
14445 if (nullptr != onsafe) {
14446 r = onsafe->wait();
14447 }
14448
14449 if (r < 0) {
14450 return r;
14451 } else {
14452 return length;
14453 }
14454 }
14455
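// ll_commit_blocks() is effectively a no-op at present: the barrier
// bookkeeping below is commented out and the call unconditionally
// returns 0.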
14456 int Client::ll_commit_blocks(Inode *in,
14457 uint64_t offset,
14458 uint64_t length)
14459 {
14460 /*
14461 BarrierContext *bctx;
14462 vinodeno_t vino = _get_vino(in);
14463 uint64_t ino = vino.ino;
14464
14465 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14466 << offset << " to " << length << dendl;
14467
14468 if (length == 0) {
14469 return -CEPHFS_EINVAL;
14470 }
14471
14472 std::scoped_lock lock(client_lock);
14473 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14474 if (p != barriers.end()) {
14475 barrier_interval civ(offset, offset + length);
14476 p->second->commit_barrier(civ);
14477 }
14478 */
14479 return 0;
14480 }
14481
14482 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14483 {
14484 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14485 "~" << len << dendl;
14486 tout(cct) << "ll_write" << std::endl;
14487 tout(cct) << (uintptr_t)fh << std::endl;
14488 tout(cct) << off << std::endl;
14489 tout(cct) << len << std::endl;
14490
14491 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14492 if (!mref_reader.is_state_satisfied())
14493 return -CEPHFS_ENOTCONN;
14494
14495 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14496 len = std::min(len, (loff_t)INT_MAX);
14497 std::scoped_lock lock(client_lock);
14498
14499 int r = _write(fh, off, len, data, NULL, 0);
14500 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14501 << dendl;
14502 return r;
14503 }
14504
14505 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14506 {
14507 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14508 if (!mref_reader.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN;
14510
14511 std::scoped_lock cl(client_lock);
14512 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
14513 }
14514
14515 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14516 {
14517 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14518 if (!mref_reader.is_state_satisfied())
14519 return -CEPHFS_ENOTCONN;
14520
14521 std::scoped_lock cl(client_lock);
14522 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
14523 }
14524
14525 int Client::ll_flush(Fh *fh)
14526 {
14527 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14528 if (!mref_reader.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN;
14530
14531 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14532 tout(cct) << "ll_flush" << std::endl;
14533 tout(cct) << (uintptr_t)fh << std::endl;
14534
14535 std::scoped_lock lock(client_lock);
14536 return _flush(fh);
14537 }
14538
14539 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14540 {
14541 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14542 if (!mref_reader.is_state_satisfied())
14543 return -CEPHFS_ENOTCONN;
14544
14545 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14546 tout(cct) << "ll_fsync" << std::endl;
14547 tout(cct) << (uintptr_t)fh << std::endl;
14548
14549 std::scoped_lock lock(client_lock);
14550 int r = _fsync(fh, syncdataonly);
14551 if (r) {
14552 // If we're returning an error, clear it from the FH
14553 fh->take_async_err();
14554 }
14555 return r;
14556 }
14557
14558 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14559 {
14560 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14561 if (!mref_reader.is_state_satisfied())
14562 return -CEPHFS_ENOTCONN;
14563
14564 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14565 tout(cct) << "ll_sync_inode" << std::endl;
14566 tout(cct) << (uintptr_t)in << std::endl;
14567
14568 std::scoped_lock lock(client_lock);
14569 return _fsync(in, syncdataonly);
14570 }
14571
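// Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are supported, and
// punching a hole additionally requires KEEP_SIZE. A plain allocation
// past EOF only extends in->size (plus quota/max-size accounting);
// nothing is preallocated on the OSDs.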
14572 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14573 {
14574 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14575
14576 if (offset < 0 || length <= 0)
14577 return -CEPHFS_EINVAL;
14578
14579 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14580 return -CEPHFS_EOPNOTSUPP;
14581
14582 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14583 return -CEPHFS_EOPNOTSUPP;
14584
14585 Inode *in = fh->inode.get();
14586
14587 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14588 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14589 return -CEPHFS_ENOSPC;
14590 }
14591
14592 if (in->snapid != CEPH_NOSNAP)
14593 return -CEPHFS_EROFS;
14594
14595 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14596 return -CEPHFS_EBADF;
14597
14598 uint64_t size = offset + length;
14599 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14600 size > in->size &&
14601 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14602 return -CEPHFS_EDQUOT;
14603 }
14604
14605 int have;
14606 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14607 if (r < 0)
14608 return r;
14609
14610 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14611 if (mode & FALLOC_FL_PUNCH_HOLE) {
14612 if (in->inline_version < CEPH_INLINE_NONE &&
14613 (have & CEPH_CAP_FILE_BUFFER)) {
14614 bufferlist bl;
14615 auto inline_iter = in->inline_data.cbegin();
14616 int len = in->inline_data.length();
14617 if (offset < len) {
14618 if (offset > 0)
14619 inline_iter.copy(offset, bl);
14620 int size = length;
14621 if (offset + size > len)
14622 size = len - offset;
14623 if (size > 0)
14624 bl.append_zero(size);
14625 if (offset + size < len) {
14626 inline_iter += size;
14627 inline_iter.copy(len - offset - size, bl);
14628 }
14629 in->inline_data = bl;
14630 in->inline_version++;
14631 }
14632 in->mtime = in->ctime = ceph_clock_now();
14633 in->change_attr++;
14634 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14635 } else {
14636 if (in->inline_version < CEPH_INLINE_NONE) {
14637 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14638 uninline_data(in, onuninline.get());
14639 }
14640
14641 C_SaferCond onfinish("Client::_punch_hole flock");
14642
14643 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14644
14645 _invalidate_inode_cache(in, offset, length);
14646 filer->zero(in->ino, &in->layout,
14647 in->snaprealm->get_snap_context(),
14648 offset, length,
14649 ceph::real_clock::now(),
14650 0, true, &onfinish);
14651 in->mtime = in->ctime = ceph_clock_now();
14652 in->change_attr++;
14653 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14654
14655 client_lock.unlock();
14656 onfinish.wait();
14657 client_lock.lock();
14658 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14659 }
14660 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14661 uint64_t size = offset + length;
14662 if (size > in->size) {
14663 in->size = size;
14664 in->mtime = in->ctime = ceph_clock_now();
14665 in->change_attr++;
14666 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14667
14668 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14669 check_caps(in, CHECK_CAPS_NODELAY);
14670 } else if (is_max_size_approaching(in)) {
14671 check_caps(in, 0);
14672 }
14673 }
14674 }
14675
14676 if (nullptr != onuninline) {
14677 client_lock.unlock();
14678 int ret = onuninline->wait();
14679 client_lock.lock();
14680
14681 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14682 in->inline_data.clear();
14683 in->inline_version = CEPH_INLINE_NONE;
14684 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14685 check_caps(in, 0);
14686 } else
14687 r = ret;
14688 }
14689
14690 put_cap_ref(in, CEPH_CAP_FILE_WR);
14691 return r;
14692 }
14693
14694 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14695 {
14696 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14697 if (!mref_reader.is_state_satisfied())
14698 return -CEPHFS_ENOTCONN;
14699
14700 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << dendl;
14701 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14702 tout(cct) << (uintptr_t)fh << std::endl;
14703
14704 std::scoped_lock lock(client_lock);
14705 return _fallocate(fh, mode, offset, length);
14706 }
14707
14708 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14709 {
14710 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14711 if (!mref_reader.is_state_satisfied())
14712 return -CEPHFS_ENOTCONN;
14713
14714 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14715
14716 std::scoped_lock lock(client_lock);
14717 Fh *fh = get_filehandle(fd);
14718 if (!fh)
14719 return -CEPHFS_EBADF;
14720 #if defined(__linux__) && defined(O_PATH)
14721 if (fh->flags & O_PATH)
14722 return -CEPHFS_EBADF;
14723 #endif
14724 return _fallocate(fh, mode, offset, length);
14725 }
14726
14727 int Client::ll_release(Fh *fh)
14728 {
14729 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14730 if (!mref_reader.is_state_satisfied())
14731 return -CEPHFS_ENOTCONN;
14732
14733 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << dendl;
14735 tout(cct) << __func__ << " (fh)" << std::endl;
14736 tout(cct) << (uintptr_t)fh << std::endl;
14737
14738 std::scoped_lock lock(client_lock);
14739
14740 if (ll_unclosed_fh_set.count(fh))
14741 ll_unclosed_fh_set.erase(fh);
14742 return _release_fh(fh);
14743 }
14744
14745 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14746 {
14747 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14748 if (!mref_reader.is_state_satisfied())
14749 return -CEPHFS_ENOTCONN;
14750
14751 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
14752 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
14753
14754 std::scoped_lock lock(client_lock);
14755 return _getlk(fh, fl, owner);
14756 }
14757
14758 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14759 {
14760 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14761 if (!mref_reader.is_state_satisfied())
14762 return -CEPHFS_ENOTCONN;
14763
14764 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14765 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14766
14767 std::scoped_lock lock(client_lock);
14768 return _setlk(fh, fl, owner, sleep);
14769 }
14770
14771 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14772 {
14773 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14774 if (!mref_reader.is_state_satisfied())
14775 return -CEPHFS_ENOTCONN;
14776
14777 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14778 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14779
14780 std::scoped_lock lock(client_lock);
14781 return _flock(fh, cmd, owner);
14782 }
14783
14784 int Client::set_deleg_timeout(uint32_t timeout)
14785 {
14786 std::scoped_lock lock(client_lock);
14787
14788 /*
14789 * The whole point is to prevent blocklisting so we must time out the
14790 * delegation before the session autoclose timeout kicks in.
14791 */
14792 if (timeout >= mdsmap->get_session_autoclose())
14793 return -CEPHFS_EINVAL;
14794
14795 deleg_timeout = timeout;
14796 return 0;
14797 }
14798
14799 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14800 {
14801 int ret = -CEPHFS_EINVAL;
14802
14803 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14804 if (!mref_reader.is_state_satisfied())
14805 return -CEPHFS_ENOTCONN;
14806
14807 std::scoped_lock lock(client_lock);
14808
14809 Inode *inode = fh->inode.get();
14810
14811 switch(cmd) {
14812 case CEPH_DELEGATION_NONE:
14813 inode->unset_deleg(fh);
14814 ret = 0;
14815 break;
14816 default:
14817 try {
14818 ret = inode->set_deleg(fh, cmd, cb, priv);
14819 } catch (std::bad_alloc&) {
14820 ret = -CEPHFS_ENOMEM;
14821 }
14822 break;
14823 }
14824 return ret;
14825 }
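/*
 * Hedged usage sketch (illustrative only): acquiring and returning a
 * read delegation via the libcephfs ll wrapper; assumes
 * ceph_ll_delegation() and the CEPH_DELEGATION_* constants from
 * include/cephfs/ceph_ll_client.h, plus caller-provided cmount, fh and
 * app_state.
 *
 *   static void recall_cb(Fh *fh, void *priv)
 *   {
 *     // The MDS wants the delegation back; the application must finish
 *     // its work and drop the delegation before deleg_timeout expires.
 *   }
 *
 *   int r = ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_RD,
 *                              recall_cb, app_state);
 *   ...
 *   r = ceph_ll_delegation(cmount, fh, CEPH_DELEGATION_NONE, NULL, NULL);
 */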
14826
14827 class C_Client_RequestInterrupt : public Context {
14828 private:
14829 Client *client;
14830 MetaRequest *req;
14831 public:
14832 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14833 req->get();
14834 }
14835 void finish(int r) override {
14836 std::scoped_lock l(client->client_lock);
14837 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14838 client->_interrupt_filelock(req);
14839 client->put_request(req);
14840 }
14841 };
14842
14843 void Client::ll_interrupt(void *d)
14844 {
14845 MetaRequest *req = static_cast<MetaRequest*>(d);
14846 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14847 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
14848 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14849 }
14850
14851 // =========================================
14852 // layout
14853
14854 // expose file layouts
14855
14856 int Client::describe_layout(const char *relpath, file_layout_t *lp,
14857 const UserPerm& perms)
14858 {
14859 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14860 if (!mref_reader.is_state_satisfied())
14861 return -CEPHFS_ENOTCONN;
14862
14863 std::scoped_lock lock(client_lock);
14864
14865 filepath path(relpath);
14866 InodeRef in;
14867 int r = path_walk(path, &in, perms);
14868 if (r < 0)
14869 return r;
14870
14871 *lp = in->layout;
14872
14873 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
14874 return 0;
14875 }
14876
14877 int Client::fdescribe_layout(int fd, file_layout_t *lp)
14878 {
14879 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14880 if (!mref_reader.is_state_satisfied())
14881 return -CEPHFS_ENOTCONN;
14882
14883 std::scoped_lock lock(client_lock);
14884
14885 Fh *f = get_filehandle(fd);
14886 if (!f)
14887 return -CEPHFS_EBADF;
14888 Inode *in = f->inode.get();
14889
14890 *lp = in->layout;
14891
14892 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
14893 return 0;
14894 }
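/*
 * Hedged usage sketch (illustrative only): the libcephfs wrapper that
 * lands in fdescribe_layout() above, unpacking the file_layout_t into
 * its four classic fields (cmount and fd assumed).
 *
 *   int su, sc, os, pool;
 *   int r = ceph_get_file_layout(cmount, fd, &su, &sc, &os, &pool);
 *   if (r == 0)
 *     printf("stripe_unit=%d stripe_count=%d object_size=%d pool=%d\n",
 *            su, sc, os, pool);
 */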
14895
14896 int64_t Client::get_default_pool_id()
14897 {
14898 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14899 if (!mref_reader.is_state_satisfied())
14900 return -CEPHFS_ENOTCONN;
14901
14902 std::scoped_lock lock(client_lock);
14903
14904 /* first data pool is the default */
14905 return mdsmap->get_first_data_pool();
14906 }
14907
14908 // expose osdmap
14909
14910 int64_t Client::get_pool_id(const char *pool_name)
14911 {
14912 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14913 if (!mref_reader.is_state_satisfied())
14914 return -CEPHFS_ENOTCONN;
14915
14916 std::scoped_lock lock(client_lock);
14917
14918 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14919 pool_name);
14920 }
14921
14922 string Client::get_pool_name(int64_t pool)
14923 {
14924 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14925 if (!mref_reader.is_state_satisfied())
14926 return string();
14927
14928 std::scoped_lock lock(client_lock);
14929
14930 return objecter->with_osdmap([pool](const OSDMap& o) {
14931 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14932 });
14933 }
14934
14935 int Client::get_pool_replication(int64_t pool)
14936 {
14937 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14938 if (!mref_reader.is_state_satisfied())
14939 return -CEPHFS_ENOTCONN;
14940
14941 std::scoped_lock lock(client_lock);
14942
14943 return objecter->with_osdmap([pool](const OSDMap& o) {
14944 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
14945 });
14946 }
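/*
 * Hedged usage sketch (illustrative only): resolving a pool name and
 * querying its replication through the libcephfs wrappers over the two
 * methods above; the helper name is an assumption.
 *
 *   int pool_size(struct ceph_mount_info *cmount, const char *name)
 *   {
 *     int64_t id = ceph_get_pool_id(cmount, name);   // negative errno if unknown
 *     if (id < 0)
 *       return (int)id;
 *     return ceph_get_pool_replication(cmount, id);  // e.g. 3 for a 3x pool
 *   }
 */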
14947
14948 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14949 {
14950 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14951 if (!mref_reader.is_state_satisfied())
14952 return -CEPHFS_ENOTCONN;
14953
14954 std::scoped_lock lock(client_lock);
14955
14956 Fh *f = get_filehandle(fd);
14957 if (!f)
14958 return -CEPHFS_EBADF;
14959 Inode *in = f->inode.get();
14960
14961 vector<ObjectExtent> extents;
14962 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
14963 ceph_assert(extents.size() == 1);
14964
14965 objecter->with_osdmap([&](const OSDMap& o) {
14966 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14967 o.pg_to_acting_osds(pg, osds);
14968 });
14969
14970 if (osds.empty())
14971 return -CEPHFS_EINVAL;
14972
14973 /*
14974 * Return the remainder of the extent (stripe unit)
14975 *
14976 * If length = 1 is passed to Striper::file_to_extents we get a single
14977 * extent back, but its length is one so we still need to compute the length
14978 * to the end of the stripe unit.
14979 *
14980 * If length = su then we may get 1 or 2 objects back in the extents vector
14981 * which would have to be examined. Even then, the offsets are local to the
14982 * object, so matching up to the file offset is extra work.
14983 *
14984 * It seems simpler to stick with length = 1 and manually compute the
14985 * remainder.
14986 */
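/*
 * Worked example (hypothetical layout): with stripe_unit = 4 MiB and
 * off = 5 MiB, off % su = 1 MiB, so *len = 3 MiB -- the bytes left in
 * the current stripe unit starting at off, all served by the OSDs
 * returned above.
 */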
14987 if (len) {
14988 uint64_t su = in->layout.stripe_unit;
14989 *len = su - (off % su);
14990 }
14991
14992 return 0;
14993 }
14994
14995 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14996 {
14997 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14998 if (!mref_reader.is_state_satisfied())
14999 return -CEPHFS_ENOTCONN;
15000
15001 std::scoped_lock lock(client_lock);
15002
15003 if (id < 0)
15004 return -CEPHFS_EINVAL;
15005 return objecter->with_osdmap([&](const OSDMap& o) {
15006 return o.crush->get_full_location_ordered(id, path);
15007 });
15008 }
15009
15010 int Client::get_file_stripe_address(int fd, loff_t offset,
15011 vector<entity_addr_t>& address)
15012 {
15013 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15014 if (!mref_reader.is_state_satisfied())
15015 return -CEPHFS_ENOTCONN;
15016
15017 std::scoped_lock lock(client_lock);
15018
15019 Fh *f = get_filehandle(fd);
15020 if (!f)
15021 return -CEPHFS_EBADF;
15022 Inode *in = f->inode.get();
15023
15024 // which object?
15025 vector<ObjectExtent> extents;
15026 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15027 in->truncate_size, extents);
15028 ceph_assert(extents.size() == 1);
15029
15030 // now we have the object and its 'layout'
15031 return objecter->with_osdmap([&](const OSDMap& o) {
15032 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15033 vector<int> osds;
15034 o.pg_to_acting_osds(pg, osds);
15035 if (osds.empty())
15036 return -CEPHFS_EINVAL;
15037 for (unsigned i = 0; i < osds.size(); i++) {
15038 entity_addr_t addr = o.get_addrs(osds[i]).front();
15039 address.push_back(addr);
15040 }
15041 return 0;
15042 });
15043 }
15044
15045 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15046 {
15047 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15048 if (!mref_reader.is_state_satisfied())
15049 return -CEPHFS_ENOTCONN;
15050
15051 std::scoped_lock lock(client_lock);
15052
15053 return objecter->with_osdmap([&](const OSDMap& o) {
15054 if (!o.exists(osd))
15055 return -CEPHFS_ENOENT;
15056
15057 addr = o.get_addrs(osd).front();
15058 return 0;
15059 });
15060 }
15061
15062 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15063 loff_t length, loff_t offset)
15064 {
15065 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15066 if (!mref_reader.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN;
15068
15069 std::scoped_lock lock(client_lock);
15070
15071 Fh *f = get_filehandle(fd);
15072 if (!f)
15073 return -CEPHFS_EBADF;
15074 Inode *in = f->inode.get();
15075
15076 // map to a list of extents
15077 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15078
15079 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15080 return 0;
15081 }
15082
15083
15084 /* find an OSD with the same IP address; -CEPHFS_ENXIO if none */
15085 int Client::get_local_osd()
15086 {
15087 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15088 if (!mref_reader.is_state_satisfied())
15089 return -CEPHFS_ENOTCONN;
15090
15091 std::scoped_lock lock(client_lock);
15092
15093 objecter->with_osdmap([this](const OSDMap& o) {
15094 if (o.get_epoch() != local_osd_epoch) {
15095 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
15096 local_osd_epoch = o.get_epoch();
15097 }
15098 });
15099 return local_osd;
15100 }
15101
15102
15103
15104
15105
15106
15107 // ===============================
15108
15109 void Client::ms_handle_connect(Connection *con)
15110 {
15111 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
15112 }
15113
15114 bool Client::ms_handle_reset(Connection *con)
15115 {
15116 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15117 return false;
15118 }
15119
15120 void Client::ms_handle_remote_reset(Connection *con)
15121 {
15122 std::scoped_lock lock(client_lock);
15123 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15124 switch (con->get_peer_type()) {
15125 case CEPH_ENTITY_TYPE_MDS:
15126 {
15127 // kludge to figure out which mds this is; fixme with a Connection* state
15128 mds_rank_t mds = MDS_RANK_NONE;
15129 MetaSessionRef s = NULL;
15130 for (auto &p : mds_sessions) {
15131 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
15132 mds = p.first;
15133 s = p.second;
15134 }
15135 }
15136 if (mds >= 0) {
15137 ceph_assert(s != NULL);
15138 switch (s->state) {
15139 case MetaSession::STATE_CLOSING:
15140 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
15141 _closed_mds_session(s.get());
15142 break;
15143
15144 case MetaSession::STATE_OPENING:
15145 {
15146 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15147 list<Context*> waiters;
15148 waiters.swap(s->waiting_for_open);
15149 _closed_mds_session(s.get());
15150 auto news = _get_or_open_mds_session(mds);
15151 news->waiting_for_open.swap(waiters);
15152 }
15153 break;
15154
15155 case MetaSession::STATE_OPEN:
15156 {
15157 objecter->maybe_request_map(); /* to check if we are blocklisted */
15158 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
15159 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
15160 _closed_mds_session(s.get());
15161 } else {
15162 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15163 s->state = MetaSession::STATE_STALE;
15164 }
15165 }
15166 break;
15167
15168 case MetaSession::STATE_NEW:
15169 case MetaSession::STATE_CLOSED:
15170 default:
15171 break;
15172 }
15173 }
15174 }
15175 break;
15176 }
15177 }
15178
15179 bool Client::ms_handle_refused(Connection *con)
15180 {
15181 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
15182 return false;
15183 }
15184
15185 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15186 {
15187 Inode *quota_in = root_ancestor;
15188 SnapRealm *realm = in->snaprealm;
15189 while (realm) {
15190 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15191 if (realm->ino != in->ino) {
15192 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15193 if (p == inode_map.end())
15194 break;
15195
15196 if (p->second->quota.is_enable()) {
15197 quota_in = p->second;
15198 break;
15199 }
15200 }
15201 realm = realm->pparent;
15202 }
15203 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15204 return quota_in;
15205 }
15206
15207 /**
15208 * Traverse the quota ancestors of the Inode; return true
15209 * if any of them satisfies the given predicate
15210 */
15211 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15212 std::function<bool (const Inode &in)> test)
15213 {
15214 while (true) {
15215 ceph_assert(in != NULL);
15216 if (test(*in)) {
15217 return true;
15218 }
15219
15220 if (in == root_ancestor) {
15221 // We're done traversing, drop out
15222 return false;
15223 } else {
15224 // Continue up the tree
15225 in = get_quota_root(in, perms);
15226 }
15227 }
15228
15229 return false;
15230 }
15231
15232 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15233 {
15234 return check_quota_condition(in, perms,
15235 [](const Inode &in) {
15236 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15237 });
15238 }
15239
15240 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15241 const UserPerm& perms)
15242 {
15243 return check_quota_condition(in, perms,
15244 [&new_bytes](const Inode &in) {
15245 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15246 > in.quota.max_bytes;
15247 });
15248 }
15249
15250 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15251 {
15252 ceph_assert(in->size >= in->reported_size);
15253 const uint64_t size = in->size - in->reported_size;
15254 return check_quota_condition(in, perms,
15255 [&size](const Inode &in) {
15256 if (in.quota.max_bytes) {
15257 if (in.rstat.rbytes >= in.quota.max_bytes) {
15258 return true;
15259 }
15260
15261 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15262 return (space >> 4) < size;
15263 } else {
15264 return false;
15265 }
15266 });
15267 }
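/*
 * Note on the heuristic above: (space >> 4) < size triggers once the
 * unreported growth exceeds 1/16 of the remaining quota headroom.
 * Example: max_bytes = 16 GiB with rbytes = 15 GiB leaves 1 GiB of
 * space, so more than 64 MiB of size not yet reported to the MDS makes
 * this return true.
 */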
15268
15269 enum {
15270 POOL_CHECKED = 1,
15271 POOL_CHECKING = 2,
15272 POOL_READ = 4,
15273 POOL_WRITE = 8,
15274 };
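/*
 * Example cache values: a pool with full access is remembered as
 * POOL_CHECKED | POOL_READ | POOL_WRITE (0xd); a read-only pool as
 * POOL_CHECKED | POOL_READ (0x5).
 */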
15275
15276 int Client::check_pool_perm(Inode *in, int need)
15277 {
15278 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15279
15280 if (!cct->_conf->client_check_pool_perm)
15281 return 0;
15282
15283 /* Only need to do this for regular files */
15284 if (!in->is_file())
15285 return 0;
15286
15287 int64_t pool_id = in->layout.pool_id;
15288 std::string pool_ns = in->layout.pool_ns;
15289 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
15290 int have = 0;
15291 while (true) {
15292 auto it = pool_perms.find(perm_key);
15293 if (it == pool_perms.end())
15294 break;
15295 if (it->second == POOL_CHECKING) {
15296 // avoid a concurrent check of the same pool
15297 wait_on_list(waiting_for_pool_perm);
15298 } else {
15299 have = it->second;
15300 ceph_assert(have & POOL_CHECKED);
15301 break;
15302 }
15303 }
15304
15305 if (!have) {
15306 if (in->snapid != CEPH_NOSNAP) {
15307 // The pool permission check needs to write to the first object. But for
15308 // a snapshot, the head of the first object may already have been deleted.
15309 // To avoid creating an orphan object, skip the check for now.
15310 return 0;
15311 }
15312
15313 pool_perms[perm_key] = POOL_CHECKING;
15314
15315 char oid_buf[32];
15316 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
15317 object_t oid = oid_buf;
15318
15319 SnapContext nullsnapc;
15320
15321 C_SaferCond rd_cond;
15322 ObjectOperation rd_op;
15323 rd_op.stat(nullptr, nullptr, nullptr);
15324
15325 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
15326 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
15327
15328 C_SaferCond wr_cond;
15329 ObjectOperation wr_op;
15330 wr_op.create(true);
15331
15332 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
15333 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
15334
15335 client_lock.unlock();
15336 int rd_ret = rd_cond.wait();
15337 int wr_ret = wr_cond.wait();
15338 client_lock.lock();
15339
15340 bool errored = false;
15341
15342 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
15343 have |= POOL_READ;
15344 else if (rd_ret != -CEPHFS_EPERM) {
15345 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15346 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15347 errored = true;
15348 }
15349
15350 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
15351 have |= POOL_WRITE;
15352 else if (wr_ret != -CEPHFS_EPERM) {
15353 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15354 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15355 errored = true;
15356 }
15357
15358 if (errored) {
15359 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15360 // Raise EIO because actual error code might be misleading for
15361 // userspace filesystem user.
15362 pool_perms.erase(perm_key);
15363 signal_cond_list(waiting_for_pool_perm);
15364 return -CEPHFS_EIO;
15365 }
15366
15367 pool_perms[perm_key] = have | POOL_CHECKED;
15368 signal_cond_list(waiting_for_pool_perm);
15369 }
15370
15371 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15372 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15373 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15374 return -CEPHFS_EPERM;
15375 }
15376 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15377 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15378 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15379 return -CEPHFS_EPERM;
15380 }
15381
15382 return 0;
15383 }
15384
15385 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15386 {
15387 if (acl_type == POSIX_ACL) {
15388 if (in->xattrs.count(ACL_EA_ACCESS)) {
15389 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15390
15391 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15392 }
15393 }
15394 return -CEPHFS_EAGAIN;
15395 }
15396
15397 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15398 {
15399 if (acl_type == NO_ACL)
15400 return 0;
15401
15402 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15403 if (r < 0)
15404 goto out;
15405
15406 if (acl_type == POSIX_ACL) {
15407 if (in->xattrs.count(ACL_EA_ACCESS)) {
15408 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15409 bufferptr acl(access_acl.c_str(), access_acl.length());
15410 r = posix_acl_access_chmod(acl, mode);
15411 if (r < 0)
15412 goto out;
15413 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15414 } else {
15415 r = 0;
15416 }
15417 }
15418 out:
15419 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15420 return r;
15421 }
15422
15423 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15424 const UserPerm& perms)
15425 {
15426 if (acl_type == NO_ACL)
15427 return 0;
15428
15429 if (S_ISLNK(*mode))
15430 return 0;
15431
15432 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15433 if (r < 0)
15434 goto out;
15435
15436 if (acl_type == POSIX_ACL) {
15437 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15438 map<string, bufferptr> xattrs;
15439
15440 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15441 bufferptr acl(default_acl.c_str(), default_acl.length());
15442 r = posix_acl_inherit_mode(acl, mode);
15443 if (r < 0)
15444 goto out;
15445
15446 if (r > 0) {
15447 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15448 if (r < 0)
15449 goto out;
15450 if (r > 0)
15451 xattrs[ACL_EA_ACCESS] = acl;
15452 }
15453
15454 if (S_ISDIR(*mode))
15455 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15456
15457 r = xattrs.size();
15458 if (r > 0)
15459 encode(xattrs, xattrs_bl);
15460 } else {
15461 if (umask_cb)
15462 *mode &= ~umask_cb(callback_handle);
15463 r = 0;
15464 }
15465 }
15466 out:
15467 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15468 return r;
15469 }
15470
15471 void Client::set_filer_flags(int flags)
15472 {
15473 std::scoped_lock l(client_lock);
15474 ceph_assert(flags == 0 ||
15475 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15476 objecter->add_global_op_flags(flags);
15477 }
15478
15479 void Client::clear_filer_flags(int flags)
15480 {
15481 std::scoped_lock l(client_lock);
15482 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15483 objecter->clear_global_op_flag(flags);
15484 }
15485
15486 // called before mount
15487 void Client::set_uuid(const std::string& uuid)
15488 {
15489 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15490 ceph_assert(iref_reader.is_state_satisfied());
15491
15492 std::scoped_lock l(client_lock);
15493 ceph_assert(!uuid.empty());
15494
15495 metadata["uuid"] = uuid;
15496 _close_sessions();
15497 }
15498
15499 // called before mount. 0 means infinite
15500 void Client::set_session_timeout(unsigned timeout)
15501 {
15502 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15503 ceph_assert(iref_reader.is_state_satisfied());
15504
15505 std::scoped_lock l(client_lock);
15506
15507 metadata["timeout"] = stringify(timeout);
15508 }
15509
15510 // called before mount
15511 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15512 const std::string& fs_name)
15513 {
15514 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15515 if (!iref_reader.is_state_satisfied())
15516 return -CEPHFS_ENOTCONN;
15517
15518 if (uuid.empty())
15519 return -CEPHFS_EINVAL;
15520
15521 std::unique_lock l(client_lock);
15522 {
15523 auto it = metadata.find("uuid");
15524 if (it != metadata.end() && it->second == uuid)
15525 return -CEPHFS_EINVAL;
15526 }
15527
15528 int r = subscribe_mdsmap(fs_name);
15529 if (r < 0) {
15530 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15531 return r;
15532 }
15533
15534 if (metadata.empty())
15535 populate_metadata("");
15536
15537 while (mdsmap->get_epoch() == 0)
15538 wait_on_list(waiting_for_mdsmap);
15539
15540 reclaim_errno = 0;
15541 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15542 if (!mdsmap->is_up(mds)) {
15543 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15544 wait_on_list(waiting_for_mdsmap);
15545 continue;
15546 }
15547
15548 MetaSessionRef session;
15549 if (!have_open_session(mds)) {
15550 session = _get_or_open_mds_session(mds);
15551 if (session->state == MetaSession::STATE_REJECTED)
15552 return -CEPHFS_EPERM;
15553 if (session->state != MetaSession::STATE_OPENING) {
15554 // umounting?
15555 return -CEPHFS_EINVAL;
15556 }
15557 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15558 wait_on_context_list(session->waiting_for_open);
15559 continue;
15560 }
15561
15562 session = mds_sessions.at(mds);
15563 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15564 return -CEPHFS_EOPNOTSUPP;
15565
15566 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15567 session->reclaim_state == MetaSession::RECLAIMING) {
15568 session->reclaim_state = MetaSession::RECLAIMING;
15569 auto m = make_message<MClientReclaim>(uuid, flags);
15570 session->con->send_message2(std::move(m));
15571 wait_on_list(waiting_for_reclaim);
15572 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15573 return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
15574 } else {
15575 mds++;
15576 }
15577 }
15578
15579 // didn't find target session in any mds
15580 if (reclaim_target_addrs.empty()) {
15581 if (flags & CEPH_RECLAIM_RESET)
15582 return -CEPHFS_ENOENT;
15583 return -CEPHFS_ENOTRECOVERABLE;
15584 }
15585
15586 if (flags & CEPH_RECLAIM_RESET)
15587 return 0;
15588
15589 // use blocklist to check if target session was killed
15590 // (config option mds_session_blocklist_on_evict needs to be true)
15591 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15592 bs::error_code ec;
15593 l.unlock();
15594 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15595 l.lock();
15596
15597 if (ec)
15598 return ceph::from_error_code(ec);
15599
15600 bool blocklisted = objecter->with_osdmap(
15601 [this](const OSDMap &osd_map) -> bool {
15602 return osd_map.is_blocklisted(reclaim_target_addrs);
15603 });
15604 if (blocklisted)
15605 return -CEPHFS_ENOTRECOVERABLE;
15606
15607 metadata["reclaiming_uuid"] = uuid;
15608 return 0;
15609 }
15610
15611 void Client::finish_reclaim()
15612 {
15613 auto it = metadata.find("reclaiming_uuid");
15614 if (it == metadata.end()) {
15615 for (auto &p : mds_sessions)
15616 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15617 return;
15618 }
15619
15620 for (auto &p : mds_sessions) {
15621 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15622 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15623 p.second->con->send_message2(std::move(m));
15624 }
15625
15626 metadata["uuid"] = it->second;
15627 metadata.erase(it);
15628 }
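/*
 * Hedged usage sketch (illustrative only): the reclaim sequence as an
 * NFS-Ganesha-style consumer would drive it through libcephfs, before
 * mounting. Assumes ceph_set_uuid(), ceph_start_reclaim() and
 * ceph_finish_reclaim(); note that start_reclaim() rejects reclaiming
 * the uuid currently set on this instance, so the new incarnation must
 * present a different uuid.
 *
 *   ceph_set_uuid(cmount, "node1-incarnation-2");
 *   int r = ceph_start_reclaim(cmount, "node1-incarnation-1",
 *                              CEPH_RECLAIM_RESET);
 *   if (r == 0 || r == -ENOENT)   // -ENOENT: no stale session to kill
 *     ceph_finish_reclaim(cmount);
 *   ceph_mount(cmount, "/");
 */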
15629
15630 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15631 {
15632 mds_rank_t from = mds_rank_t(reply->get_source().num());
15633 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15634
15635 std::scoped_lock cl(client_lock);
15636 auto session = _get_mds_session(from, reply->get_connection().get());
15637 if (!session) {
15638 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15639 return;
15640 }
15641
15642 if (reply->get_result() >= 0) {
15643 session->reclaim_state = MetaSession::RECLAIM_OK;
15644 if (reply->get_epoch() > reclaim_osd_epoch)
15645 reclaim_osd_epoch = reply->get_epoch();
15646 if (!reply->get_addrs().empty())
15647 reclaim_target_addrs = reply->get_addrs();
15648 } else {
15649 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15650 reclaim_errno = reply->get_result();
15651 }
15652
15653 signal_cond_list(waiting_for_reclaim);
15654 }
15655
15656 /**
15657 * This is included in cap release messages, to cause
15658 * the MDS to wait until this OSD map epoch. It is necessary
15659 * in corner cases where we cancel RADOS ops, so that
15660 * nobody else tries to do IO to the same objects in
15661 * the same epoch as the cancelled ops.
15662 */
15663 void Client::set_cap_epoch_barrier(epoch_t e)
15664 {
15665 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15666 cap_epoch_barrier = e;
15667 }
15668
15669 const char** Client::get_tracked_conf_keys() const
15670 {
15671 static const char* keys[] = {
15672 "client_cache_size",
15673 "client_cache_mid",
15674 "client_acl_type",
15675 "client_deleg_timeout",
15676 "client_deleg_break_on_open",
15677 "client_oc_size",
15678 "client_oc_max_objects",
15679 "client_oc_max_dirty",
15680 "client_oc_target_dirty",
15681 "client_oc_max_dirty_age",
15682 NULL
15683 };
15684 return keys;
15685 }
15686
15687 void Client::handle_conf_change(const ConfigProxy& conf,
15688 const std::set <std::string> &changed)
15689 {
15690 std::scoped_lock lock(client_lock);
15691
15692 if (changed.count("client_cache_mid")) {
15693 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15694 }
15695 if (changed.count("client_acl_type")) {
15696 acl_type = NO_ACL;
15697 if (cct->_conf->client_acl_type == "posix_acl")
15698 acl_type = POSIX_ACL;
15699 }
15700 if (changed.count("client_oc_size")) {
15701 objectcacher->set_max_size(cct->_conf->client_oc_size);
15702 }
15703 if (changed.count("client_oc_max_objects")) {
15704 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15705 }
15706 if (changed.count("client_oc_max_dirty")) {
15707 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15708 }
15709 if (changed.count("client_oc_target_dirty")) {
15710 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15711 }
15712 if (changed.count("client_oc_max_dirty_age")) {
15713 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15714 }
15715 }
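/*
 * Example: `ceph config set client client_oc_size 209715200` (or an
 * equivalent injectargs call) is delivered here with "client_oc_size"
 * in `changed`, resizing the object cacher to 200 MiB on a live client.
 */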
15716
15717 void intrusive_ptr_add_ref(Inode *in)
15718 {
15719 in->iget();
15720 }
15721
15722 void intrusive_ptr_release(Inode *in)
15723 {
15724 in->client->put_inode(in);
15725 }
15726
15727 mds_rank_t Client::_get_random_up_mds() const
15728 {
15729 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15730
15731 std::set<mds_rank_t> up;
15732 mdsmap->get_up_mds_set(up);
15733
15734 if (up.empty())
15735 return MDS_RANK_NONE;
15736 std::set<mds_rank_t>::const_iterator p = up.begin();
15737 for (int n = rand() % up.size(); n; n--)
15738 ++p;
15739 return *p;
15740 }
15741
15742
15743 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15744 boost::asio::io_context& ictx)
15745 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15746 {
15747 monclient->set_messenger(m);
15748 objecter->set_client_incarnation(0);
15749 }
15750
15751 StandaloneClient::~StandaloneClient()
15752 {
15753 delete objecter;
15754 objecter = nullptr;
15755 }
15756
15757 int StandaloneClient::init()
15758 {
15759 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15760 ceph_assert(iref_writer.is_first_writer());
15761
15762 _pre_init();
15763 objecter->init();
15764
15765 client_lock.lock();
15766
15767 messenger->add_dispatcher_tail(objecter);
15768 messenger->add_dispatcher_tail(this);
15769
15770 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15771 int r = monclient->init();
15772 if (r < 0) {
15773 // need to do cleanup because we're in an intermediate init state
15774 {
15775 std::scoped_lock l(timer_lock);
15776 timer.shutdown();
15777 }
15778
15779 client_lock.unlock();
15780 objecter->shutdown();
15781 objectcacher->stop();
15782 monclient->shutdown();
15783 return r;
15784 }
15785 objecter->start();
15786
15787 client_lock.unlock();
15788 _finish_init();
15789 iref_writer.update_state(CLIENT_INITIALIZED);
15790
15791 return 0;
15792 }
15793
15794 void StandaloneClient::shutdown()
15795 {
15796 Client::shutdown();
15797 objecter->shutdown();
15798 monclient->shutdown();
15799 }