// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use those flags in
// ceph-dokan. The current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

#ifndef S_IXUGO
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
#endif

using std::dec;
using std::hex;
using std::list;
using std::oct;
using std::pair;
using std::string;
using std::vector;

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}

bool Client::is_reserved_vino(vinodeno_t &vino) {
  if (MDS_IS_PRIVATE_INO(vino.ino)) {
    ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
    return true;
  }
  return false;
}

// running average and standard deviation -- presented in
// Donald Knuth's TAoCP, Volume II.
double calc_average(double old_avg, double value, uint64_t count) {
  double new_avg;
  if (count == 1) {
    new_avg = value;
  } else {
    new_avg = old_avg + ((value - old_avg) / count);
  }

  return new_avg;
}

double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  double new_sq_sum;
  if (count == 1) {
    new_sq_sum = 0.0;
  } else {
    new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean);
  }

  return new_sq_sum;
}
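
// A minimal usage sketch (illustrative only, not part of the build) of how
// the two helpers above combine into a running mean and sample stddev,
// assuming `samples` holds `nr` latency values:
//
//   double avg = 0.0, sq_sum = 0.0;
//   for (uint64_t n = 1; n <= nr; ++n) {
//     double new_avg = calc_average(avg, samples[n-1], n);
//     sq_sum = calc_sq_sum(sq_sum, avg, new_avg, samples[n-1], n);
//     avg = new_avg;
//   }
//   double stddev = nr > 1 ? sqrt(sq_sum / (nr - 1)) : 0.0;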

// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
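
// These handlers back the admin socket commands registered in
// Client::_finish_init(). A typical invocation from outside the process
// (illustrative; the socket path depends on local configuration):
//
//   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_sessions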


// -------------

int Client::get_fd_inode(int fd, InodeRef *in) {
  int r = 0;
  if (fd == CEPHFS_AT_FDCWD) {
    *in = cwd;
  } else {
    Fh *f = get_filehandle(fd);
    if (!f) {
      r = -CEPHFS_EBADF;
    } else {
      *in = f->inode;
    }
  }
  return r;
}

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
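
// Allocation sketch: faked inos 1024~2048 are reserved for
// _assign_faked_root(), so ordinary inodes are handed out monotonically
// from 2048 upwards; when the search runs off the end of the free set,
// last_used_faked_ino wraps back to 2048 and the scan resumes from there,
// reusing inos returned by _release_faked_ino().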

/*
 * In faked mode, if you export multiple subdirectories, the exported
 * subdirectories' root inodes can otherwise end up with the same faked
 * inode number. So we distinguish the mount points by reserving the
 * fake IDs between 1024~2048 and combining them with the last
 * 10 bits (0x3ff) of the root inodes.
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff = " << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
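
// Example (illustrative): on a fresh client the free range starts at 1024,
// so a root inode whose ino has low 10 bits 0x25a gets faked root
// 1024 + 0x25a = 1626 -- a stable value inside the reserved window that
// differs between mounts whose root inos differ in those bits.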

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or was aborted and didn't get a chance
  // to run the umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root.get();
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_nref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root.get(), did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    // average and standard deviation for mds/read/write latencies
    plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
    plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
    plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
    plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
    plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares (to calculate variability/stdev) for read requests");
    plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
    plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
    plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares (to calculate variability/stdev) for write requests");
    plb.add_u64(l_c_wr_ops, "wrops", "Total write IO operations");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before destructing the
    // Client, just in case the _mount() failed but didn't get a
    // chance to stop the tick thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * incoming "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}

void Client::update_io_stat_metadata(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_md_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_md_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_metadata_request);

  logger->tinc(l_c_lat, latency);
  logger->tinc(l_c_reply, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_md_avg, avg);
  logger->set(l_c_md_sqsum, n_sqsum);
  logger->set(l_c_md_ops, nr_metadata_request);
}

void Client::update_io_stat_read(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_rd_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_read_request);

  logger->tinc(l_c_read, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_rd_avg, avg);
  logger->set(l_c_rd_sqsum, n_sqsum);
  logger->set(l_c_rd_ops, nr_read_request);
}

void Client::update_io_stat_write(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_wr_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_write_request);

  logger->tinc(l_c_wrlat, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_wr_avg, avg);
  logger->set(l_c_wr_sqsum, n_sqsum);
  logger->set(l_c_wr_ops, nr_write_request);
}

// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }
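  // Note: trimming one dentry can unpin others (its parent directory, for
  // instance), which is why the loop above keeps expiring entries until the
  // LRU size stops changing rather than stopping at the first failure.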

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
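
// Ordering sketch (illustrative): the MDS bumps truncate_seq on every
// truncate, so a client caching (size=4096, seq=3) applies an update of
// (size=0, seq=4) and invalidates cached data past the new size, while a
// stale replay of (size=4096, seq=3) is ignored -- it is neither a newer
// seq nor a size growth at the same seq.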

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
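
// Rationale sketch: time_warp_seq exists because timestamps are not
// monotonic -- an explicit utimes() may legitimately set mtime backwards,
// and the MDS bumps the seq when that happens. Taking max(mtime) alone
// would silently undo such a change, so max() is only used when both
// sides agree on the seq.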

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}
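
// Lease math (illustrative): a reply stamped `from` carrying
// duration_ms=30000 yields lease_ttl = from + 30s; until then, lookups of
// this dentry may be answered from cache without consulting the MDS,
// provided lease_gen still matches the session's cap_gen.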


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated, only update from auth mds reply
  if (from == dst->auth) {
    in->dir_replicated = !dst->dist.empty();
    if (!dst->dist.empty())
      in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
    else
      in->frag_repmap.erase(dst->frag);
  }
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
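
// Offset encoding note: dir_result_t::make_fpos() packs the frag (or, in
// hash_order mode, the dentry name hash) into the high bits of the 64-bit
// readdir offset and the per-frag entry index into the low bits, which is
// why readdir_offset restarts at 2 whenever that high component changes.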

/** insert_trace
 *
 * insert a trace from an MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;
  int issued = 0;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (req->get_op() == CEPH_MDS_OP_GETATTR)
      issued = req->inode()->caps_issued();

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best(issued)) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best(issued)) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
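
// Selection order, in brief: an explicit resend_mds always wins; otherwise
// use the dirfrag map (a random replica, or the auth mds) when a usable
// name hash exists, then the inode's auth cap, then any cap-bearing
// session, and finally fall back to a random up MDS.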

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second->dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1860
1861
1862 /**
1863 * make a request
1864 *
1865 * Blocking helper to make an MDS request.
1866 *
1867 * If the ptarget flag is set, behavior changes slightly: the caller
1868 * expects to get a pointer to the inode we are creating or operating
1869 * on. As a result, we will follow up any traceless mutation reply
1870 * with a getattr or lookup to transparently handle a traceless reply
1871 * from the MDS (as when the MDS restarts and the client has to replay
1872 * a request).
1873 *
1874 * @param request the MetaRequest to execute
1875 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1876 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1877 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1878 * @param use_mds [optional] prefer a specific mds (-1 for default)
1879 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1880 */
1881 int Client::make_request(MetaRequest *request,
1882 const UserPerm& perms,
1883 InodeRef *ptarget, bool *pcreated,
1884 mds_rank_t use_mds,
1885 bufferlist *pdirbl)
1886 {
1887 int r = 0;
1888
1889 // assign a unique tid
1890 ceph_tid_t tid = ++last_tid;
1891 request->set_tid(tid);
1892
1893 // and timestamp
1894 request->op_stamp = ceph_clock_now();
1895 request->created = ceph::coarse_mono_clock::now();
1896
1897 // make note
1898 mds_requests[tid] = request->get();
1899 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1900 oldest_tid = tid;
1901
1902 request->set_caller_perms(perms);
1903
1904 if (cct->_conf->client_inject_fixed_oldest_tid) {
1905 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1906 request->set_oldest_client_tid(1);
1907 } else {
1908 request->set_oldest_client_tid(oldest_tid);
1909 }
1910
1911 // hack target mds?
1912 if (use_mds >= 0)
1913 request->resend_mds = use_mds;
1914
1915 MetaSessionRef session = NULL;
1916 while (1) {
1917 if (request->aborted())
1918 break;
1919
1920 if (blocklisted) {
1921 request->abort(-CEPHFS_EBLOCKLISTED);
1922 break;
1923 }
1924
1925 // set up wait cond
1926 ceph::condition_variable caller_cond;
1927 request->caller_cond = &caller_cond;
1928
1929 // choose mds
1930 Inode *hash_diri = NULL;
1931 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1932 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1933 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1934 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1935 if (hash_diri) {
1936 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1937 _fragmap_remove_stopped_mds(hash_diri, mds);
1938 } else {
1939 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1940 request->resend_mds = _get_random_up_mds();
1941 }
1942 } else {
1943 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1944 wait_on_list(waiting_for_mdsmap);
1945 }
1946 continue;
1947 }
1948
1949 // open a session?
1950 if (!have_open_session(mds)) {
1951 session = _get_or_open_mds_session(mds);
1952 if (session->state == MetaSession::STATE_REJECTED) {
1953 request->abort(-CEPHFS_EPERM);
1954 break;
1955 }
1956 // wait
1957 if (session->state == MetaSession::STATE_OPENING) {
1958 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1959 wait_on_context_list(session->waiting_for_open);
1960 continue;
1961 }
1962
1963 if (!have_open_session(mds))
1964 continue;
1965 } else {
1966 session = mds_sessions.at(mds);
1967 }
1968
1969 // send request.
1970 send_request(request, session.get());
1971
1972 // wait for signal
1973 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1974 request->kick = false;
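// client_lock is already held here; adopting it into a unique_lock lets
// the condition variable unlock/relock it while we wait, and release()
// below hands ownership back so the mutex stays held on exit.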
1975 std::unique_lock l{client_lock, std::adopt_lock};
1976 caller_cond.wait(l, [request] {
1977 return (request->reply || // reply
1978 request->resend_mds >= 0 || // forward
1979 request->kick);
1980 });
1981 l.release();
1982 request->caller_cond = nullptr;
1983
1984 // did we get a reply?
1985 if (request->reply)
1986 break;
1987 }
1988
1989 if (!request->reply) {
1990 ceph_assert(request->aborted());
1991 ceph_assert(!request->got_unsafe);
1992 r = request->get_abort_code();
1993 request->item.remove_myself();
1994 unregister_request(request);
1995 put_request(request);
1996 return r;
1997 }
1998
1999 // got it!
2000 auto reply = std::move(request->reply);
2001 r = reply->get_result();
2002 if (r >= 0)
2003 request->success = true;
2004
2005 // kick dispatcher (we've got it!)
2006 ceph_assert(request->dispatch_cond);
2007 request->dispatch_cond->notify_all();
2008 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
2009 request->dispatch_cond = 0;
2010
2011 if (r >= 0 && ptarget)
2012 r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);
2013
2014 if (pdirbl)
2015 *pdirbl = reply->get_extra_bl();
2016
2017 // -- log times --
2018 utime_t lat = ceph_clock_now();
2019 lat -= request->sent_stamp;
2020 ldout(cct, 20) << "lat " << lat << dendl;
2021
2022 ++nr_metadata_request;
2023 update_io_stat_metadata(lat);
2024
2025 put_request(request);
2026 return r;
2027 }
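// Illustrative caller sketch (not part of the build; the path variable and
// drop masks are hypothetical): a create-style call would look roughly like
//
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
//   req->set_filepath(path);           // plus inode/dentry and drop masks
//   InodeRef target;
//   bool created = false;
//   int r = make_request(req, perms, &target, &created);
//
// On success, 'target' points at the inode we created or operated on, and
// 'created' records whether the create actually made a new file.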
2028
2029 void Client::unregister_request(MetaRequest *req)
2030 {
2031 mds_requests.erase(req->tid);
2032 if (req->tid == oldest_tid) {
2033 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2034 while (true) {
2035 if (p == mds_requests.end()) {
2036 oldest_tid = 0;
2037 break;
2038 }
2039 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2040 oldest_tid = p->first;
2041 break;
2042 }
2043 ++p;
2044 }
2045 }
2046 put_request(req);
2047 }
2048
2049 void Client::put_request(MetaRequest *request)
2050 {
2051 if (request->_put()) {
2052 int op = -1;
2053 if (request->success)
2054 op = request->get_op();
2055 InodeRef other_in;
2056 request->take_other_inode(&other_in);
2057 delete request;
2058
2059 if (other_in &&
2060 (op == CEPH_MDS_OP_RMDIR ||
2061 op == CEPH_MDS_OP_RENAME ||
2062 op == CEPH_MDS_OP_RMSNAP)) {
2063 _try_to_trim_inode(other_in.get(), false);
2064 }
2065 }
2066 }
2067
2068 int Client::encode_inode_release(Inode *in, MetaRequest *req,
2069 mds_rank_t mds, int drop,
2070 int unless, int force)
2071 {
2072 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
2073 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
2074 << ", force:" << force << ")" << dendl;
2075 int released = 0;
2076 auto it = in->caps.find(mds);
2077 if (it != in->caps.end()) {
2078 Cap &cap = it->second;
2079 drop &= ~(in->dirty_caps | get_caps_used(in));
2080 if ((drop & cap.issued) &&
2081 !(unless & cap.issued)) {
2082 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
2083 cap.issued &= ~drop;
2084 cap.implemented &= ~drop;
2085 released = 1;
2086 } else {
2087 released = force;
2088 }
2089 if (released) {
2090 cap.wanted = in->caps_wanted();
2091 if (&cap == in->auth_cap &&
2092 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
2093 in->requested_max_size = 0;
2094 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
2095 }
2096 ceph_mds_request_release rel;
2097 rel.ino = in->ino;
2098 rel.cap_id = cap.cap_id;
2099 rel.seq = cap.seq;
2100 rel.issue_seq = cap.issue_seq;
2101 rel.mseq = cap.mseq;
2102 rel.caps = cap.implemented;
2103 rel.wanted = cap.wanted;
2104 rel.dname_len = 0;
2105 rel.dname_seq = 0;
2106 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2107 }
2108 }
2109 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
2110 << released << dendl;
2111 return released;
2112 }
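// Worked example of the drop/unless semantics above: with drop=Fs
// (CEPH_CAP_FILE_SHARED) and unless=Fx (CEPH_CAP_FILE_EXCL), Fs is
// stripped only if it is currently issued while Fx is not; and because of
// the 'drop &= ~(in->dirty_caps | get_caps_used(in))' mask, dirty or
// in-use caps are never dropped.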
2113
2114 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2115 mds_rank_t mds, int drop, int unless)
2116 {
2117 ldout(cct, 20) << __func__ << " enter(dn:"
2118 << dn << ")" << dendl;
2119 int released = 0;
2120 if (dn->dir)
2121 released = encode_inode_release(dn->dir->parent_inode, req,
2122 mds, drop, unless, 1);
2123 if (released && dn->lease_mds == mds) {
2124 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2125 auto& rel = req->cap_releases.back();
2126 rel.item.dname_len = dn->name.length();
2127 rel.item.dname_seq = dn->lease_seq;
2128 rel.dname = dn->name;
2129 dn->lease_mds = -1;
2130 }
2131 ldout(cct, 25) << __func__ << " exit(dn:"
2132 << dn << ")" << dendl;
2133 }
2134
2135
2136 /*
2137 * This requires the MetaRequest *req to be set up.
2138 * It will error out horribly otherwise.
2139 * Additionally, if you set any *drop member, you'd better have
2140 * set the corresponding dentry!
2141 */
2142 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2143 {
2144 ldout(cct, 20) << __func__ << " enter (req: "
2145 << req << ", mds: " << mds << ")" << dendl;
2146 if (req->inode_drop && req->inode())
2147 encode_inode_release(req->inode(), req,
2148 mds, req->inode_drop,
2149 req->inode_unless);
2150
2151 if (req->old_inode_drop && req->old_inode())
2152 encode_inode_release(req->old_inode(), req,
2153 mds, req->old_inode_drop,
2154 req->old_inode_unless);
2155 if (req->other_inode_drop && req->other_inode())
2156 encode_inode_release(req->other_inode(), req,
2157 mds, req->other_inode_drop,
2158 req->other_inode_unless);
2159
2160 if (req->dentry_drop && req->dentry())
2161 encode_dentry_release(req->dentry(), req,
2162 mds, req->dentry_drop,
2163 req->dentry_unless);
2164
2165 if (req->old_dentry_drop && req->old_dentry())
2166 encode_dentry_release(req->old_dentry(), req,
2167 mds, req->old_dentry_drop,
2168 req->old_dentry_unless);
2169 ldout(cct, 25) << __func__ << " exit (req: "
2170 << req << ", mds: " << mds << ")" << dendl;
2171 }
2172
2173 bool Client::have_open_session(mds_rank_t mds)
2174 {
2175 const auto &it = mds_sessions.find(mds);
2176 return it != mds_sessions.end() &&
2177 (it->second->state == MetaSession::STATE_OPEN ||
2178 it->second->state == MetaSession::STATE_STALE);
2179 }
2180
2181 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2182 {
2183 const auto &it = mds_sessions.find(mds);
2184 if (it == mds_sessions.end() || it->second->con != con) {
2185 return NULL;
2186 } else {
2187 return it->second;
2188 }
2189 }
2190
2191 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2192 {
2193 auto it = mds_sessions.find(mds);
2194 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2195 }
2196
2197 /**
2198 * Populate a map of strings with client-identifying metadata,
2199 * such as the hostname. Call this once at initialization.
2200 */
2201 void Client::populate_metadata(const std::string &mount_root)
2202 {
2203 // Hostname
2204 #ifdef _WIN32
2205 // TODO: move this to compat.h
2206 char hostname[64];
2207 DWORD hostname_sz = 64;
2208 GetComputerNameA(hostname, &hostname_sz);
2209 metadata["hostname"] = hostname;
2210 #else
2211 struct utsname u;
2212 int r = uname(&u);
2213 if (r >= 0) {
2214 metadata["hostname"] = u.nodename;
2215 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2216 } else {
2217 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(errno) << ")" << dendl;
2218 }
2219 #endif
2220
2221 metadata["pid"] = stringify(getpid());
2222
2223 // Ceph entity id (the '0' in "client.0")
2224 metadata["entity_id"] = cct->_conf->name.get_id();
2225
2226 // Our mount position
2227 if (!mount_root.empty()) {
2228 metadata["root"] = mount_root;
2229 }
2230
2231 // Ceph version
2232 metadata["ceph_version"] = pretty_version_to_str();
2233 metadata["ceph_sha1"] = git_version_to_str();
2234
2235 // Apply any metadata from the user's configured overrides
2236 std::vector<std::string> tokens;
2237 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2238 for (const auto &i : tokens) {
2239 auto eqpos = i.find("=");
2240 // Throw out anything that isn't of the form "<str>=<str>"
2241 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size() - 1) {
2242 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2243 continue;
2244 }
2245 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2246 }
2247 }
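// For example, 'client metadata = "rack=r1,foo=bar"' in the conf adds
// metadata["rack"] = "r1" and metadata["foo"] = "bar", while malformed
// tokens such as "=bar", "foo=" or "foo" are logged and skipped.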
2248
2249 /**
2250 * Optionally add or override client metadata fields.
2251 */
2252 void Client::update_metadata(std::string const &k, std::string const &v)
2253 {
2254 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2255 ceph_assert(iref_reader.is_state_satisfied());
2256
2257 std::scoped_lock l(client_lock);
2258
2259 auto it = metadata.find(k);
2260 if (it != metadata.end()) {
2261 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2262 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2263 }
2264
2265 metadata[k] = v;
2266 }
2267
2268 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2269 {
2270 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2271 auto addrs = mdsmap->get_addrs(mds);
2272 auto em = mds_sessions.emplace(std::piecewise_construct,
2273 std::forward_as_tuple(mds),
2274 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2275 ceph_assert(em.second); /* not already present */
2276 auto session = em.first->second;
2277
2278 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2279 m->metadata = metadata;
2280 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2281 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2282 session->con->send_message2(std::move(m));
2283 return session;
2284 }
2285
2286 void Client::_close_mds_session(MetaSession *s)
2287 {
2288 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2289 s->state = MetaSession::STATE_CLOSING;
2290 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2291 }
2292
2293 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2294 {
2295 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2296 if (rejected && s->state != MetaSession::STATE_CLOSING)
2297 s->state = MetaSession::STATE_REJECTED;
2298 else
2299 s->state = MetaSession::STATE_CLOSED;
2300 s->con->mark_down();
2301 signal_context_list(s->waiting_for_open);
2302 mount_cond.notify_all();
2303 remove_session_caps(s, err);
2304 kick_requests_closed(s);
2305 mds_ranks_closing.erase(s->mds_num);
2306 if (s->state == MetaSession::STATE_CLOSED)
2307 mds_sessions.erase(s->mds_num);
2308 }
2309
2310 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2311 {
2312 mds_rank_t from = mds_rank_t(m->get_source().num());
2313 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2314
2315 std::scoped_lock cl(client_lock);
2316 auto session = _get_mds_session(from, m->get_connection().get());
2317 if (!session) {
2318 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2319 return;
2320 }
2321
2322 switch (m->get_op()) {
2323 case CEPH_SESSION_OPEN:
2324 {
2325 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2326 missing_features -= m->supported_features;
2327 if (!missing_features.empty()) {
2328 lderr(cct) << "mds." << from << " lacks required features '"
2329 << missing_features << "', closing session " << dendl;
2330 _close_mds_session(session.get());
2331 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2332 break;
2333 }
2334 session->mds_features = std::move(m->supported_features);
2335 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
2336
2337 renew_caps(session.get());
2338 session->state = MetaSession::STATE_OPEN;
2339 if (is_unmounting())
2340 mount_cond.notify_all();
2341 else
2342 connect_mds_targets(from);
2343 signal_context_list(session->waiting_for_open);
2344 break;
2345 }
2346
2347 case CEPH_SESSION_CLOSE:
2348 _closed_mds_session(session.get());
2349 break;
2350
2351 case CEPH_SESSION_RENEWCAPS:
2352 if (session->cap_renew_seq == m->get_seq()) {
2353 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2354 session->cap_ttl =
2355 session->last_cap_renew_request + mdsmap->get_session_timeout();
2356 if (was_stale)
2357 wake_up_session_caps(session.get(), false);
2358 }
2359 break;
2360
2361 case CEPH_SESSION_STALE:
2362 // invalidate session caps/leases
2363 session->cap_gen++;
2364 session->cap_ttl = ceph_clock_now();
2365 session->cap_ttl -= 1;
2366 renew_caps(session.get());
2367 break;
2368
2369 case CEPH_SESSION_RECALL_STATE:
2370 /*
2371 * Renew the caps and flush the cap releases just before
2372 * trimming the caps, in case tick() won't get a chance
2373 * to run them; otherwise the client could be blocklisted
2374 * and the MDS daemons would keep trying to recall the caps
2375 * again and again.
2376 *
2377 * In most cases this will do nothing, and the new cap releases
2378 * added by the trim_caps() that follows will have their
2379 * flushing deferred to tick().
2380 */
2381 renew_and_flush_cap_releases();
2382 trim_caps(session.get(), m->get_max_caps());
2383 break;
2384
2385 case CEPH_SESSION_FLUSHMSG:
2386 /* flush cap release */
2387 if (auto& m = session->release; m) {
2388 session->con->send_message2(std::move(m));
2389 }
2390 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2391 break;
2392
2393 case CEPH_SESSION_FORCE_RO:
2394 force_session_readonly(session.get());
2395 break;
2396
2397 case CEPH_SESSION_REJECT:
2398 {
2399 std::string_view error_str;
2400 auto it = m->metadata.find("error_string");
2401 if (it != m->metadata.end())
2402 error_str = it->second;
2403 else
2404 error_str = "unknown error";
2405 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2406
2407 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2408 }
2409 break;
2410
2411 default:
2412 ceph_abort();
2413 }
2414 }
2415
2416 bool Client::_any_stale_sessions() const
2417 {
2418 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2419
2420 for (const auto &p : mds_sessions) {
2421 if (p.second->state == MetaSession::STATE_STALE) {
2422 return true;
2423 }
2424 }
2425
2426 return false;
2427 }
2428
2429 void Client::_kick_stale_sessions()
2430 {
2431 ldout(cct, 1) << __func__ << dendl;
2432
2433 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2434 auto s = (it++)->second; // advance now: erasing/closing below invalidates the old entry
2435 if (s->state == MetaSession::STATE_REJECTED) {
2436 mds_sessions.erase(s->mds_num);
2437 continue;
2438 }
2439 if (s->state == MetaSession::STATE_STALE)
2440 _closed_mds_session(s.get());
2441 }
2442 }
2443
2444 void Client::send_request(MetaRequest *request, MetaSession *session,
2445 bool drop_cap_releases)
2446 {
2447 // make the request
2448 mds_rank_t mds = session->mds_num;
2449 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2450 << " for mds." << mds << dendl;
2451 auto r = build_client_request(request);
2452 if (request->dentry()) {
2453 r->set_dentry_wanted();
2454 }
2455 if (request->got_unsafe) {
2456 r->set_replayed_op();
2457 if (request->target)
2458 r->head.ino = request->target->ino;
2459 } else {
2460 encode_cap_releases(request, mds);
2461 if (drop_cap_releases) // we haven't sent the cap reconnect yet; drop cap releases
2462 request->cap_releases.clear();
2463 else
2464 r->releases.swap(request->cap_releases);
2465 }
2466 r->set_mdsmap_epoch(mdsmap->get_epoch());
2467 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2468 objecter->with_osdmap([r](const OSDMap& o) {
2469 r->set_osdmap_epoch(o.get_epoch());
2470 });
2471 }
2472
2473 if (request->mds == -1) {
2474 request->sent_stamp = ceph_clock_now();
2475 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2476 }
2477 request->mds = mds;
2478
2479 Inode *in = request->inode();
2480 if (in) {
2481 auto it = in->caps.find(mds);
2482 if (it != in->caps.end()) {
2483 request->sent_on_mseq = it->second.mseq;
2484 }
2485 }
2486
2487 session->requests.push_back(&request->item);
2488
2489 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2490 session->con->send_message2(std::move(r));
2491 }
2492
2493 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2494 {
2495 auto req = make_message<MClientRequest>(request->get_op());
2496 req->set_tid(request->tid);
2497 req->set_stamp(request->op_stamp);
2498 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2499
2500 // if the filepaths haven't been set, set them!
2501 if (request->path.empty()) {
2502 Inode *in = request->inode();
2503 Dentry *de = request->dentry();
2504 if (in)
2505 in->make_nosnap_relative_path(request->path);
2506 else if (de) {
2507 if (de->inode)
2508 de->inode->make_nosnap_relative_path(request->path);
2509 else if (de->dir) {
2510 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2511 request->path.push_dentry(de->name);
2512 }
2513 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2514 << " No path, inode, or appropriately-endowed dentry given!"
2515 << dendl;
2516 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2517 << " No path, inode, or dentry given!"
2518 << dendl;
2519 }
2520 req->set_filepath(request->get_filepath());
2521 req->set_filepath2(request->get_filepath2());
2522 req->set_alternate_name(request->alternate_name);
2523 req->set_data(request->data);
2524 req->set_retry_attempt(request->retry_attempt++);
2525 req->head.num_fwd = request->num_fwd;
2526 const gid_t *_gids;
2527 int gid_count = request->perms.get_gids(&_gids);
2528 req->set_gid_list(gid_count, _gids);
2529 return req;
2530 }
2531
2532
2533
2534 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2535 {
2536 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2537
2538 std::scoped_lock cl(client_lock);
2539 auto session = _get_mds_session(mds, fwd->get_connection().get());
2540 if (!session) {
2541 return;
2542 }
2543 ceph_tid_t tid = fwd->get_tid();
2544
2545 if (mds_requests.count(tid) == 0) {
2546 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2547 return;
2548 }
2549
2550 MetaRequest *request = mds_requests[tid];
2551 ceph_assert(request);
2552
2553 /*
2554 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2555 * is 'int32_t', while in 'ceph_mds_request_head' the
2556 * type is '__u8'. So if the request bounces between
2557 * MDSes more than 255 times, the client will get stuck.
2558 *
2559 * In this case it's usually a bug in the MDS, and
2560 * continuing to bounce the request makes no sense.
2561 *
2562 * In the future this could be fixed in the ceph code, so
2563 * avoid hardcoding the limit here.
2564 */
2565 int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
2566 max_fwd = (1 << (max_fwd * CHAR_BIT)) - 1;
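// e.g. num_fwd in ceph_mds_request_head is a __u8, so sizeof() is 1 and
// max_fwd works out to (1 << 8) - 1 == 255.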
2567 auto num_fwd = fwd->get_num_fwd();
2568 if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
2569 if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
2570 request->abort(-EMULTIHOP);
2571 request->caller_cond->notify_all();
2572 ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
2573 << ", abort it" << dendl;
2574 } else {
2575 ldout(cct, 10) << __func__ << " tid " << tid
2576 << " old fwd seq " << fwd->get_num_fwd()
2577 << " <= req fwd " << request->num_fwd
2578 << ", ignore it" << dendl;
2579 }
2580 return;
2581 }
2582
2583 // reset retry counter
2584 request->retry_attempt = 0;
2585
2586 // request not forwarded, or dest mds has no session.
2587 // resend.
2588 ldout(cct, 10) << __func__ << " tid " << tid
2589 << " fwd " << fwd->get_num_fwd()
2590 << " to mds." << fwd->get_dest_mds()
2591 << ", resending to " << fwd->get_dest_mds()
2592 << dendl;
2593
2594 request->mds = -1;
2595 request->item.remove_myself();
2596 request->num_fwd = num_fwd;
2597 request->resend_mds = fwd->get_dest_mds();
2598 request->caller_cond->notify_all();
2599 }
2600
2601 bool Client::is_dir_operation(MetaRequest *req)
2602 {
2603 int op = req->get_op();
2604 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2605 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2606 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2607 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2608 return true;
2609 return false;
2610 }
2611
2612 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2613 {
2614 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2615
2616 std::scoped_lock cl(client_lock);
2617 auto session = _get_mds_session(mds_num, reply->get_connection().get());
2618 if (!session) {
2619 return;
2620 }
2621
2622 ceph_tid_t tid = reply->get_tid();
2623 bool is_safe = reply->is_safe();
2624
2625 if (mds_requests.count(tid) == 0) {
2626 lderr(cct) << __func__ << " no pending request on tid " << tid
2627 << " safe is:" << is_safe << dendl;
2628 return;
2629 }
2630 MetaRequest *request = mds_requests.at(tid);
2631
2632 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2633 << " tid " << tid << dendl;
2634
2635 if (request->got_unsafe && !is_safe) {
2636 //duplicate response
2637 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2638 << mds_num << " safe:" << is_safe << dendl;
2639 return;
2640 }
2641
2642 ceph_assert(!request->reply);
2643 request->reply = reply;
2644 insert_trace(request, session.get());
2645
2646 // Handle unsafe reply
2647 if (!is_safe) {
2648 request->got_unsafe = true;
2649 session->unsafe_requests.push_back(&request->unsafe_item);
2650 if (is_dir_operation(request)) {
2651 Inode *dir = request->inode();
2652 ceph_assert(dir);
2653 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2654 }
2655 if (request->target) {
2656 InodeRef &in = request->target;
2657 in->unsafe_ops.push_back(&request->unsafe_target_item);
2658 }
2659 }
2660
2661 // Only signal the caller once (on the first reply):
2662 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2663 if (!is_safe || !request->got_unsafe) {
2664 ceph::condition_variable cond;
2665 request->dispatch_cond = &cond;
2666
2667 // wake up waiter
2668 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2669 request->caller_cond->notify_all();
2670
2671 // wake for kick back
2672 std::unique_lock l{client_lock, std::adopt_lock};
2673 cond.wait(l, [tid, request, &cond, this] {
2674 if (request->dispatch_cond) {
2675 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2676 << tid << " " << &cond << dendl;
2677 }
2678 return !request->dispatch_cond;
2679 });
2680 l.release();
2681 }
2682
2683 if (is_safe) {
2684 // the filesystem change is committed to disk
2685 // we're done, clean up
2686 if (request->got_unsafe) {
2687 request->unsafe_item.remove_myself();
2688 request->unsafe_dir_item.remove_myself();
2689 request->unsafe_target_item.remove_myself();
2690 signal_cond_list(request->waitfor_safe);
2691 }
2692 request->item.remove_myself();
2693 unregister_request(request);
2694 }
2695 if (is_unmounting())
2696 mount_cond.notify_all();
2697 }
2698
2699 void Client::_handle_full_flag(int64_t pool)
2700 {
2701 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2702 << "on " << pool << dendl;
2703 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2704 // to do this rather than blocking, because otherwise when we fill up we
2705 // potentially lock caps forever on files with dirty pages, and we need
2706 // to be able to release those caps to the MDS so that it can delete files
2707 // and free up space.
2708 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2709
2710 // For all inodes with layouts in this pool and a pending flush write op
2711 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2712 // from ObjectCacher so that it doesn't re-issue the write in response to
2713 // the ENOSPC error.
2714 // Fortunately since we're cancelling everything in a given pool, we don't
2715 // need to know which ops belong to which ObjectSet, we can just blow all
2716 // the un-flushed cached data away and mark any dirty inodes' async_err
2717 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2718 // affecting this pool, and all the objectsets we're purging were also
2719 // in this pool.
2720 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2721 i != inode_map.end(); ++i)
2722 {
2723 Inode *inode = i->second;
2724 if (inode->oset.dirty_or_tx
2725 && (pool == -1 || inode->layout.pool_id == pool)) {
2726 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2727 << " has dirty objects, purging and setting ENOSPC" << dendl;
2728 objectcacher->purge_set(&inode->oset);
2729 inode->set_async_err(-CEPHFS_ENOSPC);
2730 }
2731 }
2732
2733 if (cancelled_epoch != (epoch_t)-1) {
2734 set_cap_epoch_barrier(cancelled_epoch);
2735 }
2736 }
2737
2738 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2739 {
2740 std::scoped_lock cl(client_lock);
2741
2742 const auto myaddrs = messenger->get_myaddrs();
2743 bool new_blocklist = objecter->with_osdmap(
2744 [&](const OSDMap& o) {
2745 return o.is_blocklisted(myaddrs);
2746 });
2747
2748 if (new_blocklist && !blocklisted) {
2749 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2750 return o.get_epoch();
2751 });
2752 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2753 blocklisted = true;
2754
2755 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2756
2757 // Since we know all our OSD ops will fail, cancel them all preemptively,
2758 // so that on an unhealthy cluster we can umount promptly even if e.g.
2759 // some PGs were inaccessible.
2760 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2761
2762 }
2763
2764 if (blocklisted) {
2765 // Handle case where we were blocklisted but no longer are
2766 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2767 return o.is_blocklisted(myaddrs);});
2768 }
2769
2770 // While blocklisted, keep subscribing to the next osdmap
2771 // until this client is no longer blocklisted.
2772 if (blocklisted) {
2773 objecter->maybe_request_map();
2774 }
2775
2776 if (objecter->osdmap_full_flag()) {
2777 _handle_full_flag(-1);
2778 } else {
2779 // Accumulate local list of full pools so that I can drop
2780 // the objecter lock before re-entering objecter in
2781 // cancel_writes
2782 std::vector<int64_t> full_pools;
2783
2784 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2785 for (const auto& kv : o.get_pools()) {
2786 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2787 full_pools.push_back(kv.first);
2788 }
2789 }
2790 });
2791
2792 for (auto p : full_pools)
2793 _handle_full_flag(p);
2794
2795 // Subscribe to subsequent maps to watch for the full flag going
2796 // away. For the global full flag objecter does this for us, but
2797 // it pays no attention to the per-pool full flag so in this branch
2798 // we do it ourselves.
2799 if (!full_pools.empty()) {
2800 objecter->maybe_request_map();
2801 }
2802 }
2803 }
2804
2805
2806 // ------------------------
2807 // incoming messages
2808
2809
2810 bool Client::ms_dispatch2(const MessageRef &m)
2811 {
2812 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2813 if (!iref_reader.is_state_satisfied()) {
2814 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2815 return true;
2816 }
2817
2818 switch (m->get_type()) {
2819 // mounting and mds sessions
2820 case CEPH_MSG_MDS_MAP:
2821 handle_mds_map(ref_cast<MMDSMap>(m));
2822 break;
2823 case CEPH_MSG_FS_MAP:
2824 handle_fs_map(ref_cast<MFSMap>(m));
2825 break;
2826 case CEPH_MSG_FS_MAP_USER:
2827 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2828 break;
2829 case CEPH_MSG_CLIENT_SESSION:
2830 handle_client_session(ref_cast<MClientSession>(m));
2831 break;
2832
2833 case CEPH_MSG_OSD_MAP:
2834 handle_osd_map(ref_cast<MOSDMap>(m));
2835 break;
2836
2837 // requests
2838 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2839 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2840 break;
2841 case CEPH_MSG_CLIENT_REPLY:
2842 handle_client_reply(ref_cast<MClientReply>(m));
2843 break;
2844
2845 // reclaim reply
2846 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2847 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2848 break;
2849
2850 case CEPH_MSG_CLIENT_SNAP:
2851 handle_snap(ref_cast<MClientSnap>(m));
2852 break;
2853 case CEPH_MSG_CLIENT_CAPS:
2854 handle_caps(ref_cast<MClientCaps>(m));
2855 break;
2856 case CEPH_MSG_CLIENT_LEASE:
2857 handle_lease(ref_cast<MClientLease>(m));
2858 break;
2859 case MSG_COMMAND_REPLY:
2860 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2861 handle_command_reply(ref_cast<MCommandReply>(m));
2862 } else {
2863 return false;
2864 }
2865 break;
2866 case CEPH_MSG_CLIENT_QUOTA:
2867 handle_quota(ref_cast<MClientQuota>(m));
2868 break;
2869
2870 default:
2871 return false;
2872 }
2873
2874 // unmounting?
2875 std::scoped_lock cl(client_lock);
2876 if (is_unmounting()) {
2877 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2878 << "+" << inode_map.size() << dendl;
2879 uint64_t size = lru.lru_get_size() + inode_map.size();
2880 trim_cache();
2881 if (size > lru.lru_get_size() + inode_map.size()) {
2882 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2883 mount_cond.notify_all();
2884 } else {
2885 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2886 << "+" << inode_map.size() << dendl;
2887 }
2888 }
2889
2890 return true;
2891 }
2892
2893 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2894 {
2895 std::scoped_lock cl(client_lock);
2896 fsmap.reset(new FSMap(m->get_fsmap()));
2897
2898 signal_cond_list(waiting_for_fsmap);
2899
2900 monclient->sub_got("fsmap", fsmap->get_epoch());
2901 }
2902
2903 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2904 {
2905 std::scoped_lock cl(client_lock);
2906 fsmap_user.reset(new FSMapUser);
2907 *fsmap_user = m->get_fsmap();
2908
2909 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2910 signal_cond_list(waiting_for_fsmap);
2911 }
2912
2913 // Cancel all the commands for missing or laggy GIDs
2914 void Client::cancel_commands(const MDSMap& newmap)
2915 {
2916 std::vector<ceph_tid_t> cancel_ops;
2917
2918 std::scoped_lock cmd_lock(command_lock);
2919 auto &commands = command_table.get_commands();
2920 for (const auto &[tid, op] : commands) {
2921 const mds_gid_t op_mds_gid = op.mds_gid;
2922 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2923 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2924 cancel_ops.push_back(tid);
2925 if (op.outs) {
2926 std::ostringstream ss;
2927 ss << "MDS " << op_mds_gid << " went away";
2928 *(op.outs) = ss.str();
2929 }
2930 /*
2931 * No need to hold client_lock for the
2932 * con->mark_down() here, because the con
2933 * has its own lock.
2934 */
2935 op.con->mark_down();
2936 if (op.on_finish)
2937 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2938 }
2939 }
2940
2941 for (const auto &tid : cancel_ops)
2942 command_table.erase(tid);
2943 }
2944
2945 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2946 {
2947 std::unique_lock cl(client_lock);
2948 if (m->get_epoch() <= mdsmap->get_epoch()) {
2949 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2950 << " is identical to or older than our "
2951 << mdsmap->get_epoch() << dendl;
2952 return;
2953 }
2954
2955 cl.unlock();
2956 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2957 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2958 _mdsmap->decode(m->get_encoded());
2959 cancel_commands(*_mdsmap.get());
2960 cl.lock();
2961
2962 _mdsmap.swap(mdsmap);
2963
2964 // reset session
2965 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2966 mds_rank_t mds = p->first;
2967 MetaSessionRef session = p->second;
2968 ++p;
2969
2970 int oldstate = _mdsmap->get_state(mds);
2971 int newstate = mdsmap->get_state(mds);
2972 if (!mdsmap->is_up(mds)) {
2973 session->con->mark_down();
2974 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2975 auto old_inc = _mdsmap->get_incarnation(mds);
2976 auto new_inc = mdsmap->get_incarnation(mds);
2977 if (old_inc != new_inc) {
2978 ldout(cct, 1) << "mds incarnation changed from "
2979 << old_inc << " to " << new_inc << dendl;
2980 oldstate = MDSMap::STATE_NULL;
2981 }
2982 session->con->mark_down();
2983 session->addrs = mdsmap->get_addrs(mds);
2984 // When a new MDS starts to take over, notify the kernel to trim unused
2985 // entries in its dcache/icache. Hopefully, the kernel will release some
2986 // unused inodes before the new MDS enters the reconnect state.
2987 trim_cache_for_reconnect(session.get());
2988 } else if (oldstate == newstate)
2989 continue; // no change
2990
2991 session->mds_state = newstate;
2992 if (newstate == MDSMap::STATE_RECONNECT) {
2993 session->con = messenger->connect_to_mds(session->addrs);
2994 send_reconnect(session.get());
2995 } else if (newstate > MDSMap::STATE_RECONNECT) {
2996 if (oldstate < MDSMap::STATE_RECONNECT) {
2997 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2998 _closed_mds_session(session.get());
2999 continue;
3000 }
3001 if (newstate >= MDSMap::STATE_ACTIVE) {
3002 if (oldstate < MDSMap::STATE_ACTIVE) {
3003 // kick new requests
3004 kick_requests(session.get());
3005 kick_flushing_caps(session.get());
3006 signal_context_list(session->waiting_for_open);
3007 wake_up_session_caps(session.get(), true);
3008 }
3009 connect_mds_targets(mds);
3010 }
3011 } else if (newstate == MDSMap::STATE_NULL &&
3012 mds >= mdsmap->get_max_mds()) {
3013 _closed_mds_session(session.get());
3014 }
3015 }
3016
3017 // kick any waiting threads
3018 signal_cond_list(waiting_for_mdsmap);
3019
3020 monclient->sub_got("mdsmap", mdsmap->get_epoch());
3021 }
3022
3023 void Client::send_reconnect(MetaSession *session)
3024 {
3025 mds_rank_t mds = session->mds_num;
3026 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
3027
3028 // trim unused caps to reduce MDS's cache rejoin time
3029 trim_cache_for_reconnect(session);
3030
3031 session->readonly = false;
3032
3033 session->release.reset();
3034
3035 // reset my cap seq number
3036 session->seq = 0;
3037 // connect to the mds' offload targets
3038 connect_mds_targets(mds);
3039 // make sure unsafe requests get saved
3040 resend_unsafe_requests(session);
3041
3042 early_kick_flushing_caps(session);
3043
3044 auto m = make_message<MClientReconnect>();
3045 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
3046
3047 // i have an open session.
3048 ceph::unordered_set<inodeno_t> did_snaprealm;
3049 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
3050 p != inode_map.end();
3051 ++p) {
3052 Inode *in = p->second;
3053 auto it = in->caps.find(mds);
3054 if (it != in->caps.end()) {
3055 if (allow_multi &&
3056 m->get_approx_size() >=
3057 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
3058 m->mark_more();
3059 session->con->send_message2(std::move(m));
3060
3061 m = make_message<MClientReconnect>();
3062 }
3063
3064 Cap &cap = it->second;
3065 ldout(cct, 10) << " caps on " << p->first
3066 << " " << ccap_string(cap.issued)
3067 << " wants " << ccap_string(in->caps_wanted())
3068 << dendl;
3069 filepath path;
3070 in->make_short_path(path);
3071 ldout(cct, 10) << " path " << path << dendl;
3072
3073 bufferlist flockbl;
3074 _encode_filelocks(in, flockbl);
3075
3076 cap.seq = 0; // reset seq.
3077 cap.issue_seq = 0; // reset seq.
3078 cap.mseq = 0; // reset seq.
3079 // cap gen should catch up with session cap_gen
3080 if (cap.gen < session->cap_gen) {
3081 cap.gen = session->cap_gen;
3082 cap.issued = cap.implemented = CEPH_CAP_PIN;
3083 } else {
3084 cap.issued = cap.implemented;
3085 }
3086 snapid_t snap_follows = 0;
3087 if (!in->cap_snaps.empty())
3088 snap_follows = in->cap_snaps.begin()->first;
3089
3090 m->add_cap(p->first.ino,
3091 cap.cap_id,
3092 path.get_ino(), path.get_path(), // ino
3093 in->caps_wanted(), // wanted
3094 cap.issued, // issued
3095 in->snaprealm->ino,
3096 snap_follows,
3097 flockbl);
3098
3099 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
3100 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
3101 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
3102 did_snaprealm.insert(in->snaprealm->ino);
3103 }
3104 }
3105 }
3106
3107 if (!allow_multi)
3108 m->set_encoding_version(0); // use connection features to choose encoding
3109 session->con->send_message2(std::move(m));
3110
3111 mount_cond.notify_all();
3112
3113 if (session->reclaim_state == MetaSession::RECLAIMING)
3114 signal_cond_list(waiting_for_reclaim);
3115 }
3116
3117
3118 void Client::kick_requests(MetaSession *session)
3119 {
3120 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3121 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3122 p != mds_requests.end();
3123 ++p) {
3124 MetaRequest *req = p->second;
3125 if (req->got_unsafe)
3126 continue;
3127 if (req->aborted()) {
3128 if (req->caller_cond) {
3129 req->kick = true;
3130 req->caller_cond->notify_all();
3131 }
3132 continue;
3133 }
3134 if (req->retry_attempt > 0)
3135 continue; // new requests only
3136 if (req->mds == session->mds_num) {
3137 send_request(p->second, session);
3138 }
3139 }
3140 }
3141
3142 void Client::resend_unsafe_requests(MetaSession *session)
3143 {
3144 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3145 !iter.end();
3146 ++iter)
3147 send_request(*iter, session);
3148
3149 // also re-send old requests when the MDS enters the reconnect stage, so
3150 // that the MDS can process completed requests in the clientreplay stage.
3151 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3152 p != mds_requests.end();
3153 ++p) {
3154 MetaRequest *req = p->second;
3155 if (req->got_unsafe)
3156 continue;
3157 if (req->aborted())
3158 continue;
3159 if (req->retry_attempt == 0)
3160 continue; // old requests only
3161 if (req->mds == session->mds_num)
3162 send_request(req, session, true);
3163 }
3164 }
3165
3166 void Client::wait_unsafe_requests()
3167 {
3168 list<MetaRequest*> last_unsafe_reqs;
3169 for (const auto &p : mds_sessions) {
3170 const auto s = p.second;
3171 if (!s->unsafe_requests.empty()) {
3172 MetaRequest *req = s->unsafe_requests.back();
3173 req->get();
3174 last_unsafe_reqs.push_back(req);
3175 }
3176 }
3177
3178 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3179 p != last_unsafe_reqs.end();
3180 ++p) {
3181 MetaRequest *req = *p;
3182 if (req->unsafe_item.is_on_list())
3183 wait_on_list(req->waitfor_safe);
3184 put_request(req);
3185 }
3186 }
3187
3188 void Client::kick_requests_closed(MetaSession *session)
3189 {
3190 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3191 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3192 p != mds_requests.end(); ) {
3193 MetaRequest *req = p->second;
3194 ++p;
3195 if (req->mds == session->mds_num) {
3196 if (req->caller_cond) {
3197 req->kick = true;
3198 req->caller_cond->notify_all();
3199 }
3200 req->item.remove_myself();
3201 if (req->got_unsafe) {
3202 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3203 req->unsafe_item.remove_myself();
3204 if (is_dir_operation(req)) {
3205 Inode *dir = req->inode();
3206 ceph_assert(dir);
3207 dir->set_async_err(-CEPHFS_EIO);
3208 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3209 << dir->ino << " " << req->get_tid() << dendl;
3210 req->unsafe_dir_item.remove_myself();
3211 }
3212 if (req->target) {
3213 InodeRef &in = req->target;
3214 in->set_async_err(-CEPHFS_EIO);
3215 lderr(cct) << "kick_requests_closed drop req of inode : "
3216 << in->ino << " " << req->get_tid() << dendl;
3217 req->unsafe_target_item.remove_myself();
3218 }
3219 signal_cond_list(req->waitfor_safe);
3220 unregister_request(req);
3221 }
3222 }
3223 }
3224 ceph_assert(session->requests.empty());
3225 ceph_assert(session->unsafe_requests.empty());
3226 }
3227
3228
3229
3230
3231 /************
3232 * leases
3233 */
3234
3235 void Client::got_mds_push(MetaSession *s)
3236 {
3237 s->seq++;
3238 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3239 if (s->state == MetaSession::STATE_CLOSING) {
3240 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3241 }
3242 }
3243
3244 void Client::handle_lease(const MConstRef<MClientLease>& m)
3245 {
3246 ldout(cct, 10) << __func__ << " " << *m << dendl;
3247
3248 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3249 mds_rank_t mds = mds_rank_t(m->get_source().num());
3250
3251 std::scoped_lock cl(client_lock);
3252 auto session = _get_mds_session(mds, m->get_connection().get());
3253 if (!session) {
3254 return;
3255 }
3256
3257 got_mds_push(session.get());
3258
3259 ceph_seq_t seq = m->get_seq();
3260
3261 Inode *in;
3262 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3263 if (inode_map.count(vino) == 0) {
3264 ldout(cct, 10) << " don't have vino " << vino << dendl;
3265 goto revoke;
3266 }
3267 in = inode_map[vino];
3268
3269 if (m->get_mask() & CEPH_LEASE_VALID) {
3270 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3271 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3272 goto revoke;
3273 }
3274 Dentry *dn = in->dir->dentries[m->dname];
3275 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3276 dn->lease_mds = -1;
3277 }
3278
3279 revoke:
3280 {
3281 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3282 m->get_mask(), m->get_ino(),
3283 m->get_first(), m->get_last(), m->dname);
3284 m->get_connection()->send_message2(std::move(reply));
3285 }
3286 }
3287
3288 void Client::_put_inode(Inode *in, int n)
3289 {
3290 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3291
3292 int left = in->get_nref();
3293 ceph_assert(left >= n + 1);
3294 in->iput(n);
3295 left -= n;
3296 if (left == 1) { // the last one will be held by the inode_map
3297 // release any caps
3298 remove_all_caps(in);
3299
3300 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3301 bool unclean = objectcacher->release_set(&in->oset);
3302 ceph_assert(!unclean);
3303 inode_map.erase(in->vino());
3304 if (use_faked_inos())
3305 _release_faked_ino(in);
3306
3307 if (root == nullptr) {
3308 root_ancestor = 0;
3309 while (!root_parents.empty())
3310 root_parents.erase(root_parents.begin());
3311 }
3312
3313 in->iput();
3314 }
3315 }
3316
3317 void Client::delay_put_inodes(bool wakeup)
3318 {
3319 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3320
3321 std::map<Inode*,int> release;
3322 {
3323 std::scoped_lock dl(delay_i_lock);
3324 release.swap(delay_i_release);
3325 }
3326
3327 if (release.empty())
3328 return;
3329
3330 for (auto &[in, cnt] : release)
3331 _put_inode(in, cnt);
3332
3333 if (wakeup)
3334 mount_cond.notify_all();
3335 }
3336
3337 void Client::put_inode(Inode *in, int n)
3338 {
3339 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3340
3341 std::scoped_lock dl(delay_i_lock);
3342 delay_i_release[in] += n;
3343 }
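// Note that put_inode() only queues the decref (under delay_i_lock); the
// real _put_inode() runs later from delay_put_inodes(), which executes
// with client_lock held — presumably so the last reference is never
// dropped from a context where tearing down caps/cache state would be
// unsafe.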
3344
3345 void Client::close_dir(Dir *dir)
3346 {
3347 Inode *in = dir->parent_inode;
3348 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3349 ceph_assert(dir->is_empty());
3350 ceph_assert(in->dir == dir);
3351 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3352 if (!in->dentries.empty())
3353 in->get_first_parent()->put(); // unpin dentry
3354
3355 delete in->dir;
3356 in->dir = 0;
3357 put_inode(in); // unpin inode
3358 }
3359
3360 /**
3361 * Don't call this with in==NULL; use get_or_create for that.
3362 * Leave dn set to the default NULL unless you're trying to add
3363 * a new inode to a pre-created Dentry.
3364 */
3365 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3366 {
3367 if (!dn) {
3368 // create a new Dentry
3369 dn = new Dentry(dir, name);
3370
3371 lru.lru_insert_mid(dn); // mid or top?
3372
3373 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3374 << " dn " << dn << " (new dn)" << dendl;
3375 } else {
3376 ceph_assert(!dn->inode);
3377 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3378 << " dn " << dn << " (old dn)" << dendl;
3379 }
3380
3381 if (in) { // link to inode
3382 InodeRef tmp_ref;
3383 // only one parent for directories!
3384 if (in->is_dir() && !in->dentries.empty()) {
3385 tmp_ref = in; // prevent unlink below from freeing the inode.
3386 Dentry *olddn = in->get_first_parent();
3387 ceph_assert(olddn->dir != dir || olddn->name != name);
3388 Inode *old_diri = olddn->dir->parent_inode;
3389 clear_dir_complete_and_ordered(old_diri, true);
3390 unlink(olddn, true, true); // keep dir, dentry
3391 }
3392
3393 dn->link(in);
3394 inc_dentry_nr();
3395 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3396 }
3397
3398 return dn;
3399 }
3400
3401 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3402 {
3403 InodeRef in(dn->inode);
3404 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3405 << " inode " << dn->inode << dendl;
3406
3407 // unlink from inode
3408 if (dn->inode) {
3409 dn->unlink();
3410 dec_dentry_nr();
3411 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3412 }
3413
3414 if (keepdentry) {
3415 dn->lease_mds = -1;
3416 } else {
3417 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3418
3419 // unlink from dir
3420 Dir *dir = dn->dir;
3421 dn->detach();
3422
3423 // delete the dentry
3424 lru.lru_remove(dn);
3425 dn->put();
3426
3427 if (dir->is_empty() && !keepdir)
3428 close_dir(dir);
3429 }
3430 }
3431
3432 /**
3433 * For asynchronous flushes, check for errors from the IO and
3434 * update the inode if necessary
3435 */
3436 class C_Client_FlushComplete : public Context {
3437 private:
3438 Client *client;
3439 InodeRef inode;
3440 public:
3441 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3442 void finish(int r) override {
3443 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3444 if (r != 0) {
3445 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3446 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3447 << " 0x" << std::hex << inode->ino << std::dec
3448 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3449 inode->set_async_err(r);
3450 }
3451 }
3452 };
3453
3454
3455 /****
3456 * caps
3457 */
3458
3459 void Client::get_cap_ref(Inode *in, int cap)
3460 {
3461 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3462 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3463 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3464 in->iget();
3465 }
3466 if ((cap & CEPH_CAP_FILE_CACHE) &&
3467 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3468 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3469 in->iget();
3470 }
3471 in->get_cap_ref(cap);
3472 }
3473
3474 void Client::put_cap_ref(Inode *in, int cap)
3475 {
3476 int last = in->put_cap_ref(cap);
3477 if (last) {
3478 int put_nref = 0;
3479 int drop = last & ~in->caps_issued();
3480 if (in->snapid == CEPH_NOSNAP) {
3481 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3482 !in->cap_snaps.empty() &&
3483 in->cap_snaps.rbegin()->second.writing) {
3484 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3485 in->cap_snaps.rbegin()->second.writing = 0;
3486 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3487 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3488 }
3489 if (last & CEPH_CAP_FILE_BUFFER) {
3490 for (auto &p : in->cap_snaps)
3491 p.second.dirty_data = 0;
3492 signal_cond_list(in->waitfor_commit);
3493 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3494 ++put_nref;
3495 }
3496 }
3497 if (last & CEPH_CAP_FILE_CACHE) {
3498 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3499 ++put_nref;
3500 }
3501 if (drop)
3502 check_caps(in, 0);
3503 if (put_nref)
3504 put_inode(in, put_nref);
3505 }
3506 }
3507
3508 // get caps for a given file handle -- the inode should have @need caps
3509 // issued by the mds and @want caps not revoked (or not under revocation).
3510 // this routine blocks until the cap requirement is satisfied, and it also
3511 // accounts (tracks) a capability hit when the cap requirement succeeds.
3512 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3513 {
3514 Inode *in = fh->inode.get();
3515
3516 int r = check_pool_perm(in, need);
3517 if (r < 0)
3518 return r;
3519
3520 while (1) {
3521 int file_wanted = in->caps_file_wanted();
3522 if ((file_wanted & need) != need) {
3523 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3524 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3525 << dendl;
3526 return -CEPHFS_EBADF;
3527 }
3528
3529 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3530 return -CEPHFS_EBADF;
3531
3532 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3533 return -CEPHFS_EIO;
3534
3535 int implemented;
3536 int have = in->caps_issued(&implemented);
3537
3538 bool waitfor_caps = false;
3539 bool waitfor_commit = false;
3540
3541 if (have & need & CEPH_CAP_FILE_WR) {
3542 if (endoff > 0) {
3543 if ((endoff >= (loff_t)in->max_size ||
3544 endoff > (loff_t)(in->size << 1)) &&
3545 endoff > (loff_t)in->wanted_max_size) {
3546 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3547 in->wanted_max_size = endoff;
3548 }
3549 if (in->wanted_max_size > in->max_size &&
3550 in->wanted_max_size > in->requested_max_size)
3551 check_caps(in, 0);
3552 }
3553
3554 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3555 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3556 waitfor_caps = true;
3557 }
3558 if (!in->cap_snaps.empty()) {
3559 if (in->cap_snaps.rbegin()->second.writing) {
3560 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3561 waitfor_caps = true;
3562 }
3563 for (auto &p : in->cap_snaps) {
3564 if (p.second.dirty_data) {
3565 waitfor_commit = true;
3566 break;
3567 }
3568 }
3569 if (waitfor_commit) {
3570 _flush(in, new C_Client_FlushComplete(this, in));
3571 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3572 }
3573 }
3574 }
3575
3576 if (!waitfor_caps && !waitfor_commit) {
3577 if ((have & need) == need) {
3578 int revoking = implemented & ~have;
3579 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3580 << " need " << ccap_string(need) << " want " << ccap_string(want)
3581 << " revoking " << ccap_string(revoking)
3582 << dendl;
3583 if ((revoking & want) == 0) {
3584 *phave = need | (have & want);
3585 in->get_cap_ref(need);
3586 cap_hit();
3587 return 0;
3588 }
3589 }
3590 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3591 waitfor_caps = true;
3592 }
3593
3594 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3595 in->auth_cap->session->readonly)
3596 return -CEPHFS_EROFS;
3597
3598 if (in->flags & I_CAP_DROPPED) {
3599 int mds_wanted = in->caps_mds_wanted();
3600 if ((mds_wanted & need) != need) {
3601 int ret = _renew_caps(in);
3602 if (ret < 0)
3603 return ret;
3604 continue;
3605 }
3606 if (!(file_wanted & ~mds_wanted))
3607 in->flags &= ~I_CAP_DROPPED;
3608 }
3609
3610 if (waitfor_caps)
3611 wait_on_list(in->waitfor_caps);
3612 else if (waitfor_commit)
3613 wait_on_list(in->waitfor_commit);
3614 }
3615 }
3616
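// Editor's note -- illustrative usage sketch, not upstream code: callers of
// get_caps() pair it with put_cap_ref(). A buffered write path would look
// roughly like this, assuming client_lock is held, fh is an open handle,
// in = fh->inode.get(), and offset/size are the caller's locals:
//
//   int have;
//   int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
//                    &have, offset + size);   // may block on max_size
//   if (r < 0)
//     return r;
//   // ... do the write, buffering only if (have & CEPH_CAP_FILE_BUFFER) ...
//   put_cap_ref(in, CEPH_CAP_FILE_WR);        // drop the ref get_caps took
//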
3617 int Client::get_caps_used(Inode *in)
3618 {
3619 unsigned used = in->caps_used();
3620 if (!(used & CEPH_CAP_FILE_CACHE) &&
3621 !objectcacher->set_is_empty(&in->oset))
3622 used |= CEPH_CAP_FILE_CACHE;
3623 return used;
3624 }
3625
3626 void Client::cap_delay_requeue(Inode *in)
3627 {
3628 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3629
3630 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
3631 delayed_list.push_back(&in->delay_cap_item);
3632 }
3633
3634 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3635 int flags, int used, int want, int retain,
3636 int flush, ceph_tid_t flush_tid)
3637 {
3638 int held = cap->issued | cap->implemented;
3639 int revoking = cap->implemented & ~cap->issued;
3640 retain &= ~revoking;
3641 int dropping = cap->issued & ~retain;
3642 int op = CEPH_CAP_OP_UPDATE;
3643
3644 ldout(cct, 10) << __func__ << " " << *in
3645 << " mds." << session->mds_num << " seq " << cap->seq
3646 << " used " << ccap_string(used)
3647 << " want " << ccap_string(want)
3648 << " flush " << ccap_string(flush)
3649 << " retain " << ccap_string(retain)
3650 << " held "<< ccap_string(held)
3651 << " revoking " << ccap_string(revoking)
3652 << " dropping " << ccap_string(dropping)
3653 << dendl;
3654
3655 if (cct->_conf->client_inject_release_failure && revoking) {
3656 const int would_have_issued = cap->issued & retain;
3657 const int would_have_implemented = cap->implemented & (cap->issued | used);
3658 // Simulated bug:
3659 // - tell the server we think issued is whatever they issued plus whatever we implemented
3660 // - leave what we have implemented in place
3661 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3662 cap->issued = cap->issued | cap->implemented;
3663
3664 // Make an exception for revoking xattr caps: we are injecting
3665 // failure to release other caps, but allow xattr because the client
3666 // will block on xattr ops if it can't release these to the MDS (#9800)
3667 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3668 cap->issued ^= xattr_mask & revoking;
3669 cap->implemented ^= xattr_mask & revoking;
3670
3671 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3672 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3673 } else {
3674 // Normal behaviour
3675 cap->issued &= retain;
3676 cap->implemented &= cap->issued | used;
3677 }
3678
3679 snapid_t follows = 0;
3680
3681 if (flush)
3682 follows = in->snaprealm->get_snap_context().seq;
3683
3684 auto m = make_message<MClientCaps>(op,
3685 in->ino,
3686 0,
3687 cap->cap_id, cap->seq,
3688 cap->implemented,
3689 want,
3690 flush,
3691 cap->mseq,
3692 cap_epoch_barrier);
3693 m->caller_uid = in->cap_dirtier_uid;
3694 m->caller_gid = in->cap_dirtier_gid;
3695
3696 m->head.issue_seq = cap->issue_seq;
3697 m->set_tid(flush_tid);
3698
3699 m->head.uid = in->uid;
3700 m->head.gid = in->gid;
3701 m->head.mode = in->mode;
3702
3703 m->head.nlink = in->nlink;
3704
3705 if (flush & CEPH_CAP_XATTR_EXCL) {
3706 encode(in->xattrs, m->xattrbl);
3707 m->head.xattr_version = in->xattr_version;
3708 }
3709
3710 m->size = in->size;
3711 m->max_size = in->max_size;
3712 m->truncate_seq = in->truncate_seq;
3713 m->truncate_size = in->truncate_size;
3714 m->mtime = in->mtime;
3715 m->atime = in->atime;
3716 m->ctime = in->ctime;
3717 m->btime = in->btime;
3718 m->time_warp_seq = in->time_warp_seq;
3719 m->change_attr = in->change_attr;
3720
3721 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3722 !in->cap_snaps.empty() &&
3723 in->cap_snaps.rbegin()->second.flush_tid == 0)
3724 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3725 m->flags = flags;
3726
3727 if (flush & CEPH_CAP_FILE_WR) {
3728 m->inline_version = in->inline_version;
3729 m->inline_data = in->inline_data;
3730 }
3731
3732 in->reported_size = in->size;
3733 m->set_snap_follows(follows);
3734 cap->wanted = want;
3735 if (cap == in->auth_cap) {
3736 if (want & CEPH_CAP_ANY_FILE_WR) {
3737 m->set_max_size(in->wanted_max_size);
3738 in->requested_max_size = in->wanted_max_size;
3739 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3740 } else {
3741 in->requested_max_size = 0;
3742 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3743 }
3744 }
3745
3746 if (!session->flushing_caps_tids.empty())
3747 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3748
3749 session->con->send_message2(std::move(m));
3750 }
3751
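// Worked example of the bit arithmetic at the top of send_cap() (values
// chosen for illustration): if issued = Fc|Fr, implemented = Fc|Fr|Fb and
// the caller passes retain = Fr, then
//   revoking = implemented & ~issued  = Fb   (the MDS wants Fb back)
//   retain  &= ~revoking              = Fr   (never retain revoked caps)
//   dropping = issued & ~retain       = Fc   (released by this message)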
3752 static bool is_max_size_approaching(Inode *in)
3753 {
3754 /* mds will adjust max size according to the reported size */
3755 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3756 return false;
3757 if (in->size >= in->max_size)
3758 return true;
3759 /* half of previous max_size increment has been used */
3760 if (in->max_size > in->reported_size &&
3761 (in->size << 1) >= in->max_size + in->reported_size)
3762 return true;
3763 return false;
3764 }
3765
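// Worked example of the "half the increment used" test above: with
// reported_size = 4 MB and max_size = 8 MB, the test fires once size
// reaches 6 MB, since (6 << 1) = 12 >= 8 + 4 -- i.e. we ask the MDS for
// more room before actually hitting the limit.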
3766 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3767 {
3768 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3769 return used;
3770 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3771 return used;
3772
3773 if (issued & CEPH_CAP_FILE_LAZYIO) {
3774 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3775 used &= ~CEPH_CAP_FILE_CACHE;
3776 used |= CEPH_CAP_FILE_LAZYIO;
3777 }
3778 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3779 used &= ~CEPH_CAP_FILE_BUFFER;
3780 used |= CEPH_CAP_FILE_LAZYIO;
3781 }
3782 } else {
3783 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3784 used &= ~CEPH_CAP_FILE_CACHE;
3785 used |= CEPH_CAP_FILE_LAZYIO;
3786 }
3787 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3788 used &= ~CEPH_CAP_FILE_BUFFER;
3789 used |= CEPH_CAP_FILE_LAZYIO;
3790 }
3791 }
3792 return used;
3793 }
3794
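// Illustrative example: a LAZYIO client may have used = Fc while the MDS
// has issued only Fl (Fc was revoked but is still implemented). The remap
// above reports used = Fl instead, so check_caps()/send_cap() never claim
// use of a cap bit the MDS no longer thinks this client holds.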
3795 /**
3796 * check_caps
3797 *
3798 * Examine currently used and wanted versus held caps. Release, flush or ack
3799 * revoked caps to the MDS as appropriate.
3800 *
3801 * @param in the inode to check
3802 * @param flags flags to apply to cap check
3803 */
3804 void Client::check_caps(Inode *in, unsigned flags)
3805 {
3806 unsigned wanted = in->caps_wanted();
3807 unsigned used = get_caps_used(in);
3808 unsigned cap_used;
3809
3810 int implemented;
3811 int issued = in->caps_issued(&implemented);
3812 int revoking = implemented & ~issued;
3813
3814 int orig_used = used;
3815 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3816
3817 int retain = wanted | used | CEPH_CAP_PIN;
3818 if (!is_unmounting() && in->nlink > 0) {
3819 if (wanted) {
3820 retain |= CEPH_CAP_ANY;
3821 } else if (in->is_dir() &&
3822 (issued & CEPH_CAP_FILE_SHARED) &&
3823 (in->flags & I_COMPLETE)) {
3824 // we do this here because we don't want to drop to Fs (and then
3825 // drop the Fs if we do a create!) if that alone makes us send lookups
3826 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere.
3827 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3828 retain |= wanted;
3829 } else {
3830 retain |= CEPH_CAP_ANY_SHARED;
3831 // keep RD only if we didn't have the file open RW,
3832 // because then the mds would revoke it anyway to
3833 // journal max_size=0.
3834 if (in->max_size == 0)
3835 retain |= CEPH_CAP_ANY_RD;
3836 }
3837 }
3838
3839 ldout(cct, 10) << __func__ << " on " << *in
3840 << " wanted " << ccap_string(wanted)
3841 << " used " << ccap_string(used)
3842 << " issued " << ccap_string(issued)
3843 << " revoking " << ccap_string(revoking)
3844 << " flags=" << flags
3845 << dendl;
3846
3847 if (in->snapid != CEPH_NOSNAP)
3848 return; //snap caps last forever, can't write
3849
3850 if (in->caps.empty())
3851 return; // guard if at end of func
3852
3853 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3854 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3855 if (_release(in))
3856 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3857 }
3858
3859 for (auto &[mds, cap] : in->caps) {
3860 auto session = mds_sessions.at(mds);
3861
3862 cap_used = used;
3863 if (in->auth_cap && &cap != in->auth_cap)
3864 cap_used &= ~in->auth_cap->issued;
3865
3866 revoking = cap.implemented & ~cap.issued;
3867
3868 ldout(cct, 10) << " cap mds." << mds
3869 << " issued " << ccap_string(cap.issued)
3870 << " implemented " << ccap_string(cap.implemented)
3871 << " revoking " << ccap_string(revoking) << dendl;
3872
3873 if (in->wanted_max_size > in->max_size &&
3874 in->wanted_max_size > in->requested_max_size &&
3875 &cap == in->auth_cap)
3876 goto ack;
3877
3878 /* approaching file_max? */
3879 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3880 &cap == in->auth_cap &&
3881 is_max_size_approaching(in)) {
3882 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3883 << ", reported " << in->reported_size << dendl;
3884 goto ack;
3885 }
3886
3887 /* completed revocation? */
3888 if (revoking && (revoking & cap_used) == 0) {
3889 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3890 goto ack;
3891 }
3892
3893 /* want more caps from mds? */
3894 if (wanted & ~(cap.wanted | cap.issued))
3895 goto ack;
3896
3897 if (!revoking && is_unmounting() && (cap_used == 0))
3898 goto ack;
3899
3900 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3901 !in->dirty_caps) // and we have no dirty caps
3902 continue;
3903
3904 if (!(flags & CHECK_CAPS_NODELAY)) {
3905 ldout(cct, 10) << "delaying cap release" << dendl;
3906 cap_delay_requeue(in);
3907 continue;
3908 }
3909
3910 ack:
3911 if (&cap == in->auth_cap) {
3912 if (in->flags & I_KICK_FLUSH) {
3913 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3914 << " to mds." << mds << dendl;
3915 kick_flushing_caps(in, session.get());
3916 }
3917 if (!in->cap_snaps.empty() &&
3918 in->cap_snaps.rbegin()->second.flush_tid == 0)
3919 flush_snaps(in);
3920 }
3921
3922 int flushing;
3923 int msg_flags = 0;
3924 ceph_tid_t flush_tid;
3925 if (in->auth_cap == &cap && in->dirty_caps) {
3926 flushing = mark_caps_flushing(in, &flush_tid);
3927 if (flags & CHECK_CAPS_SYNCHRONOUS)
3928 msg_flags |= MClientCaps::FLAG_SYNC;
3929 } else {
3930 flushing = 0;
3931 flush_tid = 0;
3932 }
3933
3934 in->delay_cap_item.remove_myself();
3935 send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
3936 flushing, flush_tid);
3937 }
3938 }
3939
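// Condensed decision sketch for the per-cap loop above (editor's summary):
//   for each (mds, cap) on the inode:
//     wanted_max_size grew (auth cap)            -> ack: send update
//     size approaching max_size (auth cap, Fw)   -> ack: send update
//     a revocation has fully completed           -> ack: confirm to MDS
//     we want caps the MDS doesn't know about    -> ack: request them
//     unmounting and cap now unused              -> ack: release
//     nothing unwanted held and no dirty caps    -> skip this cap
//     CHECK_CAPS_NODELAY not set                 -> requeue with delay, skip
//     otherwise                                  -> fall through to ack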
3940
3941 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3942 {
3943 int used = get_caps_used(in);
3944 int dirty = in->caps_dirty();
3945 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3946
3947 if (in->cap_snaps.size() &&
3948 in->cap_snaps.rbegin()->second.writing) {
3949 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3950 return;
3951 } else if (in->caps_dirty() ||
3952 (used & CEPH_CAP_FILE_WR) ||
3953 (dirty & CEPH_CAP_ANY_WR)) {
3954 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3955 ceph_assert(capsnapem.second); /* element inserted */
3956 CapSnap &capsnap = capsnapem.first->second;
3957 capsnap.context = old_snapc;
3958 capsnap.issued = in->caps_issued();
3959 capsnap.dirty = in->caps_dirty();
3960
3961 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3962
3963 capsnap.uid = in->uid;
3964 capsnap.gid = in->gid;
3965 capsnap.mode = in->mode;
3966 capsnap.btime = in->btime;
3967 capsnap.xattrs = in->xattrs;
3968 capsnap.xattr_version = in->xattr_version;
3969 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3970 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3971
3972 if (used & CEPH_CAP_FILE_WR) {
3973 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3974 capsnap.writing = 1;
3975 } else {
3976 finish_cap_snap(in, capsnap, used);
3977 }
3978 } else {
3979 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3980 }
3981 }
3982
3983 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3984 {
3985 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3986 capsnap.size = in->size;
3987 capsnap.mtime = in->mtime;
3988 capsnap.atime = in->atime;
3989 capsnap.ctime = in->ctime;
3990 capsnap.time_warp_seq = in->time_warp_seq;
3991 capsnap.change_attr = in->change_attr;
3992 capsnap.dirty |= in->caps_dirty();
3993
3994 /* Only reset it if it wasn't set before */
3995 if (capsnap.cap_dirtier_uid == -1) {
3996 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3997 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3998 }
3999
4000 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4001 capsnap.inline_data = in->inline_data;
4002 capsnap.inline_version = in->inline_version;
4003 }
4004
4005 if (used & CEPH_CAP_FILE_BUFFER) {
4006 capsnap.writing = 1;
4007 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
4008 << " WRBUFFER, delaying" << dendl;
4009 } else {
4010 capsnap.dirty_data = 0;
4011 flush_snaps(in);
4012 }
4013 }
4014
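// Capsnap lifecycle sketch (condensed from the two functions above):
//   queue_cap_snap(): a snap taken while the inode is dirty creates a
//     CapSnap; if Fw is in use, capsnap.writing is set and we wait for
//     the write path, otherwise we fall through to finish_cap_snap().
//   finish_cap_snap(): capture size/times/change_attr; if Fb is still in
//     use the buffered data must be written back first (flush_snaps() is
//     deferred until objectcacher writeback completes), otherwise
//     flush_snaps() sends CEPH_CAP_OP_FLUSHSNAP to the auth MDS right away.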
4015 void Client::send_flush_snap(Inode *in, MetaSession *session,
4016 snapid_t follows, CapSnap& capsnap)
4017 {
4018 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4019 in->ino, in->snaprealm->ino, 0,
4020 in->auth_cap->mseq, cap_epoch_barrier);
4021 m->caller_uid = capsnap.cap_dirtier_uid;
4022 m->caller_gid = capsnap.cap_dirtier_gid;
4023
4024 m->set_client_tid(capsnap.flush_tid);
4025 m->head.snap_follows = follows;
4026
4027 m->head.caps = capsnap.issued;
4028 m->head.dirty = capsnap.dirty;
4029
4030 m->head.uid = capsnap.uid;
4031 m->head.gid = capsnap.gid;
4032 m->head.mode = capsnap.mode;
4033 m->btime = capsnap.btime;
4034
4035 m->size = capsnap.size;
4036
4037 m->head.xattr_version = capsnap.xattr_version;
4038 encode(capsnap.xattrs, m->xattrbl);
4039
4040 m->ctime = capsnap.ctime;
4041 m->btime = capsnap.btime;
4042 m->mtime = capsnap.mtime;
4043 m->atime = capsnap.atime;
4044 m->time_warp_seq = capsnap.time_warp_seq;
4045 m->change_attr = capsnap.change_attr;
4046
4047 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4048 m->inline_version = in->inline_version;
4049 m->inline_data = in->inline_data;
4050 }
4051
4052 ceph_assert(!session->flushing_caps_tids.empty());
4053 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4054
4055 session->con->send_message2(std::move(m));
4056 }
4057
4058 void Client::flush_snaps(Inode *in)
4059 {
4060 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
4061 ceph_assert(in->cap_snaps.size());
4062
4063 // pick auth mds
4064 ceph_assert(in->auth_cap);
4065 MetaSession *session = in->auth_cap->session;
4066
4067 for (auto &p : in->cap_snaps) {
4068 CapSnap &capsnap = p.second;
4069 // only flush capsnaps that haven't been flushed yet
4070 if (capsnap.flush_tid > 0)
4071 continue;
4072
4073 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
4074 << " follows " << p.first
4075 << " size " << capsnap.size
4076 << " mtime " << capsnap.mtime
4077 << " dirty_data=" << capsnap.dirty_data
4078 << " writing=" << capsnap.writing
4079 << " on " << *in << dendl;
4080 if (capsnap.dirty_data || capsnap.writing)
4081 break;
4082
4083 capsnap.flush_tid = ++last_flush_tid;
4084 session->flushing_caps_tids.insert(capsnap.flush_tid);
4085 in->flushing_cap_tids[capsnap.flush_tid] = 0;
4086 if (!in->flushing_cap_item.is_on_list())
4087 session->flushing_caps.push_back(&in->flushing_cap_item);
4088
4089 send_flush_snap(in, session, p.first, capsnap);
4090 }
4091 }
4092
4093 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
4094 {
4095 ceph::condition_variable cond;
4096 ls.push_back(&cond);
4097 std::unique_lock l{client_lock, std::adopt_lock};
4098 cond.wait(l);
4099 l.release();
4100 ls.remove(&cond);
4101 }
4102
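// Editor's note on the locking pattern above (also used in
// wait_on_context_list() below): client_lock is already held by the
// caller, so the unique_lock adopts it instead of re-locking, and
// l.release() hands ownership back without unlocking -- only the
// cond.wait() call itself temporarily drops the mutex.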
4103 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4104 {
4105 for (auto cond : ls) {
4106 cond->notify_all();
4107 }
4108 }
4109
4110 void Client::wait_on_context_list(list<Context*>& ls)
4111 {
4112 ceph::condition_variable cond;
4113 bool done = false;
4114 int r;
4115 ls.push_back(new C_Cond(cond, &done, &r));
4116 std::unique_lock l{client_lock, std::adopt_lock};
4117 cond.wait(l, [&done] { return done;});
4118 l.release();
4119 }
4120
4121 void Client::signal_context_list(list<Context*>& ls)
4122 {
4123 while (!ls.empty()) {
4124 ls.front()->complete(0);
4125 ls.pop_front();
4126 }
4127 }
4128
4129 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4130 {
4131 for (const auto &cap : s->caps) {
4132 auto &in = cap->inode;
4133 if (reconnect) {
4134 in.requested_max_size = 0;
4135 in.wanted_max_size = 0;
4136 } else {
4137 if (cap->gen < s->cap_gen) {
4138 // mds did not re-issue stale cap.
4139 cap->issued = cap->implemented = CEPH_CAP_PIN;
4140 // make sure mds knows what we want.
4141 if (in.caps_file_wanted() & ~cap->wanted)
4142 in.flags |= I_CAP_DROPPED;
4143 }
4144 }
4145 signal_cond_list(in.waitfor_caps);
4146 }
4147 }
4148
4149
4150 // flush dirty data (from objectcache)
4151
4152 class C_Client_CacheInvalidate : public Context {
4153 private:
4154 Client *client;
4155 vinodeno_t ino;
4156 int64_t offset, length;
4157 public:
4158 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4159 client(c), offset(off), length(len) {
4160 if (client->use_faked_inos())
4161 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4162 else
4163 ino = in->vino();
4164 }
4165 void finish(int r) override {
4166 // _async_invalidate takes the lock when it needs to; call this back from outside the lock.
4167 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4168 client->_async_invalidate(ino, offset, length);
4169 }
4170 };
4171
4172 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4173 {
4174 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4175 if (!mref_reader.is_state_satisfied())
4176 return;
4177
4178 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4179 ino_invalidate_cb(callback_handle, ino, off, len);
4180 }
4181
4182 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4183
4184 if (ino_invalidate_cb)
4185 // we queue the invalidate, which calls the callback and decrements the ref
4186 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4187 }
4188
4189 void Client::_invalidate_inode_cache(Inode *in)
4190 {
4191 ldout(cct, 10) << __func__ << " " << *in << dendl;
4192
4193 // invalidate our userspace inode cache
4194 if (cct->_conf->client_oc) {
4195 objectcacher->release_set(&in->oset);
4196 if (!objectcacher->set_is_empty(&in->oset))
4197 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4198 }
4199
4200 _schedule_invalidate_callback(in, 0, 0);
4201 }
4202
4203 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4204 {
4205 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4206
4207 // invalidate our userspace inode cache
4208 if (cct->_conf->client_oc) {
4209 vector<ObjectExtent> ls;
4210 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4211 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4212 }
4213
4214 _schedule_invalidate_callback(in, off, len);
4215 }
4216
4217 bool Client::_release(Inode *in)
4218 {
4219 ldout(cct, 20) << "_release " << *in << dendl;
4220 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4221 _invalidate_inode_cache(in);
4222 return true;
4223 }
4224 return false;
4225 }
4226
4227 bool Client::_flush(Inode *in, Context *onfinish)
4228 {
4229 ldout(cct, 10) << "_flush " << *in << dendl;
4230
4231 if (!in->oset.dirty_or_tx) {
4232 ldout(cct, 10) << " nothing to flush" << dendl;
4233 onfinish->complete(0);
4234 return true;
4235 }
4236
4237 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4238 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4239 objectcacher->purge_set(&in->oset);
4240 if (onfinish) {
4241 onfinish->complete(-CEPHFS_ENOSPC);
4242 }
4243 return true;
4244 }
4245
4246 return objectcacher->flush_set(&in->oset, onfinish);
4247 }
4248
4249 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4250 {
4251 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4252 if (!in->oset.dirty_or_tx) {
4253 ldout(cct, 10) << " nothing to flush" << dendl;
4254 return;
4255 }
4256
4257 C_SaferCond onflush("Client::_flush_range flock");
4258 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4259 offset, size, &onflush);
4260 if (!ret) {
4261 // wait for flush
4262 client_lock.unlock();
4263 onflush.wait();
4264 client_lock.lock();
4265 }
4266 }
4267
4268 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4269 {
4270 // std::scoped_lock l(client_lock);
4271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4272 Inode *in = static_cast<Inode *>(oset->parent);
4273 ceph_assert(in);
4274 _flushed(in);
4275 }
4276
4277 void Client::_flushed(Inode *in)
4278 {
4279 ldout(cct, 10) << "_flushed " << *in << dendl;
4280
4281 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4282 }
4283
4284
4285
4286 // checks common to add_update_cap, handle_cap_grant
4287 void Client::check_cap_issue(Inode *in, unsigned issued)
4288 {
4289 unsigned had = in->caps_issued();
4290
4291 if ((issued & CEPH_CAP_FILE_CACHE) &&
4292 !(had & CEPH_CAP_FILE_CACHE))
4293 in->cache_gen++;
4294
4295 if ((issued & CEPH_CAP_FILE_SHARED) !=
4296 (had & CEPH_CAP_FILE_SHARED)) {
4297 if (issued & CEPH_CAP_FILE_SHARED)
4298 in->shared_gen++;
4299 if (in->is_dir())
4300 clear_dir_complete_and_ordered(in, true);
4301 }
4302 }
4303
4304 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4305 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4306 inodeno_t realm, int flags, const UserPerm& cap_perms)
4307 {
4308 if (!in->is_any_caps()) {
4309 ceph_assert(in->snaprealm == 0);
4310 in->snaprealm = get_snap_realm(realm);
4311 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4312 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4313 } else {
4314 ceph_assert(in->snaprealm);
4315 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4316 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4317 in->snaprealm_item.remove_myself();
4318 auto oldrealm = in->snaprealm;
4319 in->snaprealm = get_snap_realm(realm);
4320 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4321 put_snap_realm(oldrealm);
4322 }
4323 }
4324
4325 mds_rank_t mds = mds_session->mds_num;
4326 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4327 Cap &cap = capem.first->second;
4328 if (!capem.second) {
4329 if (cap.gen < mds_session->cap_gen)
4330 cap.issued = cap.implemented = CEPH_CAP_PIN;
4331
4332 /*
4333 * auth mds of the inode changed. we received the cap export
4334 * message, but still haven't received the cap import message.
4335 * handle_cap_export() updated the new auth MDS' cap.
4336 *
4337 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4338 * a message that was sent before the cap import message. So
4339 * don't remove caps.
4340 */
4341 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4342 if (&cap != in->auth_cap)
4343 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4344
4345 ceph_assert(cap.cap_id == cap_id);
4346 seq = cap.seq;
4347 mseq = cap.mseq;
4348 issued |= cap.issued;
4349 flags |= CEPH_CAP_FLAG_AUTH;
4350 }
4351 } else {
4352 inc_pinned_icaps();
4353 }
4354
4355 check_cap_issue(in, issued);
4356
4357 if (flags & CEPH_CAP_FLAG_AUTH) {
4358 if (in->auth_cap != &cap &&
4359 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4360 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4361 ldout(cct, 10) << __func__ << " changing auth cap: "
4362 << "add myself to new auth MDS' flushing caps list" << dendl;
4363 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4364 }
4365 in->auth_cap = &cap;
4366 }
4367 }
4368
4369 unsigned old_caps = cap.issued;
4370 cap.cap_id = cap_id;
4371 cap.issued = issued;
4372 cap.implemented |= issued;
4373 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4374 cap.wanted = wanted;
4375 else
4376 cap.wanted |= wanted;
4377 cap.seq = seq;
4378 cap.issue_seq = seq;
4379 cap.mseq = mseq;
4380 cap.gen = mds_session->cap_gen;
4381 cap.latest_perms = cap_perms;
4382 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4383 << " from mds." << mds
4384 << " on " << *in
4385 << dendl;
4386
4387 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4388 // non-auth MDS is revoking the newly grant caps ?
4389 for (auto &p : in->caps) {
4390 if (&p.second == &cap)
4391 continue;
4392 if (p.second.implemented & ~p.second.issued & issued) {
4393 check_caps(in, CHECK_CAPS_NODELAY);
4394 break;
4395 }
4396 }
4397 }
4398
4399 if (issued & ~old_caps)
4400 signal_cond_list(in->waitfor_caps);
4401 }
4402
4403 void Client::remove_cap(Cap *cap, bool queue_release)
4404 {
4405 auto &in = cap->inode;
4406 MetaSession *session = cap->session;
4407 mds_rank_t mds = cap->session->mds_num;
4408
4409 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4410
4411 if (queue_release) {
4412 session->enqueue_cap_release(
4413 in.ino,
4414 cap->cap_id,
4415 cap->issue_seq,
4416 cap->mseq,
4417 cap_epoch_barrier);
4418 } else {
4419 dec_pinned_icaps();
4420 }
4421
4422
4423 if (in.auth_cap == cap) {
4424 if (in.flushing_cap_item.is_on_list()) {
4425 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4426 in.flushing_cap_item.remove_myself();
4427 }
4428 in.auth_cap = NULL;
4429 }
4430 size_t n = in.caps.erase(mds);
4431 ceph_assert(n == 1);
4432 cap = nullptr;
4433
4434 if (!in.is_any_caps()) {
4435 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4436 in.snaprealm_item.remove_myself();
4437 put_snap_realm(in.snaprealm);
4438 in.snaprealm = 0;
4439 }
4440 }
4441
4442 void Client::remove_all_caps(Inode *in)
4443 {
4444 while (!in->caps.empty())
4445 remove_cap(&in->caps.begin()->second, true);
4446 }
4447
4448 void Client::remove_session_caps(MetaSession *s, int err)
4449 {
4450 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4451
4452 while (s->caps.size()) {
4453 Cap *cap = *s->caps.begin();
4454 InodeRef in(&cap->inode);
4455 bool dirty_caps = false;
4456 if (in->auth_cap == cap) {
4457 dirty_caps = in->dirty_caps | in->flushing_caps;
4458 in->wanted_max_size = 0;
4459 in->requested_max_size = 0;
4460 if (in->has_any_filelocks())
4461 in->flags |= I_ERROR_FILELOCK;
4462 }
4463 auto caps = cap->implemented;
4464 if (cap->wanted | cap->issued)
4465 in->flags |= I_CAP_DROPPED;
4466 remove_cap(cap, false);
4467 in->cap_snaps.clear();
4468 if (dirty_caps) {
4469 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4470 if (in->flushing_caps) {
4471 num_flushing_caps--;
4472 in->flushing_cap_tids.clear();
4473 }
4474 in->flushing_caps = 0;
4475 in->mark_caps_clean();
4476 put_inode(in.get());
4477 }
4478 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4479 if (caps && !in->caps_issued_mask(caps, true)) {
4480 if (err == -CEPHFS_EBLOCKLISTED) {
4481 if (in->oset.dirty_or_tx) {
4482 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4483 in->set_async_err(err);
4484 }
4485 objectcacher->purge_set(&in->oset);
4486 } else {
4487 objectcacher->release_set(&in->oset);
4488 }
4489 _schedule_invalidate_callback(in.get(), 0, 0);
4490 }
4491
4492 signal_cond_list(in->waitfor_caps);
4493 }
4494 s->flushing_caps_tids.clear();
4495 sync_cond.notify_all();
4496 }
4497
4498 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4499 {
4500 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4501 bool abort_on_failure = false;
4502
4503 errno = 0;
4504 int r = remount_cb(callback_handle);
4505 if (r == 0) {
4506 retries_on_invalidate = 0;
4507 } else {
4508 int e = errno;
4509 client_t whoami = get_nodeid();
4510 if (r == -1) {
4511 lderr(cct) <<
4512 "failed to remount (to trim kernel dentries): "
4513 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4514 } else {
4515 lderr(cct) <<
4516 "failed to remount (to trim kernel dentries): "
4517 "return code = " << r << dendl;
4518 }
4519 bool should_abort =
4520 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4521 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4522 !(retry_on_error && (++retries_on_invalidate < max_retries));
4523 if (should_abort && !is_unmounting()) {
4524 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4525 abort_on_failure = true;
4526 }
4527 }
4528 return std::make_pair(r, abort_on_failure);
4529 }
4530
4531 class C_Client_Remount : public Context {
4532 private:
4533 Client *client;
4534 public:
4535 explicit C_Client_Remount(Client *c) : client(c) {}
4536 void finish(int r) override {
4537 ceph_assert(r == 0);
4538 client->_do_remount(true);
4539 }
4540 };
4541
4542 void Client::_invalidate_kernel_dcache()
4543 {
4544 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4545 if (!mref_reader.is_state_satisfied())
4546 return;
4547
4548 if (can_invalidate_dentries) {
4549 if (dentry_invalidate_cb && root->dir) {
4550 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4551 p != root->dir->dentries.end();
4552 ++p) {
4553 if (p->second->inode)
4554 _schedule_invalidate_dentry_callback(p->second, false);
4555 }
4556 }
4557 } else if (remount_cb) {
4558 // Hacky:
4559 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4560 remount_finisher.queue(new C_Client_Remount(this));
4561 }
4562 }
4563
4564 void Client::_trim_negative_child_dentries(InodeRef& in)
4565 {
4566 if (!in->is_dir())
4567 return;
4568
4569 Dir* dir = in->dir;
4570 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4571 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4572 Dentry *dn = p->second;
4573 ++p;
4574 ceph_assert(!dn->inode);
4575 if (dn->lru_is_expireable())
4576 unlink(dn, true, false); // keep dir, drop dentry
4577 }
4578 if (dir->dentries.empty()) {
4579 close_dir(dir);
4580 }
4581 }
4582
4583 if (in->flags & I_SNAPDIR_OPEN) {
4584 InodeRef snapdir = open_snapdir(in.get());
4585 _trim_negative_child_dentries(snapdir);
4586 }
4587 }
4588
4589 class C_Client_CacheRelease : public Context {
4590 private:
4591 Client *client;
4592 vinodeno_t ino;
4593 public:
4594 C_Client_CacheRelease(Client *c, Inode *in) :
4595 client(c) {
4596 if (client->use_faked_inos())
4597 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4598 else
4599 ino = in->vino();
4600 }
4601 void finish(int r) override {
4602 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4603 client->_async_inode_release(ino);
4604 }
4605 };
4606
4607 void Client::_async_inode_release(vinodeno_t ino)
4608 {
4609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4610 if (!mref_reader.is_state_satisfied())
4611 return;
4612
4613 ldout(cct, 10) << __func__ << " " << ino << dendl;
4614 ino_release_cb(callback_handle, ino);
4615 }
4616
4617 void Client::_schedule_ino_release_callback(Inode *in) {
4618
4619 if (ino_release_cb)
4620 // we queue the release, which calls the callback and decrements the ref
4621 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4622 }
4623
4624 void Client::trim_caps(MetaSession *s, uint64_t max)
4625 {
4626 mds_rank_t mds = s->mds_num;
4627 size_t caps_size = s->caps.size();
4628 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4629 << " caps " << caps_size << dendl;
4630
4631 uint64_t trimmed = 0;
4632 auto p = s->caps.begin();
4633 std::set<Dentry *> to_trim; /* this prevents caps other than the one we're
4634 * looking at from being deleted during traversal. */
4635 while ((caps_size - trimmed) > max && !p.end()) {
4636 Cap *cap = *p;
4637 InodeRef in(&cap->inode);
4638
4639 // Increment p early because it will be invalidated if cap
4640 // is deleted inside remove_cap
4641 ++p;
4642
4643 if (in->caps.size() > 1 && cap != in->auth_cap) {
4644 int mine = cap->issued | cap->implemented;
4645 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4646 // disposable non-auth cap
4647 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4648 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4649 cap = (remove_cap(cap, true), nullptr);
4650 trimmed++;
4651 }
4652 } else {
4653 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4654 _trim_negative_child_dentries(in);
4655 bool all = true;
4656 auto q = in->dentries.begin();
4657 while (q != in->dentries.end()) {
4658 Dentry *dn = *q;
4659 ++q;
4660 if (dn->lru_is_expireable()) {
4661 if (can_invalidate_dentries &&
4662 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
4663 // Only issue one of these per DN for inodes in root: handle
4664 // others more efficiently by calling for root-child DNs at
4665 // the end of this function.
4666 _schedule_invalidate_dentry_callback(dn, true);
4667 }
4668 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4669 to_trim.insert(dn);
4670 } else {
4671 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4672 all = false;
4673 }
4674 }
4675 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
4676 _schedule_ino_release_callback(in.get());
4677 }
4678 if (all && in->ino != CEPH_INO_ROOT) {
4679 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4680 trimmed++;
4681 }
4682 }
4683 }
4684 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4685 for (const auto &dn : to_trim) {
4686 trim_dentry(dn);
4687 }
4688 to_trim.clear();
4689
4690 caps_size = s->caps.size();
4691 if (caps_size > (size_t)max)
4692 _invalidate_kernel_dcache();
4693 }
4694
4695 void Client::force_session_readonly(MetaSession *s)
4696 {
4697 s->readonly = true;
4698 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4699 auto &in = (*p)->inode;
4700 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4701 signal_cond_list(in.waitfor_caps);
4702 }
4703 }
4704
4705 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4706 {
4707 MetaSession *session = in->auth_cap->session;
4708
4709 int flushing = in->dirty_caps;
4710 ceph_assert(flushing);
4711
4712 ceph_tid_t flush_tid = ++last_flush_tid;
4713 in->flushing_cap_tids[flush_tid] = flushing;
4714
4715 if (!in->flushing_caps) {
4716 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4717 num_flushing_caps++;
4718 } else {
4719 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4720 }
4721
4722 in->flushing_caps |= flushing;
4723 in->mark_caps_clean();
4724
4725 if (!in->flushing_cap_item.is_on_list())
4726 session->flushing_caps.push_back(&in->flushing_cap_item);
4727 session->flushing_caps_tids.insert(flush_tid);
4728
4729 *ptid = flush_tid;
4730 return flushing;
4731 }
4732
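// Accounting sketch (editor's note): mark_caps_flushing() moves the
// inode's dirty_caps into flushing_caps under a fresh flush_tid, records
// the tid both per-inode (flushing_cap_tids) and per-session
// (flushing_caps_tids), and links the inode on the session's
// flushing_caps list; handle_cap_flush_ack() later unwinds all three.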
4733 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4734 {
4735 for (auto &p : in->cap_snaps) {
4736 CapSnap &capsnap = p.second;
4737 if (capsnap.flush_tid > 0) {
4738 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4739 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4740 }
4741 }
4742 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4743 it != in->flushing_cap_tids.end();
4744 ++it) {
4745 old_s->flushing_caps_tids.erase(it->first);
4746 new_s->flushing_caps_tids.insert(it->first);
4747 }
4748 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4749 }
4750
4751 /*
4752 * Flush all the dirty caps back to the MDS. Because the callers
4753 * generally wait on the result of this function (syncfs and umount
4754 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4755 */
4756 void Client::flush_caps_sync()
4757 {
4758 ldout(cct, 10) << __func__ << dendl;
4759 for (auto &q : mds_sessions) {
4760 auto s = q.second;
4761 xlist<Inode*>::iterator p = s->dirty_list.begin();
4762 while (!p.end()) {
4763 unsigned flags = CHECK_CAPS_NODELAY;
4764 Inode *in = *p;
4765
4766 ++p;
4767 if (p.end())
4768 flags |= CHECK_CAPS_SYNCHRONOUS;
4769 check_caps(in, flags);
4770 }
4771 }
4772 }
4773
4774 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4775 {
4776 while (in->flushing_caps) {
4777 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4778 ceph_assert(it != in->flushing_cap_tids.end());
4779 if (it->first > want)
4780 break;
4781 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4782 << ccap_string(it->second) << " want " << want
4783 << " last " << it->first << dendl;
4784 wait_on_list(in->waitfor_caps);
4785 }
4786 }
4787
4788 void Client::wait_sync_caps(ceph_tid_t want)
4789 {
4790 retry:
4791 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4792 << num_flushing_caps << " total flushing)" << dendl;
4793 for (auto &p : mds_sessions) {
4794 auto s = p.second;
4795 if (s->flushing_caps_tids.empty())
4796 continue;
4797 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4798 if (oldest_tid <= want) {
4799 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4800 << " (want " << want << ")" << dendl;
4801 std::unique_lock l{client_lock, std::adopt_lock};
4802 sync_cond.wait(l);
4803 l.release();
4804 goto retry;
4805 }
4806 }
4807 }
4808
4809 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4810 {
4811 in->flags &= ~I_KICK_FLUSH;
4812
4813 Cap *cap = in->auth_cap;
4814 ceph_assert(cap->session == session);
4815
4816 ceph_tid_t last_snap_flush = 0;
4817 for (auto p = in->flushing_cap_tids.rbegin();
4818 p != in->flushing_cap_tids.rend();
4819 ++p) {
4820 if (!p->second) {
4821 last_snap_flush = p->first;
4822 break;
4823 }
4824 }
4825
4826 int wanted = in->caps_wanted();
4827 int used = get_caps_used(in) | in->caps_dirty();
4828 auto it = in->cap_snaps.begin();
4829 for (auto& p : in->flushing_cap_tids) {
4830 if (p.second) {
4831 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4832 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4833 p.second, p.first);
4834 } else {
4835 ceph_assert(it != in->cap_snaps.end());
4836 ceph_assert(it->second.flush_tid == p.first);
4837 send_flush_snap(in, session, it->first, it->second);
4838 ++it;
4839 }
4840 }
4841 }
4842
4843 void Client::kick_flushing_caps(MetaSession *session)
4844 {
4845 mds_rank_t mds = session->mds_num;
4846 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4847
4848 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4849 Inode *in = *p;
4850 if (in->flags & I_KICK_FLUSH) {
4851 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4852 kick_flushing_caps(in, session);
4853 }
4854 }
4855 }
4856
4857 void Client::early_kick_flushing_caps(MetaSession *session)
4858 {
4859 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4860 Inode *in = *p;
4861 Cap *cap = in->auth_cap;
4862 ceph_assert(cap);
4863
4864 // if flushing caps were revoked, we re-send the cap flush during the client
4865 // reconnect stage. This guarantees that the MDS processes the cap flush message
4866 // before issuing the flushing caps to another client.
4867 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4868 in->flags |= I_KICK_FLUSH;
4869 continue;
4870 }
4871
4872 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4873 << " to mds." << session->mds_num << dendl;
4874 // send_reconnect() will also reset these sequence numbers. make sure the
4875 // sequence numbers in the cap flush message match the later reconnect message.
4876 cap->seq = 0;
4877 cap->issue_seq = 0;
4878 cap->mseq = 0;
4879 cap->issued = cap->implemented;
4880
4881 kick_flushing_caps(in, session);
4882 }
4883 }
4884
4885 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4886 {
4887 list<SnapRealm*> q;
4888 q.push_back(realm);
4889
4890 while (!q.empty()) {
4891 realm = q.front();
4892 q.pop_front();
4893
4894 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4895 realm->invalidate_cache();
4896
4897 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4898 p != realm->pchildren.end();
4899 ++p)
4900 q.push_back(*p);
4901 }
4902 }
4903
4904 SnapRealm *Client::get_snap_realm(inodeno_t r)
4905 {
4906 SnapRealm *realm = snap_realms[r];
4907
4908 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
4909 << (realm ? realm->nref : 0) << dendl;
4910 if (!realm) {
4911 snap_realms[r] = realm = new SnapRealm(r);
4912
4913 // Do not release the global snaprealm until unmounting.
4914 if (r == CEPH_INO_GLOBAL_SNAPREALM)
4915 realm->nref++;
4916 }
4917
4918 realm->nref++;
4919 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
4920 << realm->nref << dendl;
4921 return realm;
4922 }
4923
4924 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4925 {
4926 if (snap_realms.count(r) == 0) {
4927 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4928 return NULL;
4929 }
4930 SnapRealm *realm = snap_realms[r];
4931 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4932 realm->nref++;
4933 return realm;
4934 }
4935
4936 void Client::put_snap_realm(SnapRealm *realm)
4937 {
4938 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4939 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4940 if (--realm->nref == 0) {
4941 snap_realms.erase(realm->ino);
4942 if (realm->pparent) {
4943 realm->pparent->pchildren.erase(realm);
4944 put_snap_realm(realm->pparent);
4945 }
4946 delete realm;
4947 }
4948 }
4949
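// Refcount sketch for the three functions above (editor's note): every
// successful get_snap_realm()/get_snap_realm_maybe() returns with one
// extra reference held for the caller (the global snaprealm additionally
// keeps a pin until unmount), and each must be balanced by exactly one
// put_snap_realm(); when nref reaches 0 the realm is unlinked from
// snap_realms, its reference on pparent is dropped, and it is deleted.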
4950 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4951 {
4952 if (realm->parent != parent) {
4953 ldout(cct, 10) << __func__ << " " << *realm
4954 << " " << realm->parent << " -> " << parent << dendl;
4955 realm->parent = parent;
4956 if (realm->pparent) {
4957 realm->pparent->pchildren.erase(realm);
4958 put_snap_realm(realm->pparent);
4959 }
4960 realm->pparent = get_snap_realm(parent);
4961 realm->pparent->pchildren.insert(realm);
4962 return true;
4963 }
4964 return false;
4965 }
4966
4967 static bool has_new_snaps(const SnapContext& old_snapc,
4968 const SnapContext& new_snapc)
4969 {
4970 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4971 }
4972
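// Example: SnapContext::snaps is ordered newest-first, so with
// old_snapc.seq = 10, a new_snapc whose newest snap id (snaps[0]) is 12
// means a snapshot was taken after old_snapc was captured -- the callers
// below then queue a cap snap so dirty data is flushed to the old snap.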
4973
4974 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4975 {
4976 SnapRealm *first_realm = NULL;
4977 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4978
4979 map<SnapRealm*, SnapContext> dirty_realms;
4980
4981 auto p = bl.cbegin();
4982 while (!p.end()) {
4983 SnapRealmInfo info;
4984 decode(info, p);
4985 SnapRealm *realm = get_snap_realm(info.ino());
4986
4987 bool invalidate = false;
4988
4989 if (info.seq() > realm->seq) {
4990 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4991 << dendl;
4992
4993 if (flush) {
4994 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4995 // flush me + children
4996 list<SnapRealm*> q;
4997 q.push_back(realm);
4998 while (!q.empty()) {
4999 SnapRealm *realm = q.front();
5000 q.pop_front();
5001
5002 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5003 p != realm->pchildren.end();
5004 ++p)
5005 q.push_back(*p);
5006
5007 if (dirty_realms.count(realm) == 0) {
5008 realm->nref++;
5009 dirty_realms[realm] = realm->get_snap_context();
5010 }
5011 }
5012 }
5013
5014 // update
5015 realm->seq = info.seq();
5016 realm->created = info.created();
5017 realm->parent_since = info.parent_since();
5018 realm->prior_parent_snaps = info.prior_parent_snaps;
5019 realm->my_snaps = info.my_snaps;
5020 invalidate = true;
5021 }
5022
5023 // _always_ verify parent
5024 if (adjust_realm_parent(realm, info.parent()))
5025 invalidate = true;
5026
5027 if (invalidate) {
5028 invalidate_snaprealm_and_children(realm);
5029 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
5030 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5031 } else {
5032 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
5033 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5034 }
5035
5036 if (!first_realm)
5037 first_realm = realm;
5038 else
5039 put_snap_realm(realm);
5040 }
5041
5042 for (auto &[realm, snapc] : dirty_realms) {
5043 // are there new snaps?
5044 if (has_new_snaps(snapc, realm->get_snap_context())) {
5045 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
5046 for (auto&& in : realm->inodes_with_caps) {
5047 queue_cap_snap(in, snapc);
5048 }
5049 } else {
5050 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5051 }
5052 put_snap_realm(realm);
5053 }
5054
5055 if (realm_ret)
5056 *realm_ret = first_realm;
5057 else
5058 put_snap_realm(first_realm);
5059 }
5060
5061 void Client::handle_snap(const MConstRef<MClientSnap>& m)
5062 {
5063 ldout(cct, 10) << __func__ << " " << *m << dendl;
5064 mds_rank_t mds = mds_rank_t(m->get_source().num());
5065
5066 std::scoped_lock cl(client_lock);
5067 auto session = _get_mds_session(mds, m->get_connection().get());
5068 if (!session) {
5069 return;
5070 }
5071
5072 got_mds_push(session.get());
5073
5074 map<Inode*, SnapContext> to_move;
5075 SnapRealm *realm = 0;
5076
5077 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
5078 ceph_assert(m->head.split);
5079 SnapRealmInfo info;
5080 auto p = m->bl.cbegin();
5081 decode(info, p);
5082 ceph_assert(info.ino() == m->head.split);
5083
5084 // flush, then move, the inos.
5085 realm = get_snap_realm(info.ino());
5086 ldout(cct, 10) << " splitting off " << *realm << dendl;
5087 for (auto& ino : m->split_inos) {
5088 vinodeno_t vino(ino, CEPH_NOSNAP);
5089 if (inode_map.count(vino)) {
5090 Inode *in = inode_map[vino];
5091 if (!in->snaprealm || in->snaprealm == realm)
5092 continue;
5093 if (in->snaprealm->created > info.created()) {
5094 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5095 << *in->snaprealm << dendl;
5096 continue;
5097 }
5098 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5099
5100
5101 in->snaprealm_item.remove_myself();
5102 to_move[in] = in->snaprealm->get_snap_context();
5103 put_snap_realm(in->snaprealm);
5104 }
5105 }
5106
5107 // move child snaprealms, too
5108 for (auto& child_realm : m->split_realms) {
5109 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5110 SnapRealm *child = get_snap_realm_maybe(child_realm);
5111 if (!child)
5112 continue;
5113 adjust_realm_parent(child, realm->ino);
5114 put_snap_realm(child);
5115 }
5116 }
5117
5118 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5119
5120 if (realm) {
5121 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5122 Inode *in = p->first;
5123 in->snaprealm = realm;
5124 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5125 realm->nref++;
5126 // queue for snap writeback
5127 if (has_new_snaps(p->second, realm->get_snap_context()))
5128 queue_cap_snap(in, p->second);
5129 }
5130 put_snap_realm(realm);
5131 }
5132 }
5133
5134 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5135 {
5136 mds_rank_t mds = mds_rank_t(m->get_source().num());
5137
5138 std::scoped_lock cl(client_lock);
5139 auto session = _get_mds_session(mds, m->get_connection().get());
5140 if (!session) {
5141 return;
5142 }
5143
5144 got_mds_push(session.get());
5145
5146 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5147
5148 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5149 if (inode_map.count(vino)) {
5150 Inode *in = NULL;
5151 in = inode_map[vino];
5152
5153 if (in) {
5154 in->quota = m->quota;
5155 in->rstat = m->rstat;
5156 }
5157 }
5158 }
5159
5160 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5161 {
5162 mds_rank_t mds = mds_rank_t(m->get_source().num());
5163
5164 std::scoped_lock cl(client_lock);
5165 auto session = _get_mds_session(mds, m->get_connection().get());
5166 if (!session) {
5167 return;
5168 }
5169
5170 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5171 // Pause RADOS operations until we see the required epoch
5172 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5173 }
5174
5175 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5176 // Record the barrier so that we will transmit it to MDS when releasing
5177 set_cap_epoch_barrier(m->osd_epoch_barrier);
5178 }
5179
5180 got_mds_push(session.get());
5181
5182 Inode *in;
5183 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5184 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5185 in = it->second;
5186 } else {
5187 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5188 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5189 session->enqueue_cap_release(
5190 m->get_ino(),
5191 m->get_cap_id(),
5192 m->get_seq(),
5193 m->get_mseq(),
5194 cap_epoch_barrier);
5195 } else {
5196 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5197 }
5198
5199 // in case the mds is waiting on e.g. a revocation
5200 flush_cap_releases();
5201 return;
5202 }
5203
5204 switch (m->get_op()) {
5205 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5206 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5207 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
5208 }
5209
5210 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5211 Cap &cap = in->caps.at(mds);
5212
5213 switch (m->get_op()) {
5214 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
5215 case CEPH_CAP_OP_IMPORT:
5216 case CEPH_CAP_OP_REVOKE:
5217 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5218 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
5219 }
5220 } else {
5221 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5222 return;
5223 }
5224 }
5225
5226 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5227 {
5228 mds_rank_t mds = session->mds_num;
5229
5230 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5231 << " IMPORT from mds." << mds << dendl;
5232
5233 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5234 Cap *cap = NULL;
5235 UserPerm cap_perms;
5236 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5237 cap = &it->second;
5238 cap_perms = cap->latest_perms;
5239 }
5240
5241 // add/update it
5242 SnapRealm *realm = NULL;
5243 update_snap_trace(m->snapbl, &realm);
5244
5245 int issued = m->get_caps();
5246 int wanted = m->get_wanted();
5247 add_update_cap(in, session, m->get_cap_id(),
5248 issued, wanted, m->get_seq(), m->get_mseq(),
5249 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5250
5251 if (cap && cap->cap_id == m->peer.cap_id) {
5252 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5253 }
5254
5255 if (realm)
5256 put_snap_realm(realm);
5257
5258 if (in->auth_cap && in->auth_cap->session == session) {
5259 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5260 in->requested_max_size > m->get_max_size()) {
5261 in->requested_max_size = 0;
5262 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5263 }
5264 // reflush any/all caps (if we are now the auth_cap)
5265 kick_flushing_caps(in, session);
5266 }
5267 }
5268
5269 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5270 {
5271 mds_rank_t mds = session->mds_num;
5272
5273 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5274 << " EXPORT from mds." << mds << dendl;
5275
5276 auto it = in->caps.find(mds);
5277 if (it != in->caps.end()) {
5278 Cap &cap = it->second;
5279 if (cap.cap_id == m->get_cap_id()) {
5280 if (m->peer.cap_id) {
5281 const auto peer_mds = mds_rank_t(m->peer.mds);
5282 auto tsession = _get_or_open_mds_session(peer_mds);
5283 auto it = in->caps.find(peer_mds);
5284 if (it != in->caps.end()) {
5285 Cap &tcap = it->second;
5286 if (tcap.cap_id == m->peer.cap_id &&
5287 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5288 tcap.cap_id = m->peer.cap_id;
5289 tcap.seq = m->peer.seq - 1;
5290 tcap.issue_seq = tcap.seq;
5291 tcap.issued |= cap.issued;
5292 tcap.implemented |= cap.issued;
5293 if (&cap == in->auth_cap)
5294 in->auth_cap = &tcap;
5295 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5296 adjust_session_flushing_caps(in, session, tsession.get());
5297 }
5298 } else {
5299 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
5300 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5301 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5302 cap.latest_perms);
5303 }
5304 } else {
5305 if (cap.wanted | cap.issued)
5306 in->flags |= I_CAP_DROPPED;
5307 }
5308
5309 remove_cap(&cap, false);
5310 }
5311 }
5312 }
5313
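// Cap migration sketch (editor's summary of the import/export pair above):
// on EXPORT the old MDS names its peer; if we already hold a cap from that
// peer we fold the exported bits into it, otherwise we pre-create one with
// seq = peer.seq - 1 so the later IMPORT (handled above) supersedes it;
// either way the exporting MDS' cap is removed locally.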
5314 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5315 {
5316 mds_rank_t mds = session->mds_num;
5317 ceph_assert(in->caps.count(mds));
5318
5319 ldout(cct, 10) << __func__ << " on ino " << *in
5320 << " size " << in->size << " -> " << m->get_size()
5321 << dendl;
5322
5323 int issued;
5324 in->caps_issued(&issued);
5325 issued |= in->caps_dirty();
5326 update_inode_file_size(in, issued, m->get_size(),
5327 m->get_truncate_seq(), m->get_truncate_size());
5328 }
5329
5330 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5331 {
5332 ceph_tid_t flush_ack_tid = m->get_client_tid();
5333 int dirty = m->get_dirty();
5334 int cleaned = 0;
5335 int flushed = 0;
5336
5337 auto it = in->flushing_cap_tids.begin();
5338 if (it->first < flush_ack_tid) {
5339 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5340 << " got unexpected flush ack tid " << flush_ack_tid
5341 << " expected is " << it->first << dendl;
5342 }
5343 for (; it != in->flushing_cap_tids.end(); ) {
5344 if (!it->second) {
5345 // cap snap
5346 ++it;
5347 continue;
5348 }
5349 if (it->first == flush_ack_tid)
5350 cleaned = it->second;
5351 if (it->first <= flush_ack_tid) {
5352 session->flushing_caps_tids.erase(it->first);
5353 in->flushing_cap_tids.erase(it++);
5354 ++flushed;
5355 continue;
5356 }
5357 cleaned &= ~it->second;
5358 if (!cleaned)
5359 break;
5360 ++it;
5361 }
5362
5363 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5364 << " cleaned " << ccap_string(cleaned) << " on " << *in
5365 << " with " << ccap_string(dirty) << dendl;
5366
5367 if (flushed) {
5368 signal_cond_list(in->waitfor_caps);
5369 if (session->flushing_caps_tids.empty() ||
5370 *session->flushing_caps_tids.begin() > flush_ack_tid)
5371 sync_cond.notify_all();
5372 }
5373
5374 if (!dirty) {
5375 in->cap_dirtier_uid = -1;
5376 in->cap_dirtier_gid = -1;
5377 }
5378
5379 if (!cleaned) {
5380 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5381 } else {
5382 if (in->flushing_caps) {
5383 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5384 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5385 in->flushing_caps &= ~cleaned;
5386 if (in->flushing_caps == 0) {
5387 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5388 num_flushing_caps--;
5389 if (in->flushing_cap_tids.empty())
5390 in->flushing_cap_item.remove_myself();
5391 }
5392 if (!in->caps_dirty())
5393 put_inode(in);
5394 }
5395 }
5396 }
5397
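// Worked example of the tid sweep above (illustrative): with
// flushing_cap_tids = {9: Fw|Fx, 11: 0 (capsnap), 12: Fx} and an ack for
// tid 9, entry 9 is erased and cleaned starts as Fw|Fx; the capsnap entry
// at 11 is skipped; entry 12 then masks cleaned with ~Fx, because Fx is
// still being flushed under a later tid -- so only Fw is cleared from
// in->flushing_caps.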
5398
5399 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5400 {
5401 ceph_tid_t flush_ack_tid = m->get_client_tid();
5402 mds_rank_t mds = session->mds_num;
5403 ceph_assert(in->caps.count(mds));
5404 snapid_t follows = m->get_snap_follows();
5405
5406 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5407 auto& capsnap = it->second;
5408 if (flush_ack_tid != capsnap.flush_tid) {
5409 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5410 } else {
5411 InodeRef tmp_ref(in);
5412 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5413 << " on " << *in << dendl;
5414 session->flushing_caps_tids.erase(capsnap.flush_tid);
5415 in->flushing_cap_tids.erase(capsnap.flush_tid);
5416 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5417 in->flushing_cap_item.remove_myself();
5418 in->cap_snaps.erase(it);
5419
5420 signal_cond_list(in->waitfor_caps);
5421 if (session->flushing_caps_tids.empty() ||
5422 *session->flushing_caps_tids.begin() > flush_ack_tid)
5423 sync_cond.notify_all();
5424 }
5425 } else {
5426 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5427 << " on " << *in << dendl;
5428 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5429 }
5430 }
5431
5432 class C_Client_DentryInvalidate : public Context {
5433 private:
5434 Client *client;
5435 vinodeno_t dirino;
5436 vinodeno_t ino;
5437 string name;
5438 public:
5439 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5440 client(c), name(dn->name) {
5441 if (client->use_faked_inos()) {
5442 dirino.ino = dn->dir->parent_inode->faked_ino;
5443 if (del)
5444 ino.ino = dn->inode->faked_ino;
5445 } else {
5446 dirino = dn->dir->parent_inode->vino();
5447 if (del)
5448 ino = dn->inode->vino();
5449 }
5450 if (!del)
5451 ino.ino = inodeno_t();
5452 }
5453 void finish(int r) override {
5454 // _async_dentry_invalidate is responsible for its own locking
5455 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5456 client->_async_dentry_invalidate(dirino, ino, name);
5457 }
5458 };
5459
5460 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5461 {
5462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5463 if (!mref_reader.is_state_satisfied())
5464 return;
5465
5466 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5467 << " in dir " << dirino << dendl;
5468 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5469 }
5470
5471 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5472 {
5473 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5474 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5475 }
5476
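// The invalidation above is delivered through a callback registered by the
// libcephfs consumer. A minimal sketch, assuming the
// ceph_ll_register_callbacks() entry point and the args struct from
// include/cephfs/ceph_ll_client.h (function and field names should be
// double-checked against that header); deliberately kept out of the build:
#if 0
static void my_dentry_invalidate(void *handle, vinodeno_t dirino,
                                 vinodeno_t ino, const char *name,
                                 size_t len)
{
  // drop the consumer's cached dentry `name` of directory `dirino` here;
  // `ino` is only meaningful when the dentry itself is being deleted
}

static void register_invalidate_cb(struct ceph_mount_info *cmount)
{
  struct ceph_client_callback_args args = {};
  args.handle = nullptr;                 // passed back as `handle`
  args.dentry_cb = my_dentry_invalidate; // wired to dentry_invalidate_cb
  ceph_ll_register_callbacks(cmount, &args);
}
#endif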
5477 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5478 {
5479 int ref = in->get_nref();
5480 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5481
5482 if (in->dir && !in->dir->dentries.empty()) {
5483 for (auto p = in->dir->dentries.begin();
5484 p != in->dir->dentries.end(); ) {
5485 Dentry *dn = p->second;
5486 ++p;
5487 /* rmsnap removes the whole subtree, so we need to trim inodes
5488 * recursively. we don't need to invalidate dentries recursively,
5489 * because invalidating a directory dentry effectively invalidates
5490 * the whole subtree */
5491 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5492 _try_to_trim_inode(dn->inode.get(), false);
5493
5494 if (dn->lru_is_expireable())
5495 unlink(dn, true, false); // keep dir, drop dentry
5496 }
5497 if (in->dir->dentries.empty()) {
5498 close_dir(in->dir);
5499 --ref;
5500 }
5501 }
5502
5503 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
5504 InodeRef snapdir = open_snapdir(in);
5505 _try_to_trim_inode(snapdir.get(), false);
5506 --ref;
5507 }
5508
5509 if (ref > 1) {
5510 auto q = in->dentries.begin();
5511 while (q != in->dentries.end()) {
5512 Dentry *dn = *q;
5513 ++q;
5514 if (in->ll_ref > 0 && sched_inval) {
5515 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5516 // so in->dentries doesn't always reflect the state of kernel's dcache.
5517 _schedule_invalidate_dentry_callback(dn, true);
5518 }
5519 unlink(dn, true, true);
5520 }
5521 }
5522 }
5523
5524 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5525 {
5526 mds_rank_t mds = session->mds_num;
5527 int used = get_caps_used(in);
5528 int wanted = in->caps_wanted();
5529 int flags = 0;
5530
5531 const unsigned new_caps = m->get_caps();
5532 const bool was_stale = session->cap_gen > cap->gen;
5533 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5534 << " mds." << mds << " seq " << m->get_seq()
5535 << " caps now " << ccap_string(new_caps)
5536 << " was " << ccap_string(cap->issued)
5537 << (was_stale ? " (stale)" : "") << dendl;
5538
5539 if (was_stale)
5540 cap->issued = cap->implemented = CEPH_CAP_PIN;
5541 cap->seq = m->get_seq();
5542 cap->gen = session->cap_gen;
5543
5544 check_cap_issue(in, new_caps);
5545
5546 // update inode
5547 int issued;
5548 in->caps_issued(&issued);
5549 issued |= in->caps_dirty();
5550
5551 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5552 !(issued & CEPH_CAP_AUTH_EXCL)) {
5553 in->mode = m->head.mode;
5554 in->uid = m->head.uid;
5555 in->gid = m->head.gid;
5556 in->btime = m->btime;
5557 }
5558 bool deleted_inode = false;
5559 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5560 !(issued & CEPH_CAP_LINK_EXCL)) {
5561 in->nlink = m->head.nlink;
5562 if (in->nlink == 0)
5563 deleted_inode = true;
5564 }
5565 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5566 m->xattrbl.length() &&
5567 m->head.xattr_version > in->xattr_version) {
5568 auto p = m->xattrbl.cbegin();
5569 decode(in->xattrs, p);
5570 in->xattr_version = m->head.xattr_version;
5571 }
5572
5573 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5574 in->dirstat.nfiles = m->get_nfiles();
5575 in->dirstat.nsubdirs = m->get_nsubdirs();
5576 }
5577
5578 if (new_caps & CEPH_CAP_ANY_RD) {
5579 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5580 m->get_ctime(), m->get_mtime(), m->get_atime());
5581 }
5582
5583 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5584 in->layout = m->get_layout();
5585 update_inode_file_size(in, issued, m->get_size(),
5586 m->get_truncate_seq(), m->get_truncate_size());
5587 }
5588
5589 if (m->inline_version > in->inline_version) {
5590 in->inline_data = m->inline_data;
5591 in->inline_version = m->inline_version;
5592 }
5593
5594 /* always take a newer change attr */
5595 if (m->get_change_attr() > in->change_attr)
5596 in->change_attr = m->get_change_attr();
5597
5598 // max_size
5599 if (cap == in->auth_cap &&
5600 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5601 (m->get_max_size() != in->max_size)) {
5602 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5603 in->max_size = m->get_max_size();
5604 if (in->max_size > in->wanted_max_size) {
5605 in->wanted_max_size = 0;
5606 in->requested_max_size = 0;
5607 }
5608 }
5609
5610 bool check = false;
5611 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5612 (wanted & ~(cap->wanted | new_caps))) {
5613 // If the mds is importing the cap, prior cap messages that update
5614 // 'wanted' may get dropped by the mds (migrate seq mismatch).
5615 //
5616 // We don't send a cap message to update 'wanted' if what we want is
5617 // already issued. If the mds revokes caps, the cap message releasing
5618 // caps also tells the mds what we want. But if caps were forcibly
5619 // revoked by the mds (stale session), we may not have told it yet.
5620 check = true;
5621 }
5622
5623
5624 // update caps
5625 auto revoked = cap->issued & ~new_caps;
5626 if (revoked) {
5627 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5628 cap->issued = new_caps;
5629 cap->implemented |= new_caps;
5630
5631 // recall delegations if we're losing caps necessary for them
5632 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5633 in->recall_deleg(false);
5634 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5635 in->recall_deleg(true);
5636
5637 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5638 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5639 !_flush(in, new C_Client_FlushComplete(this, in))) {
5640 // waitin' for flush
5641 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5642 if (_release(in)) {
5643 check = true;
5644 flags = CHECK_CAPS_NODELAY;
5645 }
5646 } else {
5647 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5648 check = true;
5649 flags = CHECK_CAPS_NODELAY;
5650 }
5651 } else if (cap->issued == new_caps) {
5652 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5653 } else {
5654 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5655 cap->issued = new_caps;
5656 cap->implemented |= new_caps;
5657
5658 if (cap == in->auth_cap) {
5659 // is a non-auth MDS revoking the newly granted caps?
5660 for (const auto &p : in->caps) {
5661 if (&p.second == cap)
5662 continue;
5663 if (p.second.implemented & ~p.second.issued & new_caps) {
5664 check = true;
5665 break;
5666 }
5667 }
5668 }
5669 }
5670
5671 if (check)
5672 check_caps(in, flags);
5673
5674 // wake up waiters
5675 if (new_caps)
5676 signal_cond_list(in->waitfor_caps);
5677
5678 // may drop inode's last ref
5679 if (deleted_inode)
5680 _try_to_trim_inode(in, true);
5681 }
5682
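// Worked example of the revoke branch above (cap strings made up): if
// cap->issued is "pAsLsXsFscrwb" and the grant carries new_caps
// "pAsLsXsFscr", then revoked == "Fwb". With dirty buffers in use (Fb)
// the client first flushes and the C_Client_FlushComplete continuation
// answers the MDS from the flush completion instead of checking
// immediately; with only cached data (Fc) it releases the cache and calls
// check_caps(in, CHECK_CAPS_NODELAY) right away.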
5683 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5684 {
5685 if (perms.uid() == 0) {
5686 // For directories, DACs are overridable.
5687 // For files, read/write DACs are always overridable, but exec DACs are
5688 // overridable only when at least one exec bit is set.
5689 if (!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
5690 return -CEPHFS_EACCES;
5691 return 0;
5692 }
5693
5694 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5695 int ret = _posix_acl_permission(in, perms, want);
5696 if (ret != -CEPHFS_EAGAIN)
5697 return ret;
5698 }
5699
5700 // check permissions before doing anything else
5701 if (!in->check_mode(perms, want))
5702 return -CEPHFS_EACCES;
5703 return 0;
5704 }
5705
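// Example of the uid-0 special case above: even root is denied MAY_EXEC on
// a regular file with mode 0644 (no exec bit anywhere) and gets
// -CEPHFS_EACCES, while MAY_READ | MAY_WRITE on the same file succeeds and
// any access to a directory is always granted for uid 0.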
5706 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5707 const UserPerm& perms)
5708 {
5709 int r = _getattr_for_perm(in, perms);
5710 if (r < 0)
5711 goto out;
5712
5713 r = 0;
5714 if (strncmp(name, "system.", 7) == 0) {
5715 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5716 r = -CEPHFS_EPERM;
5717 } else {
5718 r = inode_permission(in, perms, want);
5719 }
5720 out:
5721 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5722 return r;
5723 }
5724
5725 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5726 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5727 return out;
5728 }
5729
5730 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5731 const UserPerm& perms)
5732 {
5733 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5734 int r = _getattr_for_perm(in, perms);
5735 if (r < 0)
5736 goto out;
5737
5738 if (mask & CEPH_SETATTR_SIZE) {
5739 r = inode_permission(in, perms, MAY_WRITE);
5740 if (r < 0)
5741 goto out;
5742 }
5743
5744 r = -CEPHFS_EPERM;
5745 if (mask & CEPH_SETATTR_UID) {
5746 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5747 goto out;
5748 }
5749 if (mask & CEPH_SETATTR_GID) {
5750 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5751 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5752 goto out;
5753 }
5754
5755 if (mask & CEPH_SETATTR_MODE) {
5756 if (perms.uid() != 0 && perms.uid() != in->uid)
5757 goto out;
5758
5759 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5760 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5761 stx->stx_mode &= ~S_ISGID;
5762 }
5763
5764 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5765 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5766 if (perms.uid() != 0 && perms.uid() != in->uid) {
5767 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5768 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5769 check_mask |= CEPH_SETATTR_MTIME;
5770 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5771 check_mask |= CEPH_SETATTR_ATIME;
5772 if (check_mask & mask) {
5773 goto out;
5774 } else {
5775 r = inode_permission(in, perms, MAY_WRITE);
5776 if (r < 0)
5777 goto out;
5778 }
5779 }
5780 }
5781 r = 0;
5782 out:
5783 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5784 return r;
5785 }
5786
5787 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5788 {
5789 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5790 unsigned want = 0;
5791
5792 if ((flags & O_ACCMODE) == O_WRONLY)
5793 want = MAY_WRITE;
5794 else if ((flags & O_ACCMODE) == O_RDWR)
5795 want = MAY_READ | MAY_WRITE;
5796 else if ((flags & O_ACCMODE) == O_RDONLY)
5797 want = MAY_READ;
5798 if (flags & O_TRUNC)
5799 want |= MAY_WRITE;
5800
5801 int r = 0;
5802 switch (in->mode & S_IFMT) {
5803 case S_IFLNK:
5804 r = -CEPHFS_ELOOP;
5805 goto out;
5806 case S_IFDIR:
5807 if (want & MAY_WRITE) {
5808 r = -CEPHFS_EISDIR;
5809 goto out;
5810 }
5811 break;
5812 }
5813
5814 r = _getattr_for_perm(in, perms);
5815 if (r < 0)
5816 goto out;
5817
5818 r = inode_permission(in, perms, want);
5819 out:
5820 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5821 return r;
5822 }
5823
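// How the open flags map onto the permission bits checked above:
//   O_RDONLY           -> MAY_READ
//   O_WRONLY           -> MAY_WRITE
//   O_RDWR             -> MAY_READ | MAY_WRITE
//   O_RDONLY | O_TRUNC -> MAY_READ | MAY_WRITE (truncation needs write)
// Symlinks fail with -CEPHFS_ELOOP and directories opened for write fail
// with -CEPHFS_EISDIR before any permission check is made.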
5824 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5825 {
5826 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5827 int r = _getattr_for_perm(dir, perms);
5828 if (r < 0)
5829 goto out;
5830
5831 r = inode_permission(dir, perms, MAY_EXEC);
5832 out:
5833 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5834 return r;
5835 }
5836
5837 int Client::may_create(Inode *dir, const UserPerm& perms)
5838 {
5839 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5840 int r = _getattr_for_perm(dir, perms);
5841 if (r < 0)
5842 goto out;
5843
5844 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5845 out:
5846 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5847 return r;
5848 }
5849
5850 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5851 {
5852 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5853 int r = _getattr_for_perm(dir, perms);
5854 if (r < 0)
5855 goto out;
5856
5857 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5858 if (r < 0)
5859 goto out;
5860
5861 /* 'name == NULL' means rmsnap w/o permission checks */
5862 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5863 InodeRef otherin;
5864 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5865 if (r < 0)
5866 goto out;
5867 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5868 r = -CEPHFS_EPERM;
5869 }
5870 out:
5871 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5872 return r;
5873 }
5874
5875 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5876 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5877
5878 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5879 if (!mref_reader.is_state_satisfied())
5880 return -CEPHFS_ENOTCONN;
5881
5882 filepath path(relpath);
5883 string name = path.last_dentry();
5884 path.pop_dentry();
5885 InodeRef dir;
5886
5887 std::scoped_lock lock(client_lock);
5888 int r = path_walk(path, &dir, perms);
5889 if (r < 0)
5890 return r;
5891 if (cct->_conf->client_permissions) {
5892 int r = may_delete(dir.get(), name.c_str(), perms);
5893 if (r < 0)
5894 return r;
5895 }
5896
5897 return 0;
5898 }
5899
5900 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5901 {
5902 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5903 int r = _getattr_for_perm(in, perms);
5904 if (r < 0)
5905 goto out;
5906
5907 if (perms.uid() == 0 || perms.uid() == in->uid) {
5908 r = 0;
5909 goto out;
5910 }
5911
5912 r = -CEPHFS_EPERM;
5913 if (!S_ISREG(in->mode))
5914 goto out;
5915
5916 if (in->mode & S_ISUID)
5917 goto out;
5918
5919 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5920 goto out;
5921
5922 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5923 out:
5924 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5925 return r;
5926 }
5927
5928 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5929 {
5930 int mask = CEPH_STAT_CAP_MODE;
5931 bool force = false;
5932 if (acl_type != NO_ACL) {
5933 mask |= CEPH_STAT_CAP_XATTR;
5934 force = in->xattr_version == 0;
5935 }
5936 return _getattr(in, mask, perms, force);
5937 }
5938
5939 vinodeno_t Client::_get_vino(Inode *in)
5940 {
5941 /* The caller must hold the client lock */
5942 return vinodeno_t(in->ino, in->snapid);
5943 }
5944
5945 /**
5946 * Resolve an MDS spec to a list of MDS daemon GIDs.
5947 *
5948 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5949 * It may be '*' in which case it matches all GIDs.
5950 *
5951 * If no error is returned, the `targets` vector will be populated with at least
5952 * one MDS.
5953 */
5954 int Client::resolve_mds(
5955 const std::string &mds_spec,
5956 std::vector<mds_gid_t> *targets)
5957 {
5958 ceph_assert(fsmap);
5959 ceph_assert(targets != nullptr);
5960
5961 mds_role_t role;
5962 CachedStackStringStream css;
5963 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5964 if (role_r == 0) {
5965 // We got a role, resolve it to a GID
5966 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5967 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5968 << role << "' aka " << info.human_name() << dendl;
5969 targets->push_back(info.global_id);
5970 return 0;
5971 }
5972
5973 std::string strtol_err;
5974 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5975 if (strtol_err.empty()) {
5976 // It is a possible GID
5977 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5978 if (fsmap->gid_exists(mds_gid)) {
5979 auto& info = fsmap->get_info_gid(mds_gid);
5980 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5981 << info.human_name() << dendl;
5982 targets->push_back(mds_gid);
5983 return 0;
5984 } else {
5985 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5986 << dendl;
5987 lderr(cct) << "FSMap: " << *fsmap << dendl;
5988 return -CEPHFS_ENOENT;
5989 }
5990 } else if (mds_spec == "*") {
5991 // It is a wildcard: use all MDSs
5992 const auto& mds_info = fsmap->get_mds_info();
5993
5994 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5995 if (mds_info.empty()) {
5996 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5997 lderr(cct) << "FSMap: " << *fsmap << dendl;
5998 return -CEPHFS_ENOENT;
5999 }
6000
6001 for (const auto& [gid, info] : mds_info) {
6002 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
6003 targets->push_back(gid);
6004 }
6005 return 0;
6006 } else {
6007 // It did not parse as an integer and is not a wildcard, so it must be a name
6008 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
6009 if (mds_gid == 0) {
6010 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
6011 lderr(cct) << "FSMap: " << *fsmap << dendl;
6012 return -CEPHFS_ENOENT;
6013 } else {
6014 auto& info = fsmap->get_info_gid(mds_gid);
6015 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
6016 << "' to " << info.human_name() << dendl;
6017 targets->push_back(mds_gid);
6018 }
6019 return 0;
6020 }
6021 }
6022
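// Illustrative spec strings resolve_mds() accepts (the daemon name and GID
// below are hypothetical):
//   "0"        -> rank 0, resolved via FSMap::parse_role
//   "cephfs:0" -> rank 0 of filesystem "cephfs"
//   "123456"   -> a GID, accepted only if it exists in the FSMap
//   "a"        -> a daemon by name, via find_mds_gid_by_name
//   "*"        -> every MDS daemon known to the FSMap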
6023
6024 /**
6025 * Authenticate with mon and establish global ID
6026 */
6027 int Client::authenticate()
6028 {
6029 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6030
6031 if (monclient->is_authenticated()) {
6032 return 0;
6033 }
6034
6035 client_lock.unlock();
6036 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
6037 client_lock.lock();
6038 if (r < 0) {
6039 return r;
6040 }
6041
6042 whoami = monclient->get_global_id();
6043 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6044
6045 return 0;
6046 }
6047
6048 int Client::fetch_fsmap(bool user)
6049 {
6050 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6051
6052 // Retrieve the FSMap to enable looking up daemon addresses. We need the
6053 // FSMap rather than an MDSMap because no single MDSMap contains all the
6054 // daemons, and a `tell` can address any daemon.
6055 version_t fsmap_latest;
6056 bs::error_code ec;
6057 do {
6058 client_lock.unlock();
6059 std::tie(fsmap_latest, std::ignore) =
6060 monclient->get_version("fsmap", ca::use_blocked[ec]);
6061 client_lock.lock();
6062 } while (ec == bs::errc::resource_unavailable_try_again);
6063
6064 if (ec) {
6065 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
6066 return ceph::from_error_code(ec);
6067 }
6068
6069 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
6070
6071 if (user) {
6072 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
6073 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
6074 monclient->renew_subs();
6075 wait_on_list(waiting_for_fsmap);
6076 }
6077 ceph_assert(fsmap_user);
6078 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
6079 } else {
6080 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
6081 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
6082 monclient->renew_subs();
6083 wait_on_list(waiting_for_fsmap);
6084 }
6085 ceph_assert(fsmap);
6086 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
6087 }
6088 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
6089 << fsmap_latest << dendl;
6090 return 0;
6091 }
6092
6093 /**
6094 * Send a command to MDS daemon(s) matching the given spec.
6095 *
6096 * @mds_spec one of name/ID, rank, GID, "*"
6097 */
6098 int Client::mds_command(
6099 const std::string &mds_spec,
6100 const vector<string>& cmd,
6101 const bufferlist& inbl,
6102 bufferlist *outbl,
6103 string *outs,
6104 Context *onfinish)
6105 {
6106 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6107 if (!iref_reader.is_state_satisfied())
6108 return -CEPHFS_ENOTCONN;
6109
6110 std::unique_lock cl(client_lock);
6111
6112 int r;
6113 r = authenticate();
6114 if (r < 0) {
6115 return r;
6116 }
6117
6118 r = fetch_fsmap(false);
6119 if (r < 0) {
6120 return r;
6121 }
6122
6123 // Look up MDS target(s) of the command
6124 std::vector<mds_gid_t> targets;
6125 r = resolve_mds(mds_spec, &targets);
6126 if (r < 0) {
6127 return r;
6128 }
6129
6130 // If daemons are laggy, we won't send them commands. If all
6131 // are laggy then we fail.
6132 std::vector<mds_gid_t> non_laggy;
6133 for (const auto& gid : targets) {
6134 const auto info = fsmap->get_info_gid(gid);
6135 if (!info.laggy()) {
6136 non_laggy.push_back(gid);
6137 }
6138 }
6139 if (non_laggy.empty()) {
6140 *outs = "All targeted MDS daemons are laggy";
6141 return -CEPHFS_ENOENT;
6142 }
6143
6144 if (metadata.empty()) {
6145 // We are called on an unmounted client, so metadata
6146 // won't be initialized yet.
6147 populate_metadata("");
6148 }
6149
6150 // Send commands to targets
6151 C_GatherBuilder gather(cct, onfinish);
6152 for (const auto& target_gid : non_laggy) {
6153 const auto info = fsmap->get_info_gid(target_gid);
6154
6155 // Open a connection to the target MDS
6156 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6157
6158 cl.unlock();
6159 {
6160 std::scoped_lock cmd_lock(command_lock);
6161 // Generate MDSCommandOp state
6162 auto &op = command_table.start_command();
6163
6164 op.on_finish = gather.new_sub();
6165 op.cmd = cmd;
6166 op.outbl = outbl;
6167 op.outs = outs;
6168 op.inbl = inbl;
6169 op.mds_gid = target_gid;
6170 op.con = conn;
6171
6172 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6173 << " tid=" << op.tid << cmd << dendl;
6174
6175 // Construct and send MCommand
6176 MessageRef m = op.get_message(monclient->get_fsid());
6177 conn->send_message2(std::move(m));
6178 }
6179 cl.lock();
6180 }
6181 gather.activate();
6182
6183 return 0;
6184 }
6185
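// A hedged usage sketch for mds_command() (the JSON command and helper name
// are illustrative; assumes an initialized Client); kept out of the build:
#if 0
static int example_mds_command(Client *client)
{
  std::vector<std::string> cmd = {"{\"prefix\": \"session ls\"}"};
  bufferlist inbl, outbl;
  std::string outs;
  C_SaferCond cond;
  int r = client->mds_command("*", cmd, inbl, &outbl, &outs, &cond);
  if (r < 0)
    return r;         // resolve/authenticate failed; `cond` will never fire
  return cond.wait(); // completes once every targeted MDS has replied
}
#endif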
6186 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6187 {
6188 ceph_tid_t const tid = m->get_tid();
6189
6190 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6191
6192 std::scoped_lock cmd_lock(command_lock);
6193 if (!command_table.exists(tid)) {
6194 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6195 return;
6196 }
6197
6198 auto &op = command_table.get_command(tid);
6199 if (op.outbl) {
6200 *op.outbl = m->get_data();
6201 }
6202 if (op.outs) {
6203 *op.outs = m->rs;
6204 }
6205
6206 if (op.on_finish) {
6207 op.on_finish->complete(m->r);
6208 }
6209
6210 command_table.erase(tid);
6211 }
6212
6213 // -------------------
6214 // MOUNT
6215
6216 int Client::subscribe_mdsmap(const std::string &fs_name)
6217 {
6218 int r = authenticate();
6219 if (r < 0) {
6220 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6221 return r;
6222 }
6223
6224 std::string resolved_fs_name;
6225 if (fs_name.empty()) {
6226 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6227 if (resolved_fs_name.empty())
6228 // Try the backwards compatibility fs name option
6229 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6230 } else {
6231 resolved_fs_name = fs_name;
6232 }
6233
6234 std::string want = "mdsmap";
6235 if (!resolved_fs_name.empty()) {
6236 r = fetch_fsmap(true);
6237 if (r < 0)
6238 return r;
6239 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6240 if (fscid == FS_CLUSTER_ID_NONE) {
6241 return -CEPHFS_ENOENT;
6242 }
6243
6244 std::ostringstream oss;
6245 oss << want << "." << fscid;
6246 want = oss.str();
6247 }
6248 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6249
6250 monclient->sub_want(want, 0, 0);
6251 monclient->renew_subs();
6252
6253 return 0;
6254 }
6255
6256 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6257 bool require_mds, const std::string &fs_name)
6258 {
6259 ceph_assert(is_initialized());
6260
6261 /*
6262 * Make sure that _unmount() waits until this mount() has
6263 * finished.
6264 */
6265 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6266 if (!mref_writer.is_first_writer()) // already mounting or mounted
6267 return 0;
6268
6269 std::unique_lock cl(client_lock);
6270
6271 int r = subscribe_mdsmap(fs_name);
6272 if (r < 0) {
6273 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6274 return r;
6275 }
6276
6277 start_tick_thread(); // start tick thread
6278
6279 if (require_mds) {
6280 while (1) {
6281 auto availability = mdsmap->is_cluster_available();
6282 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6283 // Error out
6284 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6285 return CEPH_FUSE_NO_MDS_UP;
6286 } else if (availability == MDSMap::AVAILABLE) {
6287 // Continue to mount
6288 break;
6289 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6290 // Else, wait. MDSMonitor will update the map to bring
6291 // us to a conclusion eventually.
6292 wait_on_list(waiting_for_mdsmap);
6293 } else {
6294 // Unexpected value!
6295 ceph_abort();
6296 }
6297 }
6298 }
6299
6300 populate_metadata(mount_root.empty() ? "/" : mount_root);
6301
6302 filepath fp(CEPH_INO_ROOT);
6303 if (!mount_root.empty()) {
6304 fp = filepath(mount_root.c_str());
6305 }
6306 while (true) {
6307 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6308 req->set_filepath(fp);
6309 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6310 int res = make_request(req, perms);
6311 if (res < 0) {
6312 if (res == -CEPHFS_EACCES && root) {
6313 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6314 break;
6315 }
6316 return res;
6317 }
6318
6319 if (fp.depth())
6320 fp.pop_dentry();
6321 else
6322 break;
6323 }
6324
6325 ceph_assert(root);
6326 _ll_get(root.get());
6327
6328 // trace?
6329 if (!cct->_conf->client_trace.empty()) {
6330 traceout.open(cct->_conf->client_trace.c_str());
6331 if (traceout.is_open()) {
6332 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6333 } else {
6334 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6335 }
6336 }
6337
6338 /*
6339 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6340 ldout(cct, 3) << "op: struct stat st;" << dendl;
6341 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6342 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6343 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6344 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6345 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6346 ldout(cct, 3) << "op: int fd;" << dendl;
6347 */
6348
6349 mref_writer.update_state(CLIENT_MOUNTED);
6350 return 0;
6351 }
6352
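// A hedged mount/unmount sketch (the fs name and helper are illustrative;
// assumes Client::init() has already run); kept out of the build:
#if 0
static int example_mount(Client *client, const UserPerm &perms)
{
  // mount_root "" means "/"; require_mds=true would additionally wait for
  // (or error out on) MDS cluster availability before proceeding
  int r = client->mount("", perms, false, "cephfs");
  if (r < 0)
    return r;
  // ... perform I/O ...
  client->unmount();
  return 0;
}
#endif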
6353 // UNMOUNT
6354
6355 void Client::_close_sessions()
6356 {
6357 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6358 if (it->second->state == MetaSession::STATE_REJECTED)
6359 mds_sessions.erase(it++);
6360 else
6361 ++it;
6362 }
6363
6364 while (!mds_sessions.empty()) {
6365 // send session closes!
6366 for (auto &p : mds_sessions) {
6367 if (p.second->state != MetaSession::STATE_CLOSING) {
6368 _close_mds_session(p.second.get());
6369 mds_ranks_closing.insert(p.first);
6370 }
6371 }
6372
6373 // wait for sessions to close
6374 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6375 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6376 << timo << "s)" << dendl;
6377 std::unique_lock l{client_lock, std::adopt_lock};
6378 if (!timo) {
6379 mount_cond.wait(l);
6380 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6381 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6382 while (!mds_ranks_closing.empty()) {
6383 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6384 // this prunes entry from mds_sessions and mds_ranks_closing
6385 _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
6386 }
6387 }
6388
6389 mds_ranks_closing.clear();
6390 l.release();
6391 }
6392 }
6393
6394 void Client::flush_mdlog_sync(Inode *in)
6395 {
6396 if (in->unsafe_ops.empty()) {
6397 return;
6398 }
6399
6400 std::set<mds_rank_t> anchor;
6401 for (auto &&p : in->unsafe_ops) {
6402 anchor.emplace(p->mds);
6403 }
6404 if (in->auth_cap) {
6405 anchor.emplace(in->auth_cap->session->mds_num);
6406 }
6407
6408 for (auto &rank : anchor) {
6409 auto session = &mds_sessions.at(rank);
6410 flush_mdlog(session->get());
6411 }
6412 }
6413
6414 void Client::flush_mdlog_sync()
6415 {
6416 if (mds_requests.empty())
6417 return;
6418 for (auto &p : mds_sessions) {
6419 flush_mdlog(p.second.get());
6420 }
6421 }
6422
6423 void Client::flush_mdlog(MetaSession *session)
6424 {
6425 // Only send this to Luminous or newer MDS daemons; older daemons
6426 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6427 const uint64_t features = session->con->get_features();
6428 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6429 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6430 session->con->send_message2(std::move(m));
6431 }
6432 }
6433
6434
6435 void Client::_abort_mds_sessions(int err)
6436 {
6437 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6438 auto req = p->second;
6439 ++p;
6440 // unsafe requests will be removed during close session below.
6441 if (req->got_unsafe)
6442 continue;
6443
6444 req->abort(err);
6445 if (req->caller_cond) {
6446 req->kick = true;
6447 req->caller_cond->notify_all();
6448 }
6449 }
6450
6451 // Process aborts on any requests that were on this waitlist.
6452 // Any requests that were on a waiting_for_open session waitlist
6453 // will get kicked during close session below.
6454 signal_cond_list(waiting_for_mdsmap);
6455
6456 // Force-close all sessions
6457 while(!mds_sessions.empty()) {
6458 auto session = mds_sessions.begin()->second;
6459 _closed_mds_session(session.get(), err);
6460 }
6461 }
6462
6463 void Client::_unmount(bool abort)
6464 {
6465 /*
6466 * We are unmounting the client.
6467 *
6468 * Set the state to CLIENT_UNMOUNTING to block and fail any newly
6469 * arriving "readers", then wait for all the in-flight "readers"
6470 * to finish.
6471 */
6472 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6473 if (!mref_writer.is_first_writer())
6474 return;
6475 mref_writer.wait_readers_done();
6476
6477 std::unique_lock lock{client_lock};
6478
6479 if (abort || blocklisted) {
6480 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6481 } else {
6482 ldout(cct, 2) << "unmounting" << dendl;
6483 }
6484
6485 deleg_timeout = 0;
6486
6487 if (abort) {
6488 mount_aborted = true;
6489 // Abort all mds sessions
6490 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6491
6492 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6493 } else {
6494 // flush the mdlog for pending requests, if any
6495 flush_mdlog_sync();
6496 }
6497
6498 mount_cond.wait(lock, [this] {
6499 if (!mds_requests.empty()) {
6500 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6501 << dendl;
6502 }
6503 return mds_requests.empty();
6504 });
6505
6506 cwd.reset();
6507 root.reset();
6508
6509 // clean up any unclosed files
6510 while (!fd_map.empty()) {
6511 Fh *fh = fd_map.begin()->second;
6512 fd_map.erase(fd_map.begin());
6513 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6514 _release_fh(fh);
6515 }
6516
6517 while (!ll_unclosed_fh_set.empty()) {
6518 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6519 Fh *fh = *it;
6520 ll_unclosed_fh_set.erase(fh);
6521 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6522 _release_fh(fh);
6523 }
6524
6525 while (!opened_dirs.empty()) {
6526 dir_result_t *dirp = *opened_dirs.begin();
6527 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6528 _closedir(dirp);
6529 }
6530
6531 _ll_drop_pins();
6532
6533 if (cct->_conf->client_oc) {
6534 // flush/release all buffered data
6535 std::list<InodeRef> anchor;
6536 for (auto& p : inode_map) {
6537 Inode *in = p.second;
6538 if (!in) {
6539 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6540 ceph_assert(in);
6541 }
6542
6543 // prevent inode from getting freed
6544 anchor.emplace_back(in);
6545
6546 if (abort || blocklisted) {
6547 objectcacher->purge_set(&in->oset);
6548 } else if (!in->caps.empty()) {
6549 _release(in);
6550 _flush(in, new C_Client_FlushComplete(this, in));
6551 }
6552 }
6553 }
6554
6555 if (abort || blocklisted) {
6556 for (auto &q : mds_sessions) {
6557 auto s = q.second;
6558 for (auto p = s->dirty_list.begin(); !p.end(); ) {
6559 Inode *in = *p;
6560 ++p;
6561 if (in->dirty_caps) {
6562 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6563 in->mark_caps_clean();
6564 put_inode(in);
6565 }
6566 }
6567 }
6568 } else {
6569 flush_caps_sync();
6570 wait_sync_caps(last_flush_tid);
6571 }
6572
6573 // empty lru cache
6574 trim_cache();
6575
6576 delay_put_inodes();
6577
6578 while (lru.lru_get_size() > 0 ||
6579 !inode_map.empty()) {
6580 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6581 << "+" << inode_map.size() << " items"
6582 << ", waiting (for caps to release?)"
6583 << dendl;
6584
6585 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6586 r == std::cv_status::timeout) {
6587 dump_cache(NULL);
6588 }
6589 }
6590 ceph_assert(lru.lru_get_size() == 0);
6591 ceph_assert(inode_map.empty());
6592
6593 // stop tracing
6594 if (!cct->_conf->client_trace.empty()) {
6595 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6596 traceout.close();
6597 }
6598
6599 // stop the tick thread
6600 tick_thread_stopped = true;
6601 upkeep_cond.notify_one();
6602
6603 _close_sessions();
6604
6605 // release the global snapshot realm
6606 SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
6607 if (global_realm) {
6608 ceph_assert(global_realm->nref == 1);
6609 put_snap_realm(global_realm);
6610 }
6611
6612 mref_writer.update_state(CLIENT_UNMOUNTED);
6613
6614 ldout(cct, 2) << "unmounted." << dendl;
6615 }
6616
6617 void Client::unmount()
6618 {
6619 _unmount(false);
6620 }
6621
6622 void Client::abort_conn()
6623 {
6624 _unmount(true);
6625 }
6626
6627 void Client::flush_cap_releases()
6628 {
6629 uint64_t nr_caps = 0;
6630
6631 // send any cap releases
6632 for (auto &p : mds_sessions) {
6633 auto session = p.second;
6634 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6635 p.first)) {
6636 nr_caps += session->release->caps.size();
6637 if (cct->_conf->client_inject_release_failure) {
6638 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6639 } else {
6640 session->con->send_message2(std::move(session->release));
6641 }
6642 session->release.reset();
6643 }
6644 }
6645
6646 if (nr_caps > 0) {
6647 dec_pinned_icaps(nr_caps);
6648 }
6649 }
6650
6651 void Client::renew_and_flush_cap_releases()
6652 {
6653 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6654
6655 if (!mount_aborted && mdsmap->get_epoch()) {
6656 // renew caps?
6657 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6658 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
6659 renew_caps();
6660
6661 flush_cap_releases();
6662 }
6663 }
6664
6665 void Client::tick()
6666 {
6667 ldout(cct, 20) << "tick" << dendl;
6668
6669 auto now = ceph::coarse_mono_clock::now();
6670
6671 /*
6672 * If mount() has not finished yet, time out the oldest request.
6673 */
6674 if (is_mounting() && !mds_requests.empty()) {
6675 MetaRequest *req = mds_requests.begin()->second;
6676
6677 if (req->created + mount_timeout < now) {
6678 req->abort(-CEPHFS_ETIMEDOUT);
6679 if (req->caller_cond) {
6680 req->kick = true;
6681 req->caller_cond->notify_all();
6682 }
6683 signal_cond_list(waiting_for_mdsmap);
6684 for (auto &p : mds_sessions) {
6685 signal_context_list(p.second->waiting_for_open);
6686 }
6687 }
6688 }
6689
6690 renew_and_flush_cap_releases();
6691
6692 // delayed caps
6693 xlist<Inode*>::iterator p = delayed_list.begin();
6694 while (!p.end()) {
6695 Inode *in = *p;
6696 ++p;
6697 if (!mount_aborted && in->hold_caps_until > now)
6698 break;
6699 delayed_list.pop_front();
6700 if (!mount_aborted)
6701 check_caps(in, CHECK_CAPS_NODELAY);
6702 }
6703
6704 if (!mount_aborted)
6705 collect_and_send_metrics();
6706
6707 delay_put_inodes(is_unmounting());
6708 trim_cache(true);
6709
6710 if (blocklisted && (is_mounted() || is_unmounting()) &&
6711 last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
6712 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6713 messenger->client_reset();
6714 fd_gen++; // invalidate open files
6715 blocklisted = false;
6716 _kick_stale_sessions();
6717 last_auto_reconnect = now;
6718 }
6719 }
6720
6721 void Client::start_tick_thread()
6722 {
6723 upkeeper = std::thread([this]() {
6724 using time = ceph::coarse_mono_time;
6725 using sec = std::chrono::seconds;
6726
6727 auto last_tick = time::min();
6728
6729 std::unique_lock cl(client_lock);
6730 while (!tick_thread_stopped) {
6731 auto now = clock::now();
6732 auto since = now - last_tick;
6733
6734 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6735 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6736
6737 auto interval = std::max(t_interval, d_interval);
6738 if (likely(since >= interval * .90)) {
6739 tick();
6740 last_tick = clock::now();
6741 } else {
6742 interval -= since;
6743 }
6744
6745 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6746 if (!tick_thread_stopped)
6747 upkeep_cond.wait_for(cl, interval);
6748 }
6749 });
6750 }
6751
6752 void Client::collect_and_send_metrics() {
6753 ldout(cct, 20) << __func__ << dendl;
6754
6755 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6756
6757 // right now, we only track and send global metrics. it's sufficient
6758 // to send these metrics to MDS rank 0.
6759 collect_and_send_global_metrics();
6760 }
6761
6762 void Client::collect_and_send_global_metrics() {
6763 ldout(cct, 20) << __func__ << dendl;
6764 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6765
6766 if (!have_open_session((mds_rank_t)0)) {
6767 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6768 << dendl;
6769 return;
6770 }
6771 auto session = _get_or_open_mds_session((mds_rank_t)0);
6772 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6773 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6774 return;
6775 }
6776
6777 ClientMetricMessage metric;
6778 std::vector<ClientMetricMessage> message;
6779
6780 // read latency
6781 if (_collect_and_send_global_metrics ||
6782 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
6783 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
6784 logger->tget(l_c_rd_avg),
6785 logger->get(l_c_rd_sqsum),
6786 nr_read_request));
6787 message.push_back(metric);
6788 }
6789
6790 // write latency
6791 if (_collect_and_send_global_metrics ||
6792 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
6793 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
6794 logger->tget(l_c_wr_avg),
6795 logger->get(l_c_wr_sqsum),
6796 nr_write_request));
6797 message.push_back(metric);
6798 }
6799
6800 // metadata latency
6801 if (_collect_and_send_global_metrics ||
6802 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
6803 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
6804 logger->tget(l_c_md_avg),
6805 logger->get(l_c_md_sqsum),
6806 nr_metadata_request));
6807 message.push_back(metric);
6808 }
6809
6810 // cap hit ratio -- nr_caps is unused right now
6811 if (_collect_and_send_global_metrics ||
6812 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
6813 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6814 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6815 message.push_back(metric);
6816 }
6817
6818 // dentry lease hit ratio
6819 if (_collect_and_send_global_metrics ||
6820 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
6821 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6822 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6823 message.push_back(metric);
6824 }
6825
6826 // opened files
6827 if (_collect_and_send_global_metrics ||
6828 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
6829 auto [opened_files, total_inodes] = get_opened_files_rates();
6830 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6831 message.push_back(metric);
6832 }
6833
6834 // pinned i_caps
6835 if (_collect_and_send_global_metrics ||
6836 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
6837 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6838 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6839 message.push_back(metric);
6840 }
6841
6842 // opened inodes
6843 if (_collect_and_send_global_metrics ||
6844 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
6845 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6846 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6847 message.push_back(metric);
6848 }
6849
6850 // read io sizes
6851 if (_collect_and_send_global_metrics ||
6852 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
6853 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6854 total_read_size));
6855 message.push_back(metric);
6856 }
6857
6858 // write io sizes
6859 if (_collect_and_send_global_metrics ||
6860 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
6861 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6862 total_write_size));
6863 message.push_back(metric);
6864 }
6865
6866 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6867 }
6868
6869 void Client::renew_caps()
6870 {
6871 ldout(cct, 10) << "renew_caps()" << dendl;
6872 last_cap_renew = ceph::coarse_mono_clock::now();
6873
6874 for (auto &p : mds_sessions) {
6875 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6876 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6877 renew_caps(p.second.get());
6878 }
6879 }
6880
6881 void Client::renew_caps(MetaSession *session)
6882 {
6883 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6884 session->last_cap_renew_request = ceph_clock_now();
6885 uint64_t seq = ++session->cap_renew_seq;
6886 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6887 }
6888
6889
6890 // ===============================================================
6891 // high level (POSIXy) interface
6892
6893 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6894 InodeRef *target, const UserPerm& perms)
6895 {
6896 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6897 MetaRequest *req = new MetaRequest(op);
6898 filepath path;
6899 dir->make_nosnap_relative_path(path);
6900 path.push_dentry(name);
6901 req->set_filepath(path);
6902 req->set_inode(dir);
6903 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6904 mask |= DEBUG_GETATTR_CAPS;
6905 req->head.args.getattr.mask = mask;
6906
6907 ldout(cct, 10) << __func__ << " on " << path << dendl;
6908
6909 int r = make_request(req, perms, target);
6910 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6911 return r;
6912 }
6913
6914 bool Client::_dentry_valid(const Dentry *dn)
6915 {
6916 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6917
6918 // is dn lease valid?
6919 utime_t now = ceph_clock_now();
6920 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6921 mds_sessions.count(dn->lease_mds)) {
6922 auto s = mds_sessions.at(dn->lease_mds);
6923 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
6924 dlease_hit();
6925 return true;
6926 }
6927
6928 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6929 << " vs lease_gen " << dn->lease_gen << dendl;
6930 }
6931
6932 dlease_miss();
6933 return false;
6934 }
6935
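// Worked example of the lease check above (numbers made up): a dentry with
// lease_mds=0, lease_ttl=now+5s and lease_gen=3 counts as a dlease hit
// while the mds.0 session still has cap_ttl > now and cap_gen == 3; once
// the session goes stale and cap_gen is bumped to 4, the same lease is
// rejected (a dlease miss) even though its own ttl has not expired.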
6936 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6937 const UserPerm& perms, std::string* alternate_name)
6938 {
6939 int r = 0;
6940 Dentry *dn = NULL;
6941 bool did_lookup_request = false;
6942 // can only request shared caps
6943 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6944
6945 if (dname == "..") {
6946 if (dir->dentries.empty()) {
6947 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6948 filepath path(dir->ino);
6949 req->set_filepath(path);
6950
6951 InodeRef tmptarget;
6952 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6953
6954 if (r == 0) {
6955 *target = std::move(tmptarget);
6956 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6957 } else {
6958 *target = dir;
6959 }
6960 }
6961 else
6962 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6963 goto done;
6964 }
6965
6966 if (dname == ".") {
6967 *target = dir;
6968 goto done;
6969 }
6970
6971 if (!dir->is_dir()) {
6972 r = -CEPHFS_ENOTDIR;
6973 goto done;
6974 }
6975
6976 if (dname.length() > NAME_MAX) {
6977 r = -CEPHFS_ENAMETOOLONG;
6978 goto done;
6979 }
6980
6981 if (dname == cct->_conf->client_snapdir &&
6982 dir->snapid == CEPH_NOSNAP) {
6983 *target = open_snapdir(dir);
6984 goto done;
6985 }
6986
6987 relookup:
6988 if (dir->dir &&
6989 dir->dir->dentries.count(dname)) {
6990 dn = dir->dir->dentries[dname];
6991
6992 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6993 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6994
6995 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6996 if (_dentry_valid(dn)) {
6997 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6998 // make trim_caps() behave.
6999 dir->try_touch_cap(dn->lease_mds);
7000 goto hit_dn;
7001 }
7002 // dir shared caps?
7003 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7004 if (dn->cap_shared_gen == dir->shared_gen &&
7005 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
7006 goto hit_dn;
7007 if (!dn->inode && (dir->flags & I_COMPLETE)) {
7008 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
7009 << *dir << " dn '" << dname << "'" << dendl;
7010 return -CEPHFS_ENOENT;
7011 }
7012 }
7013 } else {
7014 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
7015 }
7016 } else {
7017 // can we conclude ENOENT locally?
7018 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
7019 (dir->flags & I_COMPLETE)) {
7020 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
7021 return -CEPHFS_ENOENT;
7022 }
7023 }
7024
7025 if (did_lookup_request) {
7026 r = 0;
7027 goto done;
7028 }
7029 r = _do_lookup(dir, dname, mask, target, perms);
7030 did_lookup_request = true;
7031 if (r == 0) {
7032 /* complete lookup to get dentry for alternate_name */
7033 goto relookup;
7034 } else {
7035 goto done;
7036 }
7037
7038 hit_dn:
7039 if (dn->inode) {
7040 *target = dn->inode;
7041 if (alternate_name)
7042 *alternate_name = dn->alternate_name;
7043 } else {
7044 r = -CEPHFS_ENOENT;
7045 }
7046 touch_dn(dn);
7047 goto done;
7048
7049 done:
7050 if (r < 0)
7051 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
7052 else
7053 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
7054 return r;
7055 }
7056
7057 int Client::get_or_create(Inode *dir, const char* name,
7058 Dentry **pdn, bool expect_null)
7059 {
7060 // lookup
7061 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7062 dir->open_dir();
7063 if (dir->dir->dentries.count(name)) {
7064 Dentry *dn = dir->dir->dentries[name];
7065 if (_dentry_valid(dn)) {
7066 if (expect_null)
7067 return -CEPHFS_EEXIST;
7068 }
7069 *pdn = dn;
7070 } else {
7071 // otherwise link up a new one
7072 *pdn = link(dir->dir, name, NULL, NULL);
7073 }
7074
7075 // success
7076 return 0;
7077 }
7078
7079 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7080 {
7081 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7082 if (!mref_reader.is_state_satisfied())
7083 return -CEPHFS_ENOTCONN;
7084
7085 ldout(cct, 10) << __func__ << ": " << path << dendl;
7086
7087 std::scoped_lock lock(client_lock);
7088
7089 return path_walk(path, wdr, perms, followsym);
7090 }
7091
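// A hedged usage sketch for walk() (the path and helper are illustrative);
// kept out of the build:
#if 0
static int example_walk(Client *client, const UserPerm &perms)
{
  walk_dentry_result wdr;
  int r = client->walk("/some/dir/file", &wdr, perms, true /* followsym */);
  if (r == 0) {
    // wdr.in now holds an InodeRef to the target; wdr.alternate_name
    // carries the dentry's alternate name, if one was set
  }
  return r;
}
#endif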
7092 int Client::path_walk(const filepath& origpath, InodeRef *end,
7093 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
7094 {
7095 walk_dentry_result wdr;
7096 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
7097 *end = std::move(wdr.in);
7098 return rc;
7099 }
7100
7101 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
7102 bool followsym, int mask, InodeRef dirinode)
7103 {
7104 filepath path = origpath;
7105 InodeRef cur;
7106 std::string alternate_name;
7107 if (origpath.absolute())
7108 cur = root;
7109 else if (!dirinode)
7110 cur = cwd;
7111 else {
7112 cur = dirinode;
7113 }
7114 ceph_assert(cur);
7115
7116 ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
7117 ldout(cct, 10) << __func__ << " " << path << dendl;
7118
7119 int symlinks = 0;
7120
7121 unsigned i=0;
7122 while (i < path.depth() && cur) {
7123 int caps = 0;
7124 const string &dname = path[i];
7125 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
7126 ldout(cct, 20) << " (path is " << path << ")" << dendl;
7127 InodeRef next;
7128 if (cct->_conf->client_permissions) {
7129 int r = may_lookup(cur.get(), perms);
7130 if (r < 0)
7131 return r;
7132 caps = CEPH_CAP_AUTH_SHARED;
7133 }
7134
7135 /* Get extra requested caps on the last component */
7136 if (i == (path.depth() - 1))
7137 caps |= mask;
7138 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
7139 if (r < 0)
7140 return r;
7141 // only follow trailing symlink if followsym. always follow
7142 // 'directory' symlinks.
7143 if (next && next->is_symlink()) {
7144 symlinks++;
7145 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
7146 if (symlinks > MAXSYMLINKS) {
7147 return -CEPHFS_ELOOP;
7148 }
7149
7150 if (i < path.depth() - 1) {
7151 // dir symlink
7152 // replace consumed components of path with symlink dir target
7153 filepath resolved(next->symlink.c_str());
7154 resolved.append(path.postfixpath(i + 1));
7155 path = resolved;
7156 i = 0;
7157 if (next->symlink[0] == '/') {
7158 cur = root;
7159 }
7160 continue;
7161 } else if (followsym) {
7162 if (next->symlink[0] == '/') {
7163 path = next->symlink.c_str();
7164 i = 0;
7165 // reset position
7166 cur = root;
7167 } else {
7168 filepath more(next->symlink.c_str());
7169 // we need to remove the symlink component from the path
7170 // before appending the target that the symlink points to. remain
7171 // at the same position in the path.
7172 path.pop_dentry();
7173 path.append(more);
7174 }
7175 continue;
7176 }
7177 }
7178 cur.swap(next);
7179 i++;
7180 }
7181 if (!cur)
7182 return -CEPHFS_ENOENT;
7183 if (result) {
7184 result->in = std::move(cur);
7185 result->alternate_name = std::move(alternate_name);
7186 }
7187 return 0;
7188 }
7189
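// Worked trace of the symlink handling above (paths made up): walking
// "a/ln/b" where "ln" is a symlink to "/d" splices the remainder into
// "/d/b", resets i to 0 and restarts from root because the target is
// absolute. A trailing relative symlink "x -> y" (with followsym) instead
// pops "x" and appends "y", staying at the same depth. After MAXSYMLINKS
// expansions the walk fails with -CEPHFS_ELOOP.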
7190
7191 // namespace ops
7192
7193 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7194 {
7195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7196 if (!mref_reader.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN;
7198
7199 tout(cct) << "link" << std::endl;
7200 tout(cct) << relexisting << std::endl;
7201 tout(cct) << relpath << std::endl;
7202
7203 filepath existing(relexisting);
7204
7205 InodeRef in, dir;
7206
7207 std::scoped_lock lock(client_lock);
7208 int r = path_walk(existing, &in, perm, true);
7209 if (r < 0)
7210 return r;
7211 if (std::string(relpath) == "/") {
7212 r = -CEPHFS_EEXIST;
7213 return r;
7214 }
7215 filepath path(relpath);
7216 string name = path.last_dentry();
7217 path.pop_dentry();
7218
7219 r = path_walk(path, &dir, perm, true);
7220 if (r < 0)
7221 return r;
7222 if (cct->_conf->client_permissions) {
7223 if (S_ISDIR(in->mode)) {
7224 r = -CEPHFS_EPERM;
7225 return r;
7226 }
7227 r = may_hardlink(in.get(), perm);
7228 if (r < 0)
7229 return r;
7230 r = may_create(dir.get(), perm);
7231 if (r < 0)
7232 return r;
7233 }
7234 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7235 return r;
7236 }
7237
7238 int Client::unlink(const char *relpath, const UserPerm& perm)
7239 {
7240 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7241 }
7242
7243 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7244 {
7245 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7246 if (!mref_reader.is_state_satisfied()) {
7247 return -CEPHFS_ENOTCONN;
7248 }
7249
7250 tout(cct) << __func__ << std::endl;
7251 tout(cct) << dirfd << std::endl;
7252 tout(cct) << relpath << std::endl;
7253 tout(cct) << flags << std::endl;
7254
7255 if (std::string(relpath) == "/") {
7256 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7257 }
7258
7259 filepath path(relpath);
7260 string name = path.last_dentry();
7261 path.pop_dentry();
7262 InodeRef dir;
7263
7264 std::scoped_lock lock(client_lock);
7265
7266 InodeRef dirinode;
7267 int r = get_fd_inode(dirfd, &dirinode);
7268 if (r < 0) {
7269 return r;
7270 }
7271
7272 r = path_walk(path, &dir, perm, true, 0, dirinode);
7273 if (r < 0) {
7274 return r;
7275 }
7276 if (cct->_conf->client_permissions) {
7277 r = may_delete(dir.get(), name.c_str(), perm);
7278 if (r < 0) {
7279 return r;
7280 }
7281 }
7282 if (flags & AT_REMOVEDIR) {
7283 r = _rmdir(dir.get(), name.c_str(), perm);
7284 } else {
7285 r = _unlink(dir.get(), name.c_str(), perm);
7286 }
7287 return r;
7288 }
7289
7290 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7291 {
7292 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7293 if (!mref_reader.is_state_satisfied())
7294 return -CEPHFS_ENOTCONN;
7295
7296 tout(cct) << __func__ << std::endl;
7297 tout(cct) << relfrom << std::endl;
7298 tout(cct) << relto << std::endl;
7299
7300 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7301 return -CEPHFS_EBUSY;
7302
7303 filepath from(relfrom);
7304 filepath to(relto);
7305 string fromname = from.last_dentry();
7306 from.pop_dentry();
7307 string toname = to.last_dentry();
7308 to.pop_dentry();
7309
7310 InodeRef fromdir, todir;
7311
7312 std::scoped_lock lock(client_lock);
7313 int r = path_walk(from, &fromdir, perm);
7314 if (r < 0)
7315 goto out;
7316 r = path_walk(to, &todir, perm);
7317 if (r < 0)
7318 goto out;
7319
7320 if (cct->_conf->client_permissions) {
7321 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7322 if (r < 0)
7323 return r;
7324 r = may_delete(todir.get(), toname.c_str(), perm);
7325 if (r < 0 && r != -CEPHFS_ENOENT)
7326 return r;
7327 }
7328 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7329 out:
7330 return r;
7331 }
7332
7333 // dirs
7334
7335 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7336 {
7337 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7338 }
7339
7340 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7341 std::string alternate_name)
7342 {
7343 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7344 if (!mref_reader.is_state_satisfied())
7345 return -CEPHFS_ENOTCONN;
7346
7347 tout(cct) << __func__ << std::endl;
7348 tout(cct) << dirfd << std::endl;
7349 tout(cct) << relpath << std::endl;
7350 tout(cct) << mode << std::endl;
7351 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7352
7353 if (std::string(relpath) == "/") {
7354 return -CEPHFS_EEXIST;
7355 }
7356
7357 filepath path(relpath);
7358 string name = path.last_dentry();
7359 path.pop_dentry();
7360 InodeRef dir;
7361
7362 std::scoped_lock lock(client_lock);
7363
7364 InodeRef dirinode;
7365 int r = get_fd_inode(dirfd, &dirinode);
7366 if (r < 0) {
7367 return r;
7368 }
7369
7370 r = path_walk(path, &dir, perm, true, 0, dirinode);
7371 if (r < 0) {
7372 return r;
7373 }
7374 if (cct->_conf->client_permissions) {
7375 r = may_create(dir.get(), perm);
7376 if (r < 0) {
7377 return r;
7378 }
7379 }
7380 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7381 }
7382
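// mkdirs() behaves like `mkdir -p`: walk the already-existing prefix of the
// path, then issue one _mkdir() per missing component, falling back to
// _lookup() on CEPHFS_EEXIST to tolerate a racing creator.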
7383 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7384 {
7385 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7386 if (!mref_reader.is_state_satisfied())
7387 return -CEPHFS_ENOTCONN;
7388
7389 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7390 tout(cct) << __func__ << std::endl;
7391 tout(cct) << relpath << std::endl;
7392 tout(cct) << mode << std::endl;
7393
7394 //get through existing parts of path
7395 filepath path(relpath);
7396 unsigned int i;
7397 int r = 0, caps = 0;
7398 InodeRef cur, next;
7399
7400 std::scoped_lock lock(client_lock);
7401 cur = cwd;
7402 for (i=0; i<path.depth(); ++i) {
7403 if (cct->_conf->client_permissions) {
7404 r = may_lookup(cur.get(), perms);
7405 if (r < 0)
7406 break;
7407 caps = CEPH_CAP_AUTH_SHARED;
7408 }
7409 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7410 if (r < 0)
7411 break;
7412 cur.swap(next);
7413 }
7414 if (r != -CEPHFS_ENOENT) return r;
7415 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7416 //make new directory at each level
7417 for (; i<path.depth(); ++i) {
7418 if (cct->_conf->client_permissions) {
7419 r = may_create(cur.get(), perms);
7420 if (r < 0)
7421 return r;
7422 }
7423 //make new dir
7424 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7425
7426 //check proper creation/existence
7427 if (-CEPHFS_EEXIST == r && i < path.depth() - 1) {
7428 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7429 }
7430 if (r < 0)
7431 return r;
7432 //move to new dir and continue
7433 cur.swap(next);
7434 ldout(cct, 20) << __func__ << ": successfully created directory "
7435 << filepath(cur->ino).get_path() << dendl;
7436 }
7437 return 0;
7438 }
7439
7440 int Client::rmdir(const char *relpath, const UserPerm& perms)
7441 {
7442 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7443 }
7444
7445 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7446 {
7447 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7448 if (!mref_reader.is_state_satisfied())
7449 return -CEPHFS_ENOTCONN;
7450
7451 tout(cct) << __func__ << std::endl;
7452 tout(cct) << relpath << std::endl;
7453 tout(cct) << mode << std::endl;
7454 tout(cct) << rdev << std::endl;
7455
7456 if (std::string(relpath) == "/")
7457 return -CEPHFS_EEXIST;
7458
7459 filepath path(relpath);
7460 string name = path.last_dentry();
7461 path.pop_dentry();
7462 InodeRef dir;
7463
7464 std::scoped_lock lock(client_lock);
7465 int r = path_walk(path, &dir, perms);
7466 if (r < 0)
7467 return r;
7468 if (cct->_conf->client_permissions) {
7469 int r = may_create(dir.get(), perms);
7470 if (r < 0)
7471 return r;
7472 }
7473 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7474 }
7475
7476 // symlinks
7477
7478 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7479 {
7480 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7481 }
7482
7483 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7484 std::string alternate_name)
7485 {
7486 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7487 if (!mref_reader.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN;
7489 }
7490
7491 tout(cct) << __func__ << std::endl;
7492 tout(cct) << target << std::endl;
7493 tout(cct) << dirfd << std::endl;
7494 tout(cct) << relpath << std::endl;
7495
7496 if (std::string(relpath) == "/") {
7497 return -CEPHFS_EEXIST;
7498 }
7499
7500 filepath path(relpath);
7501 string name = path.last_dentry();
7502 path.pop_dentry();
7503 InodeRef dir;
7504
7505 std::scoped_lock lock(client_lock);
7506
7507 InodeRef dirinode;
7508 int r = get_fd_inode(dirfd, &dirinode);
7509 if (r < 0) {
7510 return r;
7511 }
7512 r = path_walk(path, &dir, perms, true, 0, dirinode);
7513 if (r < 0) {
7514 return r;
7515 }
7516 if (cct->_conf->client_permissions) {
7517 int r = may_create(dir.get(), perms);
7518 if (r < 0) {
7519 return r;
7520 }
7521 }
7522 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7523 }
7524
7525 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7526 {
7527 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7528 }
7529
7530 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7531 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7532 if (!mref_reader.is_state_satisfied()) {
7533 return -CEPHFS_ENOTCONN;
7534 }
7535
7536 tout(cct) << __func__ << std::endl;
7537 tout(cct) << dirfd << std::endl;
7538 tout(cct) << relpath << std::endl;
7539
7540 InodeRef dirinode;
7541 std::scoped_lock lock(client_lock);
7542 int r = get_fd_inode(dirfd, &dirinode);
7543 if (r < 0) {
7544 return r;
7545 }
7546
7547 InodeRef in;
7548 filepath path(relpath);
7549 r = path_walk(path, &in, perms, false, 0, dirinode);
7550 if (r < 0) {
7551 return r;
7552 }
7553
7554 return _readlink(in.get(), buf, size);
7555 }
7556
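// Like readlink(2), _readlink() copies at most `size` bytes of the link
// target into `buf` without a trailing NUL, returning the number of bytes
// copied (silently truncated if the target is longer than `size`).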
7557 int Client::_readlink(Inode *in, char *buf, size_t size)
7558 {
7559 if (!in->is_symlink())
7560 return -CEPHFS_EINVAL;
7561
7562 // copy into buf (at most size bytes)
7563 int r = in->symlink.length();
7564 if (r > (int)size)
7565 r = size;
7566 memcpy(buf, in->symlink.c_str(), r);
7567 return r;
7568 }
7569
7570
7571 // inode stuff
7572
7573 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7574 {
7575 bool yes = in->caps_issued_mask(mask, true);
7576
7577 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7578 if (yes && !force)
7579 return 0;
7580
7581 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7582 filepath path;
7583 in->make_nosnap_relative_path(path);
7584 req->set_filepath(path);
7585 req->set_inode(in);
7586 req->head.args.getattr.mask = mask;
7587
7588 int res = make_request(req, perms);
7589 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7590 return res;
7591 }
7592
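// Fetch a CephFS virtual xattr (e.g. "ceph.dir.pin") from the MDS; `rank`
// may direct the request at a specific MDS rank. Mirroring getxattr(2), a
// zero `size` probes for the value length, and a longer value yields
// -CEPHFS_ERANGE.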
7593 int Client::_getvxattr(
7594 Inode *in,
7595 const UserPerm& perms,
7596 const char *xattr_name,
7597 ssize_t size,
7598 void *value,
7599 mds_rank_t rank)
7600 {
7601 if (!xattr_name || strlen(xattr_name) == 0 || strlen(xattr_name) > 255) {
7602 return -CEPHFS_ENODATA;
7603 }
7604
7605 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
7606 filepath path;
7607 in->make_nosnap_relative_path(path);
7608 req->set_filepath(path);
7609 req->set_inode(in);
7610 req->set_string2(xattr_name);
7611
7612 bufferlist bl;
7613 int res = make_request(req, perms, nullptr, nullptr, rank, &bl);
7614 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7615
7616 if (res < 0) {
7617 return res;
7618 }
7619
7620 std::string buf;
7621 auto p = bl.cbegin();
7622
7623 DECODE_START(1, p);
7624 decode(buf, p);
7625 DECODE_FINISH(p);
7626
7627 ssize_t len = buf.length();
7628
7629 res = len; // per getxattr(2), a zero-size output buffer returns the value length
7630
7631 if (size > 0) {
7632 if (len > size) {
7633 res = -CEPHFS_ERANGE; // insufficient output buffer space
7634 } else {
7635 memcpy(value, buf.c_str(), len);
7636 }
7637 }
7638 return res;
7639 }
7640
7641 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7642 const UserPerm& perms, InodeRef *inp)
7643 {
7644 int issued = in->caps_issued();
7645 union ceph_mds_request_args args;
7646 bool kill_sguid = false;
7647 int inode_drop = 0;
7648
7649 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7650 ccap_string(issued) << dendl;
7651
7652 if (in->snapid != CEPH_NOSNAP) {
7653 return -CEPHFS_EROFS;
7654 }
7655 if ((mask & CEPH_SETATTR_SIZE) &&
7656 (uint64_t)stx->stx_size > in->size &&
7657 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7658 perms)) {
7659 return -CEPHFS_EDQUOT;
7660 }
7661
7662 memset(&args, 0, sizeof(args));
7663
7664 // make the change locally?
7665 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7666 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7667 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7668 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7669 << in->cap_dirtier_gid << ", forcing sync setattr"
7670 << dendl;
7671 /*
7672 * This works because we implicitly flush the caps as part of the
7673 * request, so the cap update check will happen with the writeback
7674 * cap context, and then the setattr check will happen with the
7675 * caller's context.
7676 *
7677 * In reality this pattern is likely pretty rare (different users
7678 * setattr'ing the same file). If that turns out not to be the
7679 * case later, we can build a more complex pipelined cap writeback
7680 * infrastructure...
7681 */
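/*
 * Concrete case: uid 1000 holds Ax and dirtied ctime locally; uid 0 now
 * calls setattr on the same inode. Adding CEPH_SETATTR_CTIME below keeps
 * the mask non-zero, forcing a synchronous MDS request that writes back
 * the uid-1000 dirty caps first and then applies this change under the
 * caller's credentials.
 */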
7682 mask |= CEPH_SETATTR_CTIME;
7683 }
7684
7685 if (!mask) {
7686 // caller just needs us to bump the ctime
7687 in->ctime = ceph_clock_now();
7688 in->cap_dirtier_uid = perms.uid();
7689 in->cap_dirtier_gid = perms.gid();
7690 if (issued & CEPH_CAP_AUTH_EXCL)
7691 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7692 else if (issued & CEPH_CAP_FILE_EXCL)
7693 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7694 else if (issued & CEPH_CAP_XATTR_EXCL)
7695 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7696 else
7697 mask |= CEPH_SETATTR_CTIME;
7698 }
7699
7700 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7701 kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7702
7703 mask &= ~CEPH_SETATTR_KILL_SGUID;
7704 } else if (mask & CEPH_SETATTR_SIZE) {
7705 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7706 mask |= CEPH_SETATTR_KILL_SGUID;
7707 inode_drop |= CEPH_CAP_AUTH_SHARED;
7708 }
7709
7710 if (mask & CEPH_SETATTR_UID) {
7711 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7712
7713 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7714 in->ctime = ceph_clock_now();
7715 in->cap_dirtier_uid = perms.uid();
7716 in->cap_dirtier_gid = perms.gid();
7717 in->uid = stx->stx_uid;
7718 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7719 mask &= ~CEPH_SETATTR_UID;
7720 kill_sguid = true;
7721 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7722 in->uid != stx->stx_uid) {
7723 args.setattr.uid = stx->stx_uid;
7724 inode_drop |= CEPH_CAP_AUTH_SHARED;
7725 } else {
7726 mask &= ~CEPH_SETATTR_UID;
7727 }
7728 }
7729
7730 if (mask & CEPH_SETATTR_GID) {
7731 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7732
7733 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7734 in->ctime = ceph_clock_now();
7735 in->cap_dirtier_uid = perms.uid();
7736 in->cap_dirtier_gid = perms.gid();
7737 in->gid = stx->stx_gid;
7738 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7739 mask &= ~CEPH_SETATTR_GID;
7740 kill_sguid = true;
7741 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7742 in->gid != stx->stx_gid) {
7743 args.setattr.gid = stx->stx_gid;
7744 inode_drop |= CEPH_CAP_AUTH_SHARED;
7745 } else {
7746 mask &= ~CEPH_SETATTR_GID;
7747 }
7748 }
7749
7750 if (mask & CEPH_SETATTR_MODE) {
7751 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7752
7753 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7754 in->ctime = ceph_clock_now();
7755 in->cap_dirtier_uid = perms.uid();
7756 in->cap_dirtier_gid = perms.gid();
7757 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7758 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7759 mask &= ~CEPH_SETATTR_MODE;
7760 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7761 in->mode != stx->stx_mode) {
7762 args.setattr.mode = stx->stx_mode;
7763 inode_drop |= CEPH_CAP_AUTH_SHARED;
7764 } else {
7765 mask &= ~CEPH_SETATTR_MODE;
7766 }
7767 } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
7768 kill_sguid && S_ISREG(in->mode) &&
7769 (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7770 /* Must squash any setuid/setgid bits with an ownership change */
7771 in->mode &= ~(S_ISUID|S_ISGID);
7772 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7773 }
7774
7775 if (mask & CEPH_SETATTR_BTIME) {
7776 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7777
7778 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7779 in->ctime = ceph_clock_now();
7780 in->cap_dirtier_uid = perms.uid();
7781 in->cap_dirtier_gid = perms.gid();
7782 in->btime = utime_t(stx->stx_btime);
7783 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7784 mask &= ~CEPH_SETATTR_BTIME;
7785 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7786 in->btime != utime_t(stx->stx_btime)) {
7787 args.setattr.btime = utime_t(stx->stx_btime);
7788 inode_drop |= CEPH_CAP_AUTH_SHARED;
7789 } else {
7790 mask &= ~CEPH_SETATTR_BTIME;
7791 }
7792 }
7793
7794 if (mask & CEPH_SETATTR_SIZE) {
7795 if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
7796 //too big!
7797 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7798 return -CEPHFS_EFBIG;
7799 }
7800
7801 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7802 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
7803 !(mask & CEPH_SETATTR_KILL_SGUID) &&
7804 stx->stx_size >= in->size) {
7805 if (stx->stx_size > in->size) {
7806 in->size = in->reported_size = stx->stx_size;
7807 in->cap_dirtier_uid = perms.uid();
7808 in->cap_dirtier_gid = perms.gid();
7809 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7810 mask &= ~(CEPH_SETATTR_SIZE);
7811 mask |= CEPH_SETATTR_MTIME;
7812 } else {
7813 // ignore it when size doesn't change
7814 mask &= ~(CEPH_SETATTR_SIZE);
7815 }
7816 } else {
7817 args.setattr.size = stx->stx_size;
7818 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7819 CEPH_CAP_FILE_WR;
7820 }
7821 }
7822
7823 if (mask & CEPH_SETATTR_MTIME) {
7824 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7825 in->mtime = utime_t(stx->stx_mtime);
7826 in->ctime = ceph_clock_now();
7827 in->cap_dirtier_uid = perms.uid();
7828 in->cap_dirtier_gid = perms.gid();
7829 in->time_warp_seq++;
7830 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7831 mask &= ~CEPH_SETATTR_MTIME;
7832 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7833 utime_t(stx->stx_mtime) > in->mtime) {
7834 in->mtime = utime_t(stx->stx_mtime);
7835 in->ctime = ceph_clock_now();
7836 in->cap_dirtier_uid = perms.uid();
7837 in->cap_dirtier_gid = perms.gid();
7838 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7839 mask &= ~CEPH_SETATTR_MTIME;
7840 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7841 in->mtime != utime_t(stx->stx_mtime)) {
7842 args.setattr.mtime = utime_t(stx->stx_mtime);
7843 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7844 CEPH_CAP_FILE_WR;
7845 } else {
7846 mask &= ~CEPH_SETATTR_MTIME;
7847 }
7848 }
7849
7850 if (mask & CEPH_SETATTR_ATIME) {
7851 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7852 in->atime = utime_t(stx->stx_atime);
7853 in->ctime = ceph_clock_now();
7854 in->cap_dirtier_uid = perms.uid();
7855 in->cap_dirtier_gid = perms.gid();
7856 in->time_warp_seq++;
7857 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7858 mask &= ~CEPH_SETATTR_ATIME;
7859 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7860 utime_t(stx->stx_atime) > in->atime) {
7861 in->atime = utime_t(stx->stx_atime);
7862 in->ctime = ceph_clock_now();
7863 in->cap_dirtier_uid = perms.uid();
7864 in->cap_dirtier_gid = perms.gid();
7865 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7866 mask &= ~CEPH_SETATTR_ATIME;
7867 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7868 in->atime != utime_t(stx->stx_atime)) {
7869 args.setattr.atime = utime_t(stx->stx_atime);
7870 inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7871 CEPH_CAP_FILE_WR;
7872 } else {
7873 mask &= ~CEPH_SETATTR_ATIME;
7874 }
7875 }
7876
7877 if (!mask) {
7878 in->change_attr++;
7879 return 0;
7880 }
7881
7882 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7883
7884 filepath path;
7885
7886 in->make_nosnap_relative_path(path);
7887 req->set_filepath(path);
7888 req->set_inode(in);
7889
7890 req->head.args = args;
7891 req->inode_drop = inode_drop;
7892 req->head.args.setattr.mask = mask;
7893 req->regetattr_mask = mask;
7894
7895 int res = make_request(req, perms, inp);
7896 ldout(cct, 10) << "_setattr result=" << res << dendl;
7897 return res;
7898 }
7899
7900 /* Note that we only care about attrs that setattr cares about */
7901 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7902 {
7903 stx->stx_size = st->st_size;
7904 stx->stx_mode = st->st_mode;
7905 stx->stx_uid = st->st_uid;
7906 stx->stx_gid = st->st_gid;
7907 #ifdef __APPLE__
7908 stx->stx_mtime = st->st_mtimespec;
7909 stx->stx_atime = st->st_atimespec;
7910 #elif defined(_WIN32)
7911 stx->stx_mtime.tv_sec = st->st_mtime;
7912 stx->stx_atime.tv_sec = st->st_atime;
7913 #else
7914 stx->stx_mtime = st->st_mtim;
7915 stx->stx_atime = st->st_atim;
7916 #endif
7917 }
7918
7919 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7920 const UserPerm& perms, InodeRef *inp)
7921 {
7922 int ret = _do_setattr(in, stx, mask, perms, inp);
7923 if (ret < 0)
7924 return ret;
7925 if (mask & CEPH_SETATTR_MODE)
7926 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7927 return ret;
7928 }
7929
7930 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7931 const UserPerm& perms)
7932 {
7933 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7934 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7935 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7936 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7937 if (cct->_conf->client_permissions) {
7938 int r = may_setattr(in.get(), stx, mask, perms);
7939 if (r < 0)
7940 return r;
7941 }
7942 return __setattrx(in.get(), stx, mask, perms);
7943 }
7944
7945 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7946 const UserPerm& perms)
7947 {
7948 struct ceph_statx stx;
7949
7950 stat_to_statx(attr, &stx);
7951 mask &= ~CEPH_SETATTR_BTIME;
7952
7953 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7954 mask &= ~CEPH_SETATTR_UID;
7955 }
7956 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7957 mask &= ~CEPH_SETATTR_GID;
7958 }
7959
7960 return _setattrx(in, &stx, mask, perms);
7961 }
7962
7963 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7964 const UserPerm& perms)
7965 {
7966 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7967 if (!mref_reader.is_state_satisfied())
7968 return -CEPHFS_ENOTCONN;
7969
7970 tout(cct) << __func__ << std::endl;
7971 tout(cct) << relpath << std::endl;
7972 tout(cct) << mask << std::endl;
7973
7974 filepath path(relpath);
7975 InodeRef in;
7976
7977 std::scoped_lock lock(client_lock);
7978 int r = path_walk(path, &in, perms);
7979 if (r < 0)
7980 return r;
7981 return _setattr(in, attr, mask, perms);
7982 }
7983
7984 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7985 const UserPerm& perms, int flags)
7986 {
7987 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7988 if (!mref_reader.is_state_satisfied())
7989 return -CEPHFS_ENOTCONN;
7990
7991 tout(cct) << __func__ << std::endl;
7992 tout(cct) << relpath << std::endl;
7993 tout(cct) << mask << std::endl;
7994
7995 filepath path(relpath);
7996 InodeRef in;
7997
7998 std::scoped_lock lock(client_lock);
7999 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
8000 if (r < 0)
8001 return r;
8002 return _setattrx(in, stx, mask, perms);
8003 }
8004
8005 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
8006 {
8007 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8008 if (!mref_reader.is_state_satisfied())
8009 return -CEPHFS_ENOTCONN;
8010
8011 tout(cct) << __func__ << std::endl;
8012 tout(cct) << fd << std::endl;
8013 tout(cct) << mask << std::endl;
8014
8015 std::scoped_lock lock(client_lock);
8016 Fh *f = get_filehandle(fd);
8017 if (!f)
8018 return -CEPHFS_EBADF;
8019 #if defined(__linux__) && defined(O_PATH)
8020 if (f->flags & O_PATH)
8021 return -CEPHFS_EBADF;
8022 #endif
8023 return _setattr(f->inode, attr, mask, perms);
8024 }
8025
8026 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
8027 {
8028 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8029 if (!mref_reader.is_state_satisfied())
8030 return -CEPHFS_ENOTCONN;
8031
8032 tout(cct) << __func__ << std::endl;
8033 tout(cct) << fd << std::endl;
8034 tout(cct) << mask << std::endl;
8035
8036 std::scoped_lock lock(client_lock);
8037 Fh *f = get_filehandle(fd);
8038 if (!f)
8039 return -CEPHFS_EBADF;
8040 #if defined(__linux__) && defined(O_PATH)
8041 if (f->flags & O_PATH)
8042 return -CEPHFS_EBADF;
8043 #endif
8044 return _setattrx(f->inode, stx, mask, perms);
8045 }
8046
8047 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
8048 frag_info_t *dirstat, int mask)
8049 {
8050 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8051 if (!mref_reader.is_state_satisfied())
8052 return -CEPHFS_ENOTCONN;
8053
8054 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8055 tout(cct) << "stat" << std::endl;
8056 tout(cct) << relpath << std::endl;
8057
8058 filepath path(relpath);
8059 InodeRef in;
8060
8061 std::scoped_lock lock(client_lock);
8062 int r = path_walk(path, &in, perms, true, mask);
8063 if (r < 0)
8064 return r;
8065 r = _getattr(in, mask, perms);
8066 if (r < 0) {
8067 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8068 return r;
8069 }
8070 fill_stat(in, stbuf, dirstat);
8071 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8072 return r;
8073 }
8074
8075 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8076 {
8077 unsigned mask = 0;
8078
8079 /* AT_STATX_FORCE_SYNC always takes priority over AT_STATX_DONT_SYNC. */
8080 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
8081 goto out;
8082
8083 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8084 mask |= CEPH_CAP_PIN;
8085 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8086 mask |= CEPH_CAP_AUTH_SHARED;
8087 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8088 mask |= CEPH_CAP_LINK_SHARED;
8089 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
8090 mask |= CEPH_CAP_FILE_SHARED;
8091 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8092 mask |= CEPH_CAP_XATTR_SHARED;
8093 out:
8094 return mask;
8095 }
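// e.g. want = CEPH_STATX_MTIME with AT_STATX_FORCE_SYNC maps to
// CEPH_CAP_PIN|CEPH_CAP_FILE_SHARED (a fresh Fs view of mtime), while
// AT_STATX_DONT_SYNC maps to 0 and the statx is served from the cache.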
8096
8097 int Client::statx(const char *relpath, struct ceph_statx *stx,
8098 const UserPerm& perms,
8099 unsigned int want, unsigned int flags)
8100 {
8101 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
8102 }
8103
8104 int Client::lstat(const char *relpath, struct stat *stbuf,
8105 const UserPerm& perms, frag_info_t *dirstat, int mask)
8106 {
8107 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8108 if (!mref_reader.is_state_satisfied())
8109 return -CEPHFS_ENOTCONN;
8110
8111 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8112 tout(cct) << __func__ << std::endl;
8113 tout(cct) << relpath << std::endl;
8114
8115 filepath path(relpath);
8116 InodeRef in;
8117
8118 std::scoped_lock lock(client_lock);
8119 // don't follow symlinks
8120 int r = path_walk(path, &in, perms, false, mask);
8121 if (r < 0)
8122 return r;
8123 r = _getattr(in, mask, perms);
8124 if (r < 0) {
8125 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8126 return r;
8127 }
8128 fill_stat(in, stbuf, dirstat);
8129 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8130 return r;
8131 }
8132
8133 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
8134 {
8135 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
8136 << " mode 0" << oct << in->mode << dec
8137 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
8138 memset(st, 0, sizeof(struct stat));
8139 if (use_faked_inos())
8140 st->st_ino = in->faked_ino;
8141 else
8142 st->st_ino = in->ino;
8143 st->st_dev = in->snapid;
8144 st->st_mode = in->mode;
8145 st->st_rdev = in->rdev;
8146 if (in->is_dir()) {
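// e.g. a directory with 3 subdirectories reports st_nlink =
// 1 (parent dentry) + 1 (its own ".") + 3 (each subdir's "..") = 5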
8147 switch (in->nlink) {
8148 case 0:
8149 st->st_nlink = 0; /* dir is unlinked */
8150 break;
8151 case 1:
8152 st->st_nlink = 1 /* parent dentry */
8153 + 1 /* <dir>/. */
8154 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdir */
8155 break;
8156 default:
8157 ceph_abort();
8158 }
8159 } else {
8160 st->st_nlink = in->nlink;
8161 }
8162 st->st_uid = in->uid;
8163 st->st_gid = in->gid;
8164 if (in->ctime > in->mtime) {
8165 stat_set_ctime_sec(st, in->ctime.sec());
8166 stat_set_ctime_nsec(st, in->ctime.nsec());
8167 } else {
8168 stat_set_ctime_sec(st, in->mtime.sec());
8169 stat_set_ctime_nsec(st, in->mtime.nsec());
8170 }
8171 stat_set_atime_sec(st, in->atime.sec());
8172 stat_set_atime_nsec(st, in->atime.nsec());
8173 stat_set_mtime_sec(st, in->mtime.sec());
8174 stat_set_mtime_nsec(st, in->mtime.nsec());
8175 if (in->is_dir()) {
8176 if (cct->_conf->client_dirsize_rbytes)
8177 st->st_size = in->rstat.rbytes;
8178 else
8179 st->st_size = in->dirstat.size();
8180 // The Windows "stat" structure provides just a subset of the fields that are
8181 // available on Linux.
8182 #ifndef _WIN32
8183 st->st_blocks = 1;
8184 #endif
8185 } else {
8186 st->st_size = in->size;
8187 #ifndef _WIN32
8188 st->st_blocks = (in->size + 511) >> 9;
8189 #endif
8190 }
8191 #ifndef _WIN32
8192 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8193 #endif
8194
8195 if (dirstat)
8196 *dirstat = in->dirstat;
8197 if (rstat)
8198 *rstat = in->rstat;
8199
8200 return in->caps_issued();
8201 }
8202
8203 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
8204 {
8205 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
8206 << " mode 0" << oct << in->mode << dec
8207 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
8208 memset(stx, 0, sizeof(struct ceph_statx));
8209
8210 /*
8211 * If mask is 0, the caller set AT_STATX_DONT_SYNC; set all mask bits
8212 * so we report whatever attributes are currently cached.
8213 */
8214 if (!mask)
8215 mask = ~0;
8216
8217 /* These are always considered to be available */
8218 stx->stx_dev = in->snapid;
8219 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8220
8221 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8222 stx->stx_mode = S_IFMT & in->mode;
8223 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
8224 stx->stx_rdev = in->rdev;
8225 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
8226
8227 if (mask & CEPH_CAP_AUTH_SHARED) {
8228 stx->stx_uid = in->uid;
8229 stx->stx_gid = in->gid;
8230 stx->stx_mode = in->mode;
8231 in->btime.to_timespec(&stx->stx_btime);
8232 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
8233 }
8234
8235 if (mask & CEPH_CAP_LINK_SHARED) {
8236 if (in->is_dir()) {
8237 switch (in->nlink) {
8238 case 0:
8239 stx->stx_nlink = 0; /* dir is unlinked */
8240 break;
8241 case 1:
8242 stx->stx_nlink = 1 /* parent dentry */
8243 + 1 /* <dir>/. */
8244 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdir */
8245 break;
8246 default:
8247 ceph_abort();
8248 }
8249 } else {
8250 stx->stx_nlink = in->nlink;
8251 }
8252 stx->stx_mask |= CEPH_STATX_NLINK;
8253 }
8254
8255 if (mask & CEPH_CAP_FILE_SHARED) {
8256
8257 in->atime.to_timespec(&stx->stx_atime);
8258 in->mtime.to_timespec(&stx->stx_mtime);
8259
8260 if (in->is_dir()) {
8261 if (cct->_conf->client_dirsize_rbytes)
8262 stx->stx_size = in->rstat.rbytes;
8263 else
8264 stx->stx_size = in->dirstat.size();
8265 stx->stx_blocks = 1;
8266 } else {
8267 stx->stx_size = in->size;
8268 stx->stx_blocks = (in->size + 511) >> 9;
8269 }
8270 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
8271 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
8272 }
8273
8274 /* Change time and change_attr both require all shared caps to view */
8275 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
8276 stx->stx_version = in->change_attr;
8277 if (in->ctime > in->mtime)
8278 in->ctime.to_timespec(&stx->stx_ctime);
8279 else
8280 in->mtime.to_timespec(&stx->stx_ctime);
8281 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
8282 }
8283
8284 }
8285
8286 void Client::touch_dn(Dentry *dn)
8287 {
8288 lru.lru_touch(dn);
8289 }
8290
8291 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8292 {
8293 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
8294 }
8295
8296 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8297 {
8298 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8299 if (!mref_reader.is_state_satisfied())
8300 return -CEPHFS_ENOTCONN;
8301
8302 tout(cct) << __func__ << std::endl;
8303 tout(cct) << fd << std::endl;
8304 tout(cct) << mode << std::endl;
8305
8306 std::scoped_lock lock(client_lock);
8307 Fh *f = get_filehandle(fd);
8308 if (!f)
8309 return -CEPHFS_EBADF;
8310 #if defined(__linux__) && defined(O_PATH)
8311 if (f->flags & O_PATH)
8312 return -CEPHFS_EBADF;
8313 #endif
8314 struct stat attr;
8315 attr.st_mode = mode;
8316 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8317 }
8318
8319 int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8320 const UserPerm& perms) {
8321 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8322 if (!mref_reader.is_state_satisfied()) {
8323 return -CEPHFS_ENOTCONN;
8324 }
8325
8326 tout(cct) << __func__ << std::endl;
8327 tout(cct) << dirfd << std::endl;
8328 tout(cct) << relpath << std::endl;
8329 tout(cct) << mode << std::endl;
8330 tout(cct) << flags << std::endl;
8331
8332 filepath path(relpath);
8333 InodeRef in;
8334 InodeRef dirinode;
8335
8336 std::scoped_lock lock(client_lock);
8337 int r = get_fd_inode(dirfd, &dirinode);
8338 if (r < 0) {
8339 return r;
8340 }
8341
8342 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8343 if (r < 0) {
8344 return r;
8345 }
8346 struct stat attr;
8347 attr.st_mode = mode;
8348 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8349 }
8350
8351 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8352 {
8353 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8354 }
8355
8356 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8357 const UserPerm& perms)
8358 {
8359 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
8360 }
8361
8362 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8363 {
8364 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8365 if (!mref_reader.is_state_satisfied())
8366 return -CEPHFS_ENOTCONN;
8367
8368 tout(cct) << __func__ << std::endl;
8369 tout(cct) << fd << std::endl;
8370 tout(cct) << new_uid << std::endl;
8371 tout(cct) << new_gid << std::endl;
8372
8373 std::scoped_lock lock(client_lock);
8374 Fh *f = get_filehandle(fd);
8375 if (!f)
8376 return -CEPHFS_EBADF;
8377 #if defined(__linux__) && defined(O_PATH)
8378 if (f->flags & O_PATH)
8379 return -CEPHFS_EBADF;
8380 #endif
8381 struct stat attr;
8382 attr.st_uid = new_uid;
8383 attr.st_gid = new_gid;
8384 int mask = 0;
8385 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8386 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8387 return _setattr(f->inode, &attr, mask, perms);
8388 }
8389
8390 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8391 const UserPerm& perms)
8392 {
8393 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8394 }
8395
8396 int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8397 int flags, const UserPerm& perms) {
8398 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8399 if (!mref_reader.is_state_satisfied()) {
8400 return -CEPHFS_ENOTCONN;
8401 }
8402
8403 tout(cct) << __func__ << std::endl;
8404 tout(cct) << dirfd << std::endl;
8405 tout(cct) << relpath << std::endl;
8406 tout(cct) << new_uid << std::endl;
8407 tout(cct) << new_gid << std::endl;
8408 tout(cct) << flags << std::endl;
8409
8410 filepath path(relpath);
8411 InodeRef in;
8412 InodeRef dirinode;
8413
8414 std::scoped_lock lock(client_lock);
8415 int r = get_fd_inode(dirfd, &dirinode);
8416 if (r < 0) {
8417 return r;
8418 }
8419
8420 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8421 if (r < 0) {
8422 return r;
8423 }
8424 struct stat attr;
8425 attr.st_uid = new_uid;
8426 attr.st_gid = new_gid;
8427 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8428 }
8429
8430 static void attr_set_atime_and_mtime(struct stat *attr,
8431 const utime_t &atime,
8432 const utime_t &mtime)
8433 {
8434 stat_set_atime_sec(attr, atime.tv.tv_sec);
8435 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8436 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8437 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8438 }
8439
8440 // for [l]utime(), invoke the timeval variants, as the timespec
8441 // variants are not yet implemented. for futime[s](), invoke
8442 // the timespec variant.
8443 int Client::utime(const char *relpath, struct utimbuf *buf,
8444 const UserPerm& perms)
8445 {
8446 struct timeval tv[2];
8447 tv[0].tv_sec = buf->actime;
8448 tv[0].tv_usec = 0;
8449 tv[1].tv_sec = buf->modtime;
8450 tv[1].tv_usec = 0;
8451
8452 return utimes(relpath, tv, perms);
8453 }
8454
8455 int Client::lutime(const char *relpath, struct utimbuf *buf,
8456 const UserPerm& perms)
8457 {
8458 struct timeval tv[2];
8459 tv[0].tv_sec = buf->actime;
8460 tv[0].tv_usec = 0;
8461 tv[1].tv_sec = buf->modtime;
8462 tv[1].tv_usec = 0;
8463
8464 return lutimes(relpath, tv, perms);
8465 }
8466
8467 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8468 {
8469 struct timespec ts[2];
8470 ts[0].tv_sec = buf->actime;
8471 ts[0].tv_nsec = 0;
8472 ts[1].tv_sec = buf->modtime;
8473 ts[1].tv_nsec = 0;
8474
8475 return futimens(fd, ts, perms);
8476 }
8477
8478 int Client::utimes(const char *relpath, struct timeval times[2],
8479 const UserPerm& perms)
8480 {
8481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8482 if (!mref_reader.is_state_satisfied())
8483 return -CEPHFS_ENOTCONN;
8484
8485 tout(cct) << __func__ << std::endl;
8486 tout(cct) << relpath << std::endl;
8487 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8488 << std::endl;
8489 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8490 << std::endl;
8491
8492 filepath path(relpath);
8493 InodeRef in;
8494
8495 std::scoped_lock lock(client_lock);
8496 int r = path_walk(path, &in, perms);
8497 if (r < 0)
8498 return r;
8499 struct stat attr;
8500 utime_t atime(times[0]);
8501 utime_t mtime(times[1]);
8502
8503 attr_set_atime_and_mtime(&attr, atime, mtime);
8504 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8505 }
8506
8507 int Client::lutimes(const char *relpath, struct timeval times[2],
8508 const UserPerm& perms)
8509 {
8510 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8511 if (!mref_reader.is_state_satisfied())
8512 return -CEPHFS_ENOTCONN;
8513
8514 tout(cct) << __func__ << std::endl;
8515 tout(cct) << relpath << std::endl;
8516 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8517 << std::endl;
8518 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8519 << std::endl;
8520
8521 filepath path(relpath);
8522 InodeRef in;
8523
8524 std::scoped_lock lock(client_lock);
8525 int r = path_walk(path, &in, perms, false);
8526 if (r < 0)
8527 return r;
8528 struct stat attr;
8529 utime_t atime(times[0]);
8530 utime_t mtime(times[1]);
8531
8532 attr_set_atime_and_mtime(&attr, atime, mtime);
8533 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8534 }
8535
8536 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8537 {
8538 struct timespec ts[2];
8539 ts[0].tv_sec = times[0].tv_sec;
8540 ts[0].tv_nsec = times[0].tv_usec * 1000;
8541 ts[1].tv_sec = times[1].tv_sec;
8542 ts[1].tv_nsec = times[1].tv_usec * 1000;
8543
8544 return futimens(fd, ts, perms);
8545 }
8546
8547 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8548 {
8549 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8550 if (!mref_reader.is_state_satisfied())
8551 return -CEPHFS_ENOTCONN;
8552
8553 tout(cct) << __func__ << std::endl;
8554 tout(cct) << fd << std::endl;
8555 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8556 << std::endl;
8557 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8558 << std::endl;
8559
8560 std::scoped_lock lock(client_lock);
8561 Fh *f = get_filehandle(fd);
8562 if (!f)
8563 return -CEPHFS_EBADF;
8564 #if defined(__linux__) && defined(O_PATH)
8565 if (f->flags & O_PATH)
8566 return -CEPHFS_EBADF;
8567 #endif
8568 struct stat attr;
8569 utime_t atime(times[0]);
8570 utime_t mtime(times[1]);
8571
8572 attr_set_atime_and_mtime(&attr, atime, mtime);
8573 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8574 }
8575
8576 int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
8577 const UserPerm& perms) {
8578 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8579 if (!mref_reader.is_state_satisfied()) {
8580 return -CEPHFS_ENOTCONN;
8581 }
8582
8583 tout(cct) << __func__ << std::endl;
8584 tout(cct) << dirfd << std::endl;
8585 tout(cct) << relpath << std::endl;
8586 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8587 << std::endl;
8588 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8589 << std::endl;
8590 tout(cct) << flags << std::endl;
8591
8592 filepath path(relpath);
8593 InodeRef in;
8594 InodeRef dirinode;
8595
8596 std::scoped_lock lock(client_lock);
8597 int r = get_fd_inode(dirfd, &dirinode);
8598 if (r < 0) {
8599 return r;
8600 }
8601
8602 #if defined(__linux__) && defined(O_PATH)
8603 if (flags & O_PATH) {
8604 return -CEPHFS_EBADF;
8605 }
8606 #endif
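// NB: `flags` carries AT_* values here, so testing the O_PATH open(2) bit
// against it is suspect; O_PATH file handles are already rejected per-fd
// in futimens() above.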
8607
8608 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8609 if (r < 0) {
8610 return r;
8611 }
8612 struct stat attr;
8613 utime_t atime(times[0]);
8614 utime_t mtime(times[1]);
8615
8616 attr_set_atime_and_mtime(&attr, atime, mtime);
8617 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8618 }
8619
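// Note: unlike flock(2), the caller supplies an `owner` token identifying
// the lock holder, letting a single mount multiplex locks for many logical
// owners (as NFS or SMB gateways do).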
8620 int Client::flock(int fd, int operation, uint64_t owner)
8621 {
8622 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8623 if (!mref_reader.is_state_satisfied())
8624 return -CEPHFS_ENOTCONN;
8625
8626 tout(cct) << __func__ << std::endl;
8627 tout(cct) << fd << std::endl;
8628 tout(cct) << operation << std::endl;
8629 tout(cct) << owner << std::endl;
8630
8631 std::scoped_lock lock(client_lock);
8632 Fh *f = get_filehandle(fd);
8633 if (!f)
8634 return -CEPHFS_EBADF;
8635
8636 return _flock(f, operation, owner);
8637 }
8638
8639 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8640 {
8641 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8642 if (!mref_reader.is_state_satisfied())
8643 return -CEPHFS_ENOTCONN;
8644
8645 tout(cct) << __func__ << std::endl;
8646 tout(cct) << relpath << std::endl;
8647
8648 filepath path(relpath);
8649 InodeRef in;
8650
8651 std::scoped_lock lock(client_lock);
8652 int r = path_walk(path, &in, perms, true);
8653 if (r < 0)
8654 return r;
8655 if (cct->_conf->client_permissions) {
8656 int r = may_open(in.get(), O_RDONLY, perms);
8657 if (r < 0)
8658 return r;
8659 }
8660 r = _opendir(in.get(), dirpp, perms);
8661 /* on ENOTDIR, *dirpp is left as an uninitialized pointer and must not be dereferenced */
8662 if (r != -CEPHFS_ENOTDIR)
8663 tout(cct) << (uintptr_t)*dirpp << std::endl;
8664 return r;
8665 }
8666
8667 int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8668 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8669 if (!mref_reader.is_state_satisfied()) {
8670 return -CEPHFS_ENOTCONN;
8671 }
8672
8673 tout(cct) << __func__ << std::endl;
8674 tout(cct) << dirfd << std::endl;
8675
8676 InodeRef dirinode;
8677 std::scoped_lock locker(client_lock);
8678 int r = get_fd_inode(dirfd, &dirinode);
8679 if (r < 0) {
8680 return r;
8681 }
8682
8683 if (cct->_conf->client_permissions) {
8684 r = may_open(dirinode.get(), O_RDONLY, perms);
8685 if (r < 0) {
8686 return r;
8687 }
8688 }
8689 r = _opendir(dirinode.get(), dirpp, perms);
8690 /* on ENOTDIR, *dirpp is left as an uninitialized pointer and must not be dereferenced */
8691 if (r != -CEPHFS_ENOTDIR) {
8692 tout(cct) << (uintptr_t)*dirpp << std::endl;
8693 }
8694 return r;
8695 }
8696
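// _opendir() allocates client-side state only (no MDS round trip); the
// dir_result_t is tracked in opened_dirs and must be released with
// _closedir().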
8697 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8698 {
8699 if (!in->is_dir())
8700 return -CEPHFS_ENOTDIR;
8701 *dirpp = new dir_result_t(in, perms);
8702 opened_dirs.insert(*dirpp);
8703 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8704 return 0;
8705 }
8706
8707
8708 int Client::closedir(dir_result_t *dir)
8709 {
8710 tout(cct) << __func__ << std::endl;
8711 tout(cct) << (uintptr_t)dir << std::endl;
8712
8713 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8714 std::scoped_lock lock(client_lock);
8715 _closedir(dir);
8716 return 0;
8717 }
8718
8719 void Client::_closedir(dir_result_t *dirp)
8720 {
8721 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8722
8723 if (dirp->inode) {
8724 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8725 dirp->inode.reset();
8726 }
8727 _readdir_drop_dirp_buffer(dirp);
8728 opened_dirs.erase(dirp);
8729 delete dirp;
8730 }
8731
8732 void Client::rewinddir(dir_result_t *dirp)
8733 {
8734 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8735
8736 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8737 if (!mref_reader.is_state_satisfied())
8738 return;
8739
8740 std::scoped_lock lock(client_lock);
8741 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8742 _readdir_drop_dirp_buffer(d);
8743 d->reset();
8744 }
8745
8746 loff_t Client::telldir(dir_result_t *dirp)
8747 {
8748 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8749 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8750 return d->offset;
8751 }
8752
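/*
 * A dir_result_t offset packs the dirfrag (or hash position) into its high
 * bits and the position within that frag into its low bits (see
 * make_fpos()/fpos_high()/fpos_low()), so a backward seek, or any seek that
 * leaves the buffered frag, must drop the readdir buffer and restart that
 * frag.
 */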
8753 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8754 {
8755 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8756
8757 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8758 if (!mref_reader.is_state_satisfied())
8759 return;
8760
8761 std::scoped_lock lock(client_lock);
8762
8763 if (offset == dirp->offset)
8764 return;
8765
8766 if (offset > dirp->offset)
8767 dirp->release_count = 0; // forward seek: invalidate the release count so the dir is not marked complete
8768 else
8769 dirp->ordered_count = 0; // disable filling readdir cache
8770
8771 if (dirp->hash_order()) {
8772 if (dirp->offset > offset) {
8773 _readdir_drop_dirp_buffer(dirp);
8774 dirp->reset();
8775 }
8776 } else {
8777 if (offset == 0 ||
8778 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8779 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8780 _readdir_drop_dirp_buffer(dirp);
8781 dirp->reset();
8782 }
8783 }
8784
8785 dirp->offset = offset;
8786 }
8787
8788
8789 //struct dirent {
8790 // ino_t d_ino; /* inode number */
8791 // off_t d_off; /* offset to the next dirent */
8792 // unsigned short d_reclen; /* length of this record */
8793 // unsigned char d_type; /* type of file */
8794 // char d_name[256]; /* filename */
8795 //};
8796 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8797 {
8798 strncpy(de->d_name, name, 255);
8799 de->d_name[255] = '\0';
8800 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8801 de->d_ino = ino;
8802 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8803 de->d_off = next_off;
8804 #endif
8805 de->d_reclen = 1;
8806 de->d_type = IFTODT(type);
8807 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8808 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8809 #endif
8810 }
8811
8812 void Client::_readdir_next_frag(dir_result_t *dirp)
8813 {
8814 frag_t fg = dirp->buffer_frag;
8815
8816 if (fg.is_rightmost()) {
8817 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8818 dirp->set_end();
8819 return;
8820 }
8821
8822 // advance
8823 fg = fg.next();
8824 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8825
8826 if (dirp->hash_order()) {
8827 // keep last_name
8828 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8829 if (dirp->offset < new_offset) // don't decrease offset
8830 dirp->offset = new_offset;
8831 } else {
8832 dirp->last_name.clear();
8833 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8834 _readdir_rechoose_frag(dirp);
8835 }
8836 }
8837
8838 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8839 {
8840 ceph_assert(dirp->inode);
8841
8842 if (dirp->hash_order())
8843 return;
8844
8845 frag_t cur = frag_t(dirp->offset_high());
8846 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8847 if (fg != cur) {
8848 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8849 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8850 dirp->last_name.clear();
8851 dirp->next_offset = 2;
8852 }
8853 }
8854
8855 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8856 {
8857 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8858 dirp->buffer.clear();
8859 }
8860
8861 int Client::_readdir_get_frag(dir_result_t *dirp)
8862 {
8863 ceph_assert(dirp);
8864 ceph_assert(dirp->inode);
8865
8866 // get the current frag.
8867 frag_t fg;
8868 if (dirp->hash_order())
8869 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8870 else
8871 fg = frag_t(dirp->offset_high());
8872
8873 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8874 << " offset " << hex << dirp->offset << dec << dendl;
8875
8876 int op = CEPH_MDS_OP_READDIR;
8877 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8878 op = CEPH_MDS_OP_LSSNAP;
8879
8880 InodeRef& diri = dirp->inode;
8881
8882 MetaRequest *req = new MetaRequest(op);
8883 filepath path;
8884 diri->make_nosnap_relative_path(path);
8885 req->set_filepath(path);
8886 req->set_inode(diri.get());
8887 req->head.args.readdir.frag = fg;
8888 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8889 if (dirp->last_name.length()) {
8890 req->path2.set_path(dirp->last_name);
8891 } else if (dirp->hash_order()) {
8892 req->head.args.readdir.offset_hash = dirp->offset_high();
8893 }
8894 req->dirp = dirp;
8895
8896 bufferlist dirbl;
8897 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8898
8899 if (res == -CEPHFS_EAGAIN) {
8900 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8901 _readdir_rechoose_frag(dirp);
8902 return _readdir_get_frag(dirp);
8903 }
8904
8905 if (res == 0) {
8906 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8907 << " size " << dirp->buffer.size() << dendl;
8908 } else {
8909 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8910 dirp->set_end();
8911 }
8912
8913 return res;
8914 }
8915
8916 struct dentry_off_lt {
8917 bool operator()(const Dentry* dn, int64_t off) const {
8918 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8919 }
8920 };
8921
8922 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8923 int caps, bool getref)
8924 {
8925 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8926 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8927 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8928 << dendl;
8929 Dir *dir = dirp->inode->dir;
8930
8931 if (!dir) {
8932 ldout(cct, 10) << " dir is empty" << dendl;
8933 dirp->set_end();
8934 return 0;
8935 }
8936
8937 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8938 dir->readdir_cache.end(),
8939 dirp->offset, dentry_off_lt());
8940
8941 string dn_name;
8942 while (true) {
8943 int mask = caps;
8944 if (!dirp->inode->is_complete_and_ordered())
8945 return -CEPHFS_EAGAIN;
8946 if (pd == dir->readdir_cache.end())
8947 break;
8948 Dentry *dn = *pd;
8949 if (dn->inode == NULL) {
8950 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8951 ++pd;
8952 continue;
8953 }
8954 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8955 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8956 ++pd;
8957 continue;
8958 }
8959
8960 int idx = pd - dir->readdir_cache.begin();
8961 if (dn->inode->is_dir()) {
8962 mask |= CEPH_STAT_RSTAT;
8963 }
8964 int r = _getattr(dn->inode, mask, dirp->perms);
8965 if (r < 0)
8966 return r;
8967
8968 // the contents of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8969 pd = dir->readdir_cache.begin() + idx;
8970 if (pd >= dir->readdir_cache.end() || *pd != dn)
8971 return -CEPHFS_EAGAIN;
8972
8973 struct ceph_statx stx;
8974 struct dirent de;
8975 fill_statx(dn->inode, caps, &stx);
8976
8977 uint64_t next_off = dn->offset + 1;
8978 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8979 ++pd;
8980 if (pd == dir->readdir_cache.end())
8981 next_off = dir_result_t::END;
8982
8983 Inode *in = NULL;
8984 if (getref) {
8985 in = dn->inode.get();
8986 _ll_get(in);
8987 }
8988
8989 dn_name = dn->name; // fill in name while we have lock
8990
8991 client_lock.unlock();
8992 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8993 client_lock.lock();
8994 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8995 << " = " << r << dendl;
8996 if (r < 0) {
8997 return r;
8998 }
8999
9000 dirp->offset = next_off;
9001 if (dirp->at_end())
9002 dirp->next_offset = 2;
9003 else
9004 dirp->next_offset = dirp->offset_low();
9005 dirp->last_name = dn_name; // we successfully returned this one; update!
9006 dirp->release_count = 0; // last_name no longer matches the cache index
9007 if (r > 0)
9008 return r;
9009 }
9010
9011 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
9012 dirp->set_end();
9013 return 0;
9014 }
9015
9016 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
9017 unsigned want, unsigned flags, bool getref)
9018 {
9019 int caps = statx_to_mask(flags, want);
9020
9021 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9022 if (!mref_reader.is_state_satisfied())
9023 return -CEPHFS_ENOTCONN;
9024
9025 std::unique_lock cl(client_lock);
9026
9027 dir_result_t *dirp = static_cast<dir_result_t*>(d);
9028
9029 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
9030 << dec << " at_end=" << dirp->at_end()
9031 << " hash_order=" << dirp->hash_order() << dendl;
9032
9033 struct dirent de;
9034 struct ceph_statx stx;
9035 memset(&de, 0, sizeof(de));
9036 memset(&stx, 0, sizeof(stx));
9037
9038 InodeRef& diri = dirp->inode;
9039
9040 if (dirp->at_end())
9041 return 0;
9042
9043 if (dirp->offset == 0) {
9044 ldout(cct, 15) << " including ." << dendl;
9045 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
9046 uint64_t next_off = 1;
9047
9048 int r;
9049 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
9050 if (r < 0)
9051 return r;
9052
9053 fill_statx(diri, caps, &stx);
9054 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
9055
9056 Inode *inode = NULL;
9057 if (getref) {
9058 inode = diri.get();
9059 _ll_get(inode);
9060 }
9061
9062 cl.unlock();
9063 r = cb(p, &de, &stx, next_off, inode);
9064 cl.lock();
9065 if (r < 0)
9066 return r;
9067
9068 dirp->offset = next_off;
9069 if (r > 0)
9070 return r;
9071 }
9072 if (dirp->offset == 1) {
9073 ldout(cct, 15) << " including .." << dendl;
9074 uint64_t next_off = 2;
9075 InodeRef in;
9076 if (diri->dentries.empty())
9077 in = diri;
9078 else
9079 in = diri->get_first_parent()->dir->parent_inode;
9080
9081 int r;
9082 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
9083 if (r < 0)
9084 return r;
9085
9086 fill_statx(in, caps, &stx);
9087 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
9088
9089 Inode *inode = NULL;
9090 if (getref) {
9091 inode = in.get();
9092 _ll_get(inode);
9093 }
9094
9095 cl.unlock();
9096 r = cb(p, &de, &stx, next_off, inode);
9097 cl.lock();
9098 if (r < 0)
9099 return r;
9100
9101 dirp->offset = next_off;
9102 if (r > 0)
9103 return r;
9104 }
9105
9106 // can we read from our cache?
9107 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
9108 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
9109 << dirp->inode->is_complete_and_ordered()
9110 << " issued " << ccap_string(dirp->inode->caps_issued())
9111 << dendl;
9112 if (dirp->inode->snapid != CEPH_SNAPDIR &&
9113 dirp->inode->is_complete_and_ordered() &&
9114 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
9115 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
9116 if (err != -CEPHFS_EAGAIN)
9117 return err;
9118 }
9119
9120 while (1) {
9121 if (dirp->at_end())
9122 return 0;
9123
9124 bool check_caps = true;
9125 if (!dirp->is_cached()) {
9126 int r = _readdir_get_frag(dirp);
9127 if (r)
9128 return r;
9129 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
9130 // different from the requested one (our dirfragtree was outdated)
9131 check_caps = false;
9132 }
9133 frag_t fg = dirp->buffer_frag;
9134
9135 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
9136 << " offset " << hex << dirp->offset << dendl;
9137
9138 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
9139 dirp->offset, dir_result_t::dentry_off_lt());
9140 it != dirp->buffer.end();
9141 ++it) {
9142 dir_result_t::dentry &entry = *it;
9143
9144 uint64_t next_off = entry.offset + 1;
9145
9146 int r;
9147 if (check_caps) {
9148 int mask = caps;
9149 if (entry.inode->is_dir()) {
9150 mask |= CEPH_STAT_RSTAT;
9151 }
9152 r = _getattr(entry.inode, mask, dirp->perms);
9153 if (r < 0)
9154 return r;
9155 }
9156
9157 fill_statx(entry.inode, caps, &stx);
9158 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
9159
9160 Inode *inode = NULL;
9161 if (getref) {
9162 inode = entry.inode.get();
9163 _ll_get(inode);
9164 }
9165
9166 cl.unlock();
9167 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
9168 cl.lock();
9169
9170 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
9171 << " = " << r << dendl;
9172 if (r < 0)
9173 return r;
9174
9175 dirp->offset = next_off;
9176 if (r > 0)
9177 return r;
9178 }
9179
9180 if (dirp->next_offset > 2) {
9181 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
9182 _readdir_drop_dirp_buffer(dirp);
9183 continue; // more!
9184 }
9185
9186 if (!fg.is_rightmost()) {
9187 // next frag!
9188 _readdir_next_frag(dirp);
9189 continue;
9190 }
9191
9192 if (diri->shared_gen == dirp->start_shared_gen &&
9193 diri->dir_release_count == dirp->release_count) {
9194 if (diri->dir_ordered_count == dirp->ordered_count) {
9195 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
9196 if (diri->dir) {
9197 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
9198 diri->dir->readdir_cache.resize(dirp->cache_index);
9199 }
9200 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
9201 } else {
9202 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
9203 diri->flags |= I_COMPLETE;
9204 }
9205 }
9206
9207 dirp->set_end();
9208 return 0;
9209 }
9210 ceph_abort();
9211 return 0;
9212 }
9213
9214
9215 int Client::readdir_r(dir_result_t *d, struct dirent *de)
9216 {
9217 return readdirplus_r(d, de, 0, 0, 0, NULL);
9218 }
9219
9220 /*
9221 * readdirplus_r
9222 *
9223 * returns
9224 * 1 if we got a dirent
9225 * 0 for end of directory
9226 * <0 on error
9227 */
9228
9229 struct single_readdir {
9230 struct dirent *de;
9231 struct ceph_statx *stx;
9232 Inode *inode;
9233 bool full;
9234 };
9235
9236 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9237 struct ceph_statx *stx, off_t off,
9238 Inode *in)
9239 {
9240 single_readdir *c = static_cast<single_readdir *>(p);
9241
9242 if (c->full)
9243 return -1; // already filled this dirent
9244
9245 *c->de = *de;
9246 if (c->stx)
9247 *c->stx = *stx;
9248 c->inode = in;
9249 c->full = true;
9250 return 1;
9251 }
9252
9253 struct dirent *Client::readdir(dir_result_t *d)
9254 {
9255 int ret;
9256 auto& de = d->de;
9257 single_readdir sr;
9258 sr.de = &de;
9259 sr.stx = NULL;
9260 sr.inode = NULL;
9261 sr.full = false;
9262
9263 // our callback fills the dirent and sets sr.full=true on first
9264 // call, and returns -1 the second time around.
9265 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9266 if (ret < -1) {
9267 errno = -ret; // this sucks.
9268 return (dirent *) NULL;
9269 }
9270 if (sr.full) {
9271 return &de;
9272 }
9273 return (dirent *) NULL;
9274 }
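
// A minimal usage sketch (not part of the original source): it assumes a
// mounted Client* client, a UserPerm perms, and an existing directory; the
// path "/mydir" is illustrative only.
//
//   dir_result_t *dirp;
//   if (client->opendir("/mydir", &dirp, perms) == 0) {
//     errno = 0;
//     while (struct dirent *de = client->readdir(dirp))
//       printf("%s\n", de->d_name);
//     // NULL with errno != 0 signals an error; NULL with errno == 0 is EOF
//     client->closedir(dirp);
//   }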
9275
9276 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9277 struct ceph_statx *stx, unsigned want,
9278 unsigned flags, Inode **out)
9279 {
9280 single_readdir sr;
9281 sr.de = de;
9282 sr.stx = stx;
9283 sr.inode = NULL;
9284 sr.full = false;
9285
9286 // our callback fills the dirent and sets sr.full=true on first
9287 // call, and returns -1 the second time around.
9288 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9289 if (r < -1)
9290 return r;
9291 if (out)
9292 *out = sr.inode;
9293 if (sr.full)
9294 return 1;
9295 return 0;
9296 }
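
// Sketch of the tri-state contract above (client/dirp/perms as in the
// readdir() sketch; CEPH_STATX_INO and the process()/handle_error() helpers
// are illustrative):
//
//   struct dirent de;
//   struct ceph_statx stx;
//   int r;
//   while ((r = client->readdirplus_r(dirp, &de, &stx, CEPH_STATX_INO, 0, nullptr)) == 1)
//     process(&de, &stx);   // got one dirent plus its statx
//   if (r < 0)
//     handle_error(r);      // r == 0 simply means end of directory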
9297
9298
9299 /* getdents */
9300 struct getdents_result {
9301 char *buf;
9302 int buflen;
9303 int pos;
9304 bool fullent;
9305 };
9306
9307 static int _readdir_getdent_cb(void *p, struct dirent *de,
9308 struct ceph_statx *stx, off_t off, Inode *in)
9309 {
9310 struct getdents_result *c = static_cast<getdents_result *>(p);
9311
9312 int dlen;
9313 if (c->fullent)
9314 dlen = sizeof(*de);
9315 else
9316 dlen = strlen(de->d_name) + 1;
9317
9318 if (c->pos + dlen > c->buflen)
9319 return -1; // doesn't fit
9320
9321 if (c->fullent) {
9322 memcpy(c->buf + c->pos, de, sizeof(*de));
9323 } else {
9324 memcpy(c->buf + c->pos, de->d_name, dlen);
9325 }
9326 c->pos += dlen;
9327 return 0;
9328 }
9329
9330 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9331 {
9332 getdents_result gr;
9333 gr.buf = buf;
9334 gr.buflen = buflen;
9335 gr.fullent = fullent;
9336 gr.pos = 0;
9337
9338 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9339
9340 if (r < 0) { // some error
9341 if (r == -1) { // buffer ran out of space
9342 if (gr.pos) { // but we got some entries already!
9343 return gr.pos;
9344 } // or we need a larger buffer
9345 return -CEPHFS_ERANGE;
9346 } else { // actual error, return it
9347 return r;
9348 }
9349 }
9350 return gr.pos;
9351 }
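
// Note on the -1 convention above: -CEPHFS_ERANGE is returned only when the
// very first entry did not fit in `buf`, so a caller should retry with a
// larger buffer in that case; a partially filled buffer is still reported as
// a positive byte count.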
9352
9353
9354 /* getdir */
9355 struct getdir_result {
9356 list<string> *contents;
9357 int num;
9358 };
9359
9360 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9361 {
9362 getdir_result *r = static_cast<getdir_result *>(p);
9363
9364 r->contents->push_back(de->d_name);
9365 r->num++;
9366 return 0;
9367 }
9368
9369 int Client::getdir(const char *relpath, list<string>& contents,
9370 const UserPerm& perms)
9371 {
9372 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9373 tout(cct) << "getdir" << std::endl;
9374 tout(cct) << relpath << std::endl;
9375
9376 dir_result_t *d;
9377 int r = opendir(relpath, &d, perms);
9378 if (r < 0)
9379 return r;
9380
9381 getdir_result gr;
9382 gr.contents = &contents;
9383 gr.num = 0;
9384 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9385
9386 closedir(d);
9387
9388 if (r < 0)
9389 return r;
9390 return gr.num;
9391 }
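
// Usage sketch (assumes a mounted Client* client and UserPerm perms; the path
// is illustrative). Note the returned count includes the "." and ".."
// entries emitted by readdir_r_cb():
//
//   std::list<std::string> names;
//   int n = client->getdir("/mydir", names, perms);
//   if (n >= 0)
//     ceph_assert((size_t)n == names.size());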
9392
9393
9394 /****** file i/o **********/
9395
9396 // common parts for open and openat. call with client_lock locked.
9397 int Client::create_and_open(int dirfd, const char *relpath, int flags,
9398 const UserPerm& perms, mode_t mode, int stripe_unit,
9399 int stripe_count, int object_size, const char *data_pool,
9400 std::string alternate_name) {
9401 ceph_assert(ceph_mutex_is_locked(client_lock));
9402 int cflags = ceph_flags_sys2wire(flags);
9403 tout(cct) << cflags << std::endl;
9404
9405 Fh *fh = NULL;
9406
9407 #if defined(__linux__) && defined(O_PATH)
9408 /* When O_PATH is specified, flags other than O_DIRECTORY and
9409 * O_NOFOLLOW are ignored. Please refer to the do_entry_open()
9410 * function in the kernel (fs/open.c). */
9411 if (flags & O_PATH)
9412 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9413 #endif
9414
9415 filepath path(relpath);
9416 InodeRef in;
9417 bool created = false;
9418 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9419 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9420 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9421
9422 InodeRef dirinode = nullptr;
9423 int r = get_fd_inode(dirfd, &dirinode);
9424 if (r < 0) {
9425 return r;
9426 }
9427
9428 r = path_walk(path, &in, perms, followsym, mask, dirinode);
9429 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9430 return -CEPHFS_EEXIST;
9431
9432 #if defined(__linux__) && defined(O_PATH)
9433 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9434 #else
9435 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9436 #endif
9437 return -CEPHFS_ELOOP;
9438
9439 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9440 filepath dirpath = path;
9441 string dname = dirpath.last_dentry();
9442 dirpath.pop_dentry();
9443 InodeRef dir;
9444 r = path_walk(dirpath, &dir, perms, true,
9445 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9446 if (r < 0) {
9447 goto out;
9448 }
9449 if (cct->_conf->client_permissions) {
9450 r = may_create(dir.get(), perms);
9451 if (r < 0)
9452 goto out;
9453 }
9454 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9455 stripe_count, object_size, data_pool, &created, perms,
9456 std::move(alternate_name));
9457 }
9458 if (r < 0)
9459 goto out;
9460
9461 if (!created) {
9462 // posix says we can only check permissions of existing files
9463 if (cct->_conf->client_permissions) {
9464 r = may_open(in.get(), flags, perms);
9465 if (r < 0)
9466 goto out;
9467 }
9468 }
9469
9470 if (!fh)
9471 r = _open(in.get(), flags, mode, &fh, perms);
9472 if (r >= 0) {
9473 // allocate an integer file descriptor
9474 ceph_assert(fh);
9475 r = get_fd();
9476 ceph_assert(fd_map.count(r) == 0);
9477 fd_map[r] = fh;
9478 }
9479
9480 out:
9481 return r;
9482 }
9483
9484 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9485 mode_t mode, int stripe_unit, int stripe_count,
9486 int object_size, const char *data_pool, std::string alternate_name)
9487 {
9488 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9489 stripe_count, object_size, data_pool, alternate_name);
9490 }
9491
9492 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9493 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9494 const char *data_pool, std::string alternate_name) {
9495 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9496 if (!mref_reader.is_state_satisfied()) {
9497 return -CEPHFS_ENOTCONN;
9498 }
9499
9500 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9501 tout(cct) << dirfd << std::endl;
9502 tout(cct) << relpath << std::endl;
9503 tout(cct) << flags << std::endl;
9504 tout(cct) << mode << std::endl;
9505
9506 std::scoped_lock locker(client_lock);
9507 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9508 object_size, data_pool, alternate_name);
9509
9510 tout(cct) << r << std::endl;
9511 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9512 return r;
9513 }
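
// Open/close sketch (assumes a mounted Client* client and UserPerm perms; the
// zeros, nullptr and empty string below just take the layout defaults, and
// the path is illustrative):
//
//   int fd = client->openat(CEPHFS_AT_FDCWD, "dir/file", O_CREAT | O_WRONLY,
//                           perms, 0644, 0, 0, 0, nullptr, "");
//   if (fd >= 0) {
//     ...               // read()/write() on fd
//     client->close(fd);
//   }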
9514
9515 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9516 const UserPerm& perms)
9517 {
9518 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9519
9520 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9521 if (!mref_reader.is_state_satisfied())
9522 return -CEPHFS_ENOTCONN;
9523
9524 std::scoped_lock lock(client_lock);
9525 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9526 filepath path(ino);
9527 req->set_filepath(path);
9528
9529 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9530 char f[30];
9531 sprintf(f, "%u", h);
9532 filepath path2(dirino);
9533 path2.push_dentry(string(f));
9534 req->set_filepath2(path2);
9535
9536 int r = make_request(req, perms, NULL, NULL,
9537 rand() % mdsmap->get_num_in_mds());
9538 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9539 return r;
9540 }
9541
9542
9543 /**
9544 * Load inode into local cache.
9545 *
9546 * If the inode pointer is non-NULL, also take a reference on
9547 * the resulting Inode object in the same operation, so that the
9548 * caller can safely assume the inode will still be there after return.
9549 */
9550 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9551 {
9552 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9553
9554 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9555 if (!mref_reader.is_state_satisfied())
9556 return -CEPHFS_ENOTCONN;
9557
9558 if (is_reserved_vino(vino))
9559 return -CEPHFS_ESTALE;
9560
9561 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9562 filepath path(vino.ino);
9563 req->set_filepath(path);
9564
9565 /*
9566 * The MDS expects either a "real" snapid here or 0. The special value
9567 * carveouts for the snapid are all at the end of the range so we can
9568 * just look for any snapid below this value.
9569 */
9570 if (vino.snapid < CEPH_NOSNAP)
9571 req->head.args.lookupino.snapid = vino.snapid;
9572
9573 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9574 if (r == 0 && inode != NULL) {
9575 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9576 ceph_assert(p != inode_map.end());
9577 *inode = p->second;
9578 _ll_get(*inode);
9579 }
9580 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9581 return r;
9582 }
9583
9584 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9585 {
9586 vinodeno_t vino(ino, CEPH_NOSNAP);
9587 std::scoped_lock lock(client_lock);
9588 return _lookup_vino(vino, perms, inode);
9589 }
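
// Sketch of the reference contract (the release call shown is assumed to be
// the ll_put()-style counterpart of the _ll_get() taken above; the inode
// number is illustrative):
//
//   Inode *in = nullptr;
//   if (client->lookup_ino(inodeno_t(0x10000000000ull), perms, &in) == 0) {
//     ...                  // use in; lookup_ino took a reference for us
//     client->ll_put(in);  // assumed matching release
//   }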
9590
9591 /**
9592 * Find the parent inode of `ino` and insert it into
9593 * our cache. Conditionally also set `parent` to a referenced
9594 * Inode* if the caller provides a non-NULL value.
9595 */
9596 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9597 {
9598 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9599
9600 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9601 filepath path(ino->ino);
9602 req->set_filepath(path);
9603
9604 InodeRef target;
9605 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9606 // Give caller a reference to the parent ino if they provided a pointer.
9607 if (parent != NULL) {
9608 if (r == 0) {
9609 *parent = target.get();
9610 _ll_get(*parent);
9611 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9612 } else {
9613 *parent = NULL;
9614 }
9615 }
9616 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9617 return r;
9618 }
9619
9620 /**
9621 * Populate the parent dentry for `ino`, provided it is
9622 * a child of `parent`.
9623 */
9624 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9625 {
9626 ceph_assert(parent->is_dir());
9627 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9628
9629 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9630 if (!mref_reader.is_state_satisfied())
9631 return -CEPHFS_ENOTCONN;
9632
9633 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9634 req->set_filepath2(filepath(parent->ino));
9635 req->set_filepath(filepath(ino->ino));
9636 req->set_inode(ino);
9637
9638 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9639 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9640 return r;
9641 }
9642
9643 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9644 {
9645 std::scoped_lock lock(client_lock);
9646 return _lookup_name(ino, parent, perms);
9647 }
9648
9649 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9650 {
9651 ceph_assert(in);
9652 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9653
9654 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9655
9656 if (in->snapid != CEPH_NOSNAP) {
9657 in->snap_cap_refs++;
9658 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9659 << ccap_string(in->caps_issued()) << dendl;
9660 }
9661
9662 const auto& conf = cct->_conf;
9663 f->readahead.set_trigger_requests(1);
9664 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9665 uint64_t max_readahead = Readahead::NO_LIMIT;
9666 if (conf->client_readahead_max_bytes) {
9667 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9668 }
9669 if (conf->client_readahead_max_periods) {
9670 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9671 }
9672 f->readahead.set_max_readahead_size(max_readahead);
9673 vector<uint64_t> alignments;
9674 alignments.push_back(in->layout.get_period());
9675 alignments.push_back(in->layout.stripe_unit);
9676 f->readahead.set_alignments(alignments);
9677
9678 return f;
9679 }
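
// Worked readahead sizing (illustrative, assuming defaults): with a layout of
// one 4 MiB stripe per period and client_readahead_max_periods = 4,
// max_readahead above comes to 16 MiB; a nonzero client_readahead_max_bytes
// would clamp it further, and the alignments keep requests on period and
// stripe-unit boundaries.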
9680
9681 int Client::_release_fh(Fh *f)
9682 {
9683 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9684 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9685 Inode *in = f->inode.get();
9686 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9687
9688 in->unset_deleg(f);
9689
9690 if (in->snapid == CEPH_NOSNAP) {
9691 if (in->put_open_ref(f->mode)) {
9692 _flush(in, new C_Client_FlushComplete(this, in));
9693 check_caps(in, 0);
9694 }
9695 } else {
9696 ceph_assert(in->snap_cap_refs > 0);
9697 in->snap_cap_refs--;
9698 }
9699
9700 _release_filelocks(f);
9701
9702 // Finally, read any async err (i.e. from flushes)
9703 int err = f->take_async_err();
9704 if (err != 0) {
9705 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9706 << cpp_strerror(err) << dendl;
9707 } else {
9708 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9709 }
9710
9711 _put_fh(f);
9712
9713 return err;
9714 }
9715
9716 void Client::_put_fh(Fh *f)
9717 {
9718 int left = f->put();
9719 if (!left) {
9720 delete f;
9721 }
9722 }
9723
9724 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9725 const UserPerm& perms)
9726 {
9727 if (in->snapid != CEPH_NOSNAP &&
9728 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9729 return -CEPHFS_EROFS;
9730 }
9731
9732 // use normalized flags to generate cmode
9733 int cflags = ceph_flags_sys2wire(flags);
9734 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9735 cflags |= CEPH_O_LAZY;
9736
9737 int cmode = ceph_flags_to_mode(cflags);
9738 int want = ceph_caps_for_mode(cmode);
9739 int result = 0;
9740
9741 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9742
9743 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9744 // update wanted?
9745 check_caps(in, CHECK_CAPS_NODELAY);
9746 } else {
9747
9748 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9749 filepath path;
9750 in->make_nosnap_relative_path(path);
9751 req->set_filepath(path);
9752 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9753 req->head.args.open.mode = mode;
9754 req->head.args.open.pool = -1;
9755 if (cct->_conf->client_debug_getattr_caps)
9756 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9757 else
9758 req->head.args.open.mask = 0;
9759 req->head.args.open.old_size = in->size; // for O_TRUNC
9760 req->set_inode(in);
9761 result = make_request(req, perms);
9762
9763 /*
9764 * NFS expects that delegations will be broken on a conflicting open,
9765 * not just when there is actual conflicting access to the file. SMB leases
9766 * and oplocks also have similar semantics.
9767 *
9768 * Ensure that clients that have delegations enabled will wait on minimal
9769 * caps during open, just to ensure that other clients holding delegations
9770 * return theirs first.
9771 */
9772 if (deleg_timeout && result == 0) {
9773 int need = 0, have;
9774
9775 if (cmode & CEPH_FILE_MODE_WR)
9776 need |= CEPH_CAP_FILE_WR;
9777 if (cmode & CEPH_FILE_MODE_RD)
9778 need |= CEPH_CAP_FILE_RD;
9779
9780 Fh fh(in, flags, cmode, fd_gen, perms);
9781 result = get_caps(&fh, need, want, &have, -1);
9782 if (result < 0) {
9783 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9784 " . Denying open: " <<
9785 cpp_strerror(result) << dendl;
9786 } else {
9787 put_cap_ref(in, need);
9788 }
9789 }
9790 }
9791
9792 // success?
9793 if (result >= 0) {
9794 if (fhp)
9795 *fhp = _create_fh(in, flags, cmode, perms);
9796 } else {
9797 in->put_open_ref(cmode);
9798 }
9799
9800 trim_cache();
9801
9802 return result;
9803 }
9804
9805 int Client::_renew_caps(Inode *in)
9806 {
9807 int wanted = in->caps_file_wanted();
9808 if (in->is_any_caps() &&
9809 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9810 check_caps(in, CHECK_CAPS_NODELAY);
9811 return 0;
9812 }
9813
9814 int flags = 0;
9815 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9816 flags = O_RDWR;
9817 else if (wanted & CEPH_CAP_FILE_RD)
9818 flags = O_RDONLY;
9819 else if (wanted & CEPH_CAP_FILE_WR)
9820 flags = O_WRONLY;
9821
9822 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9823 filepath path;
9824 in->make_nosnap_relative_path(path);
9825 req->set_filepath(path);
9826 req->head.args.open.flags = flags;
9827 req->head.args.open.pool = -1;
9828 if (cct->_conf->client_debug_getattr_caps)
9829 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9830 else
9831 req->head.args.open.mask = 0;
9832 req->set_inode(in);
9833
9834 // duplicate in case Cap goes away; not sure if that race is a concern?
9835 const UserPerm *pperm = in->get_best_perms();
9836 UserPerm perms;
9837 if (pperm != NULL)
9838 perms = *pperm;
9839 int ret = make_request(req, perms);
9840 return ret;
9841 }
9842
9843 int Client::_close(int fd)
9844 {
9845 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9846 tout(cct) << "close" << std::endl;
9847 tout(cct) << fd << std::endl;
9848
9849 Fh *fh = get_filehandle(fd);
9850 if (!fh)
9851 return -CEPHFS_EBADF;
9852 int err = _release_fh(fh);
9853 fd_map.erase(fd);
9854 put_fd(fd);
9855 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9856 return err;
9857 }
9858
9859 int Client::close(int fd) {
9860 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9861 if (!mref_reader.is_state_satisfied())
9862 return -CEPHFS_ENOTCONN;
9863
9864 std::scoped_lock lock(client_lock);
9865 return _close(fd);
9866 }
9867
9868 // ------------
9869 // read, write
9870
9871 loff_t Client::lseek(int fd, loff_t offset, int whence)
9872 {
9873 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9874 if (!mref_reader.is_state_satisfied())
9875 return -CEPHFS_ENOTCONN;
9876
9877 tout(cct) << "lseek" << std::endl;
9878 tout(cct) << fd << std::endl;
9879 tout(cct) << offset << std::endl;
9880 tout(cct) << whence << std::endl;
9881
9882 std::scoped_lock lock(client_lock);
9883 Fh *f = get_filehandle(fd);
9884 if (!f)
9885 return -CEPHFS_EBADF;
9886 #if defined(__linux__) && defined(O_PATH)
9887 if (f->flags & O_PATH)
9888 return -CEPHFS_EBADF;
9889 #endif
9890 return _lseek(f, offset, whence);
9891 }
9892
9893 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9894 {
9895 Inode *in = f->inode.get();
9896 bool whence_check = false;
9897 loff_t pos = -1;
9898
9899 switch (whence) {
9900 case SEEK_END:
9901 whence_check = true;
9902 break;
9903
9904 #ifdef SEEK_DATA
9905 case SEEK_DATA:
9906 whence_check = true;
9907 break;
9908 #endif
9909
9910 #ifdef SEEK_HOLE
9911 case SEEK_HOLE:
9912 whence_check = true;
9913 break;
9914 #endif
9915 }
9916
9917 if (whence_check) {
9918 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9919 if (r < 0)
9920 return r;
9921 }
9922
9923 switch (whence) {
9924 case SEEK_SET:
9925 pos = offset;
9926 break;
9927
9928 case SEEK_CUR:
9929 pos = f->pos + offset;
9930 break;
9931
9932 case SEEK_END:
9933 pos = in->size + offset;
9934 break;
9935
9936 #ifdef SEEK_DATA
9937 case SEEK_DATA:
9938 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9939 return -CEPHFS_ENXIO;
9940 pos = offset;
9941 break;
9942 #endif
9943
9944 #ifdef SEEK_HOLE
9945 case SEEK_HOLE:
9946 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9947 return -CEPHFS_ENXIO;
9948 pos = in->size;
9949 break;
9950 #endif
9951
9952 default:
9953 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9954 return -CEPHFS_EINVAL;
9955 }
9956
9957 if (pos < 0) {
9958 return -CEPHFS_EINVAL;
9959 } else {
9960 f->pos = pos;
9961 }
9962
9963 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9964 return f->pos;
9965 }
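
// Note on SEEK_DATA/SEEK_HOLE above: the client never consults the object
// layout, so a file is modeled as a single data extent followed by a hole at
// EOF: SEEK_DATA returns the requested offset (when below the file size) and
// SEEK_HOLE returns the file size.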
9966
9967
9968 void Client::lock_fh_pos(Fh *f)
9969 {
9970 ldout(cct, 10) << __func__ << " " << f << dendl;
9971
9972 if (f->pos_locked || !f->pos_waiters.empty()) {
9973 ceph::condition_variable cond;
9974 f->pos_waiters.push_back(&cond);
9975 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9976 std::unique_lock l{client_lock, std::adopt_lock};
9977 cond.wait(l, [f, me=&cond] {
9978 return !f->pos_locked && f->pos_waiters.front() == me;
9979 });
9980 l.release();
9981 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9982 ceph_assert(f->pos_waiters.front() == &cond);
9983 f->pos_waiters.pop_front();
9984 }
9985
9986 f->pos_locked = true;
9987 }
9988
9989 void Client::unlock_fh_pos(Fh *f)
9990 {
9991 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9992
9993 ldout(cct, 10) << __func__ << " " << f << dendl;
9994 f->pos_locked = false;
9995 if (!f->pos_waiters.empty()) {
9996 // only wake up the oldest waiter
9997 auto cond = f->pos_waiters.front();
9998 cond->notify_one();
9999 }
10000 }
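
// Together these form a FIFO lock on the file position: each waiter in
// lock_fh_pos() blocks on its own condvar until it reaches the queue head,
// and unlock_fh_pos() wakes only the oldest waiter, so pos-relative I/O is
// serialized in arrival order.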
10001
10002 int Client::uninline_data(Inode *in, Context *onfinish)
10003 {
10004 if (!in->inline_data.length()) {
10005 onfinish->complete(0);
10006 return 0;
10007 }
10008
10009 char oid_buf[32];
10010 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
10011 object_t oid = oid_buf;
10012
10013 ObjectOperation create_ops;
10014 create_ops.create(false);
10015
10016 objecter->mutate(oid,
10017 OSDMap::file_to_object_locator(in->layout),
10018 create_ops,
10019 in->snaprealm->get_snap_context(),
10020 ceph::real_clock::now(),
10021 0,
10022 NULL);
10023
10024 bufferlist inline_version_bl;
10025 encode(in->inline_version, inline_version_bl);
10026
10027 ObjectOperation uninline_ops;
10028 uninline_ops.cmpxattr("inline_version",
10029 CEPH_OSD_CMPXATTR_OP_GT,
10030 CEPH_OSD_CMPXATTR_MODE_U64,
10031 inline_version_bl);
10032 bufferlist inline_data = in->inline_data;
10033 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
10034 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
10035
10036 objecter->mutate(oid,
10037 OSDMap::file_to_object_locator(in->layout),
10038 uninline_ops,
10039 in->snaprealm->get_snap_context(),
10040 ceph::real_clock::now(),
10041 0,
10042 onfinish);
10043
10044 return 0;
10045 }
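
// The two mutates above split the migration into an idempotent create of the
// first object followed by a write fenced by a cmpxattr comparison on the
// "inline_version" xattr, so racing uninline attempts from multiple clients
// cannot both apply stale inline data.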
10046
10047 //
10048
10049 // blocking osd interface
10050
10051 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
10052 {
10053 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10054 if (!mref_reader.is_state_satisfied())
10055 return -CEPHFS_ENOTCONN;
10056
10057 tout(cct) << "read" << std::endl;
10058 tout(cct) << fd << std::endl;
10059 tout(cct) << size << std::endl;
10060 tout(cct) << offset << std::endl;
10061
10062 std::unique_lock lock(client_lock);
10063 Fh *f = get_filehandle(fd);
10064 if (!f)
10065 return -CEPHFS_EBADF;
10066 #if defined(__linux__) && defined(O_PATH)
10067 if (f->flags & O_PATH)
10068 return -CEPHFS_EBADF;
10069 #endif
10070 bufferlist bl;
10071 /* We can't return a read count larger than INT_MAX, clamp size to that */
10072 size = std::min(size, (loff_t)INT_MAX);
10073 int r = _read(f, offset, size, &bl);
10074 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
10075 if (r >= 0) {
10076 lock.unlock();
10077 bl.begin().copy(bl.length(), buf);
10078 r = bl.length();
10079 }
10080 return r;
10081 }
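
// pread-style sketch (assumes a mounted Client* client and an fd opened for
// reading):
//
//   char buf[4096];
//   int n = client->read(fd, buf, sizeof(buf), 0);  // explicit offset 0
//   if (n < 0)
//     ...                                           // e.g. -CEPHFS_EBADF
//   // pass offset -1 instead to read at (and advance) the file position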
10082
10083 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10084 {
10085 if (iovcnt < 0)
10086 return -CEPHFS_EINVAL;
10087 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10088 }
10089
10090 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
10091 {
10092 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10093
10094 int want, have = 0;
10095 bool movepos = false;
10096 std::unique_ptr<C_SaferCond> onuninline;
10097 int64_t rc = 0;
10098 const auto& conf = cct->_conf;
10099 Inode *in = f->inode.get();
10100 utime_t lat;
10101 utime_t start = ceph_clock_now();
10102
10103 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
10104 return -CEPHFS_EBADF;
10105 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10106
10107 if (offset < 0) {
10108 lock_fh_pos(f);
10109 offset = f->pos;
10110 movepos = true;
10111 }
10112 loff_t start_pos = offset;
10113
10114 if (in->inline_version == 0) {
10115 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10116 if (r < 0) {
10117 rc = r;
10118 goto done;
10119 }
10120 ceph_assert(in->inline_version > 0);
10121 }
10122
10123 retry:
10124 if (f->mode & CEPH_FILE_MODE_LAZY)
10125 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
10126 else
10127 want = CEPH_CAP_FILE_CACHE;
10128 {
10129 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
10130 if (r < 0) {
10131 rc = r;
10132 goto done;
10133 }
10134 }
10135 if (f->flags & O_DIRECT)
10136 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
10137
10138 if (in->inline_version < CEPH_INLINE_NONE) {
10139 if (!(have & CEPH_CAP_FILE_CACHE)) {
10140 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
10141 uninline_data(in, onuninline.get());
10142 } else {
10143 uint32_t len = in->inline_data.length();
10144 uint64_t endoff = offset + size;
10145 if (endoff > in->size)
10146 endoff = in->size;
10147
10148 if (offset < len) {
10149 if (endoff <= len) {
10150 bl->substr_of(in->inline_data, offset, endoff - offset);
10151 } else {
10152 bl->substr_of(in->inline_data, offset, len - offset);
10153 bl->append_zero(endoff - len);
10154 }
10155 rc = endoff - offset;
10156 } else if ((uint64_t)offset < endoff) {
10157 bl->append_zero(endoff - offset);
10158 rc = endoff - offset;
10159 } else {
10160 rc = 0;
10161 }
10162 goto success;
10163 }
10164 }
10165
10166 if (!conf->client_debug_force_sync_read &&
10167 conf->client_oc &&
10168 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
10169
10170 if (f->flags & O_RSYNC) {
10171 _flush_range(in, offset, size);
10172 }
10173 rc = _read_async(f, offset, size, bl);
10174 if (rc < 0)
10175 goto done;
10176 } else {
10177 if (f->flags & O_DIRECT)
10178 _flush_range(in, offset, size);
10179
10180 bool checkeof = false;
10181 rc = _read_sync(f, offset, size, bl, &checkeof);
10182 if (rc < 0)
10183 goto done;
10184 if (checkeof) {
10185 offset += rc;
10186 size -= rc;
10187
10188 put_cap_ref(in, CEPH_CAP_FILE_RD);
10189 have = 0;
10190 // reverify size
10191 {
10192 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10193 if (r < 0) {
10194 rc = r;
10195 goto done;
10196 }
10197 }
10198
10199 // eof? short read.
10200 if ((uint64_t)offset < in->size)
10201 goto retry;
10202 }
10203 }
10204
10205 success:
10206 ceph_assert(rc >= 0);
10207 update_read_io_size(bl->length());
10208 if (movepos) {
10209 // adjust fd pos
10210 f->pos = start_pos + rc;
10211 }
10212
10213 lat = ceph_clock_now();
10214 lat -= start;
10215
10216 ++nr_read_request;
10217 update_io_stat_read(lat);
10218
10219 done:
10220 // done!
10221
10222 if (onuninline) {
10223 client_lock.unlock();
10224 int ret = onuninline->wait();
10225 client_lock.lock();
10226 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
10227 in->inline_data.clear();
10228 in->inline_version = CEPH_INLINE_NONE;
10229 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10230 check_caps(in, 0);
10231 } else
10232 rc = ret;
10233 }
10234 if (have) {
10235 put_cap_ref(in, CEPH_CAP_FILE_RD);
10236 }
10237 if (movepos) {
10238 unlock_fh_pos(f);
10239 }
10240 return rc;
10241 }
10242
10243 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
10244 client(c), f(f) {
10245 f->get();
10246 f->readahead.inc_pending();
10247 }
10248
10249 Client::C_Readahead::~C_Readahead() {
10250 f->readahead.dec_pending();
10251 client->_put_fh(f);
10252 }
10253
10254 void Client::C_Readahead::finish(int r) {
10255 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
10256 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10257 if (r > 0) {
10258 client->update_read_io_size(r);
10259 }
10260 }
10261
10262 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
10263 {
10264 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10265
10266 const auto& conf = cct->_conf;
10267 Inode *in = f->inode.get();
10268
10269 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10270
10271 // trim read based on file size?
10272 if (off >= in->size)
10273 return 0;
10274 if (len == 0)
10275 return 0;
10276 if (off + len > in->size) {
10277 len = in->size - off;
10278 }
10279
10280 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
10281 << " max_bytes=" << f->readahead.get_max_readahead_size()
10282 << " max_periods=" << conf->client_readahead_max_periods << dendl;
10283
10284 // read (and possibly block)
10285 int r = 0;
10286 C_SaferCond onfinish("Client::_read_async flock");
10287 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10288 off, len, bl, 0, &onfinish);
10289 if (r == 0) {
10290 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
10291 client_lock.unlock();
10292 r = onfinish.wait();
10293 client_lock.lock();
10294 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
10295 update_read_io_size(bl->length());
10296 }
10297
10298 if (f->readahead.get_min_readahead_size() > 0) {
10299 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
10300 if (readahead_extent.second > 0) {
10301 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
10302 << " (caller wants " << off << "~" << len << ")" << dendl;
10303 Context *onfinish2 = new C_Readahead(this, f);
10304 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10305 readahead_extent.first, readahead_extent.second,
10306 NULL, 0, onfinish2);
10307 if (r2 == 0) {
10308 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
10309 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10310 } else {
10311 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
10312 delete onfinish2;
10313 }
10314 }
10315 }
10316
10317 return r;
10318 }
10319
10320 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
10321 bool *checkeof)
10322 {
10323 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10324
10325 Inode *in = f->inode.get();
10326 uint64_t pos = off;
10327 int left = len;
10328 int read = 0;
10329
10330 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10331
10332 // Returns 0 on success (done), 1 to continue the loop, and < 0 on error.
10333 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
10334 int r = onfinish.wait();
10335
10336 // if we get ENOENT from OSD, assume 0 bytes returned
10337 if (r == -CEPHFS_ENOENT)
10338 r = 0;
10339 if (r < 0)
10340 return r;
10341
10342 if (tbl.length()) {
10343 r = tbl.length();
10344
10345 read += r;
10346 pos += r;
10347 left -= r;
10348 bl->claim_append(tbl);
10349 }
10350 // short read?
10351 if (r >= 0 && r < wanted) {
10352 if (pos < in->size) {
10353 // zero up to known EOF
10354 int64_t some = in->size - pos;
10355 if (some > left)
10356 some = left;
10357 auto z = buffer::ptr_node::create(some);
10358 z->zero();
10359 bl->push_back(std::move(z));
10360 read += some;
10361 pos += some;
10362 left -= some;
10363 if (left == 0)
10364 return 0;
10365 }
10366
10367 *checkeof = true;
10368 return 0;
10369 }
10370 return 1;
10371 };
10372
10373 while (left > 0) {
10374 C_SaferCond onfinish("Client::_read_sync flock");
10375 bufferlist tbl;
10376
10377 int wanted = left;
10378 filer->read_trunc(in->ino, &in->layout, in->snapid,
10379 pos, left, &tbl, 0,
10380 in->truncate_size, in->truncate_seq,
10381 &onfinish);
10382 client_lock.unlock();
10383 int r = wait_and_copy(onfinish, tbl, wanted);
10384 client_lock.lock();
10385 if (!r)
10386 return read;
10387 if (r < 0)
10388 return r;
10389 }
10390 return read;
10391 }
10392
10393 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10394 {
10395 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10396 if (!mref_reader.is_state_satisfied())
10397 return -CEPHFS_ENOTCONN;
10398
10399 tout(cct) << "write" << std::endl;
10400 tout(cct) << fd << std::endl;
10401 tout(cct) << size << std::endl;
10402 tout(cct) << offset << std::endl;
10403
10404 std::scoped_lock lock(client_lock);
10405 Fh *fh = get_filehandle(fd);
10406 if (!fh)
10407 return -CEPHFS_EBADF;
10408 #if defined(__linux__) && defined(O_PATH)
10409 if (fh->flags & O_PATH)
10410 return -CEPHFS_EBADF;
10411 #endif
10412 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10413 size = std::min(size, (loff_t)INT_MAX);
10414 int r = _write(fh, offset, size, buf, NULL, false);
10415 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10416 return r;
10417 }
10418
10419 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10420 {
10421 if (iovcnt < 0)
10422 return -CEPHFS_EINVAL;
10423 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10424 }
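
// Vectored I/O sketch (assumes a mounted Client* client and an fd opened
// read/write; buffer names and sizes are illustrative):
//
//   char hdr[16], payload[4080];
//   struct iovec iov[2] = { { hdr, sizeof(hdr) },
//                           { payload, sizeof(payload) } };
//   int w = client->pwritev(fd, iov, 2, 0);  // gathers up to 4096 bytes
//   int r = client->preadv(fd, iov, 2, 0);   // scatters them back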
10425
10426 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10427 unsigned iovcnt, int64_t offset,
10428 bool write, bool clamp_to_int)
10429 {
10430 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10431
10432 #if defined(__linux__) && defined(O_PATH)
10433 if (fh->flags & O_PATH)
10434 return -CEPHFS_EBADF;
10435 #endif
10436 loff_t totallen = 0;
10437 for (unsigned i = 0; i < iovcnt; i++) {
10438 totallen += iov[i].iov_len;
10439 }
10440
10441 /*
10442 * Some of the API functions take 64-bit size values, but only return
10443 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10444 * we don't do I/Os larger than the values we can return.
10445 */
10446 if (clamp_to_int) {
10447 totallen = std::min(totallen, (loff_t)INT_MAX);
10448 }
10449 if (write) {
10450 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10451 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10452 return w;
10453 } else {
10454 bufferlist bl;
10455 int64_t r = _read(fh, offset, totallen, &bl);
10456 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10457 if (r <= 0)
10458 return r;
10459
10460 client_lock.unlock();
10461 auto iter = bl.cbegin();
10462 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10463 /*
10464 * This piece of code aims to handle the case that bufferlist
10465 * does not have enough data to fill in the iov
10466 */
10467 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10468 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10469 resid -= round_size;
10470 /* iter is self-updating */
10471 }
10472 client_lock.lock();
10473 return r;
10474 }
10475 }
10476
10477 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10478 {
10479 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10480 if (!mref_reader.is_state_satisfied())
10481 return -CEPHFS_ENOTCONN;
10482
10483 tout(cct) << fd << std::endl;
10484 tout(cct) << offset << std::endl;
10485
10486 std::scoped_lock cl(client_lock);
10487 Fh *fh = get_filehandle(fd);
10488 if (!fh)
10489 return -CEPHFS_EBADF;
10490 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10491 }
10492
10493 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10494 const struct iovec *iov, int iovcnt)
10495 {
10496 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10497
10498 uint64_t fpos = 0;
10499 Inode *in = f->inode.get();
10500
10501 if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
10502 (uint64_t)(offset+size) > in->size ) { //exceeds filesize
10503 return -CEPHFS_EFBIG;
10504 }
10505 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10506
10507 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10508 return -CEPHFS_ENOSPC;
10509 }
10510
10511 ceph_assert(in->snapid == CEPH_NOSNAP);
10512
10513 // was Fh opened as writeable?
10514 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10515 return -CEPHFS_EBADF;
10516
10517 // use/adjust fd pos?
10518 if (offset < 0) {
10519 lock_fh_pos(f);
10520 /*
10521 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10522 * change out from under us.
10523 */
10524 if (f->flags & O_APPEND) {
10525 auto r = _lseek(f, 0, SEEK_END);
10526 if (r < 0) {
10527 unlock_fh_pos(f);
10528 return r;
10529 }
10530 }
10531 offset = f->pos;
10532 fpos = offset+size;
10533 unlock_fh_pos(f);
10534 }
10535
10536 // check quota
10537 uint64_t endoff = offset + size;
10538 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10539 f->actor_perms)) {
10540 return -CEPHFS_EDQUOT;
10541 }
10542
10543 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10544
10545 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10546
10547 // time it.
10548 utime_t start = ceph_clock_now();
10549
10550 if (in->inline_version == 0) {
10551 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10552 if (r < 0)
10553 return r;
10554 ceph_assert(in->inline_version > 0);
10555 }
10556
10557 // copy into a fresh buffer (since our write may be resubmitted asynchronously)
10558 bufferlist bl;
10559 if (buf) {
10560 if (size > 0)
10561 bl.append(buf, size);
10562 } else if (iov) {
10563 for (int i = 0; i < iovcnt; i++) {
10564 if (iov[i].iov_len > 0) {
10565 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10566 }
10567 }
10568 }
10569
10570 utime_t lat;
10571 uint64_t totalwritten;
10572 int want, have;
10573 if (f->mode & CEPH_FILE_MODE_LAZY)
10574 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10575 else
10576 want = CEPH_CAP_FILE_BUFFER;
10577 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10578 if (r < 0)
10579 return r;
10580
10581 /* clear the setuid/setgid bits, if any */
10582 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10583 struct ceph_statx stx = { 0 };
10584
10585 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10586 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10587 if (r < 0)
10588 return r;
10589 } else {
10590 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10591 }
10592
10593 if (f->flags & O_DIRECT)
10594 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10595
10596 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10597
10598 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10599
10600 if (in->inline_version < CEPH_INLINE_NONE) {
10601 if (endoff > cct->_conf->client_max_inline_size ||
10602 endoff > CEPH_INLINE_MAX_SIZE ||
10603 !(have & CEPH_CAP_FILE_BUFFER)) {
10604 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10605 uninline_data(in, onuninline.get());
10606 } else {
10607 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10608
10609 uint32_t len = in->inline_data.length();
10610
10611 if (endoff < len)
10612 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10613
10614 if (offset < len)
10615 in->inline_data.splice(offset, len - offset);
10616 else if (offset > len)
10617 in->inline_data.append_zero(offset - len);
10618
10619 in->inline_data.append(bl);
10620 in->inline_version++;
10621
10622 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10623
10624 goto success;
10625 }
10626 }
10627
10628 if (cct->_conf->client_oc &&
10629 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10630 // do buffered write
10631 if (!in->oset.dirty_or_tx)
10632 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10633
10634 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10635
10636 // async, caching, non-blocking.
10637 r = objectcacher->file_write(&in->oset, &in->layout,
10638 in->snaprealm->get_snap_context(),
10639 offset, size, bl, ceph::real_clock::now(),
10640 0);
10641 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10642
10643 if (r < 0)
10644 goto done;
10645
10646 // flush cached write if O_SYNC is set on file fh
10647 // O_DSYNC == O_SYNC on linux < 2.6.33
10648 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10649 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10650 _flush_range(in, offset, size);
10651 }
10652 } else {
10653 if (f->flags & O_DIRECT)
10654 _flush_range(in, offset, size);
10655
10656 // simple, non-atomic sync write
10657 C_SaferCond onfinish("Client::_write flock");
10658 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10659
10660 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10661 offset, size, bl, ceph::real_clock::now(), 0,
10662 in->truncate_size, in->truncate_seq,
10663 &onfinish);
10664 client_lock.unlock();
10665 r = onfinish.wait();
10666 client_lock.lock();
10667 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10668 if (r < 0)
10669 goto done;
10670 }
10671
10672 // if we get here, write was successful, update client metadata
10673 success:
10674 update_write_io_size(size);
10675 // time
10676 lat = ceph_clock_now();
10677 lat -= start;
10678
10679 ++nr_write_request;
10680 update_io_stat_write(lat);
10681
10682 if (fpos) {
10683 lock_fh_pos(f);
10684 f->pos = fpos;
10685 unlock_fh_pos(f);
10686 }
10687 totalwritten = size;
10688 r = (int64_t)totalwritten;
10689
10690 // extend file?
10691 if (totalwritten + offset > in->size) {
10692 in->size = totalwritten + offset;
10693 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10694
10695 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10696 check_caps(in, CHECK_CAPS_NODELAY);
10697 } else if (is_max_size_approaching(in)) {
10698 check_caps(in, 0);
10699 }
10700
10701 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10702 } else {
10703 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10704 }
10705
10706 // mtime
10707 in->mtime = in->ctime = ceph_clock_now();
10708 in->change_attr++;
10709 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10710
10711 done:
10712
10713 if (nullptr != onuninline) {
10714 client_lock.unlock();
10715 int uninline_ret = onuninline->wait();
10716 client_lock.lock();
10717
10718 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10719 in->inline_data.clear();
10720 in->inline_version = CEPH_INLINE_NONE;
10721 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10722 check_caps(in, 0);
10723 } else
10724 r = uninline_ret;
10725 }
10726
10727 put_cap_ref(in, CEPH_CAP_FILE_WR);
10728 return r;
10729 }
10730
10731 int Client::_flush(Fh *f)
10732 {
10733 Inode *in = f->inode.get();
10734 int err = f->take_async_err();
10735 if (err != 0) {
10736 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10737 << cpp_strerror(err) << dendl;
10738 } else {
10739 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10740 }
10741
10742 return err;
10743 }
10744
10745 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10746 {
10747 struct ceph_statx stx;
10748 stx.stx_size = length;
10749 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10750 }
10751
10752 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10753 {
10754 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10755 if (!mref_reader.is_state_satisfied())
10756 return -CEPHFS_ENOTCONN;
10757
10758 tout(cct) << __func__ << std::endl;
10759 tout(cct) << fd << std::endl;
10760 tout(cct) << length << std::endl;
10761
10762 std::scoped_lock lock(client_lock);
10763 Fh *f = get_filehandle(fd);
10764 if (!f)
10765 return -CEPHFS_EBADF;
10766 #if defined(__linux__) && defined(O_PATH)
10767 if (f->flags & O_PATH)
10768 return -CEPHFS_EBADF;
10769 #endif
10770 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10771 return -CEPHFS_EBADF;
10772 struct stat attr;
10773 attr.st_size = length;
10774 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10775 }
10776
10777 int Client::fsync(int fd, bool syncdataonly)
10778 {
10779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10780 if (!mref_reader.is_state_satisfied())
10781 return -CEPHFS_ENOTCONN;
10782
10783 tout(cct) << "fsync" << std::endl;
10784 tout(cct) << fd << std::endl;
10785 tout(cct) << syncdataonly << std::endl;
10786
10787 std::scoped_lock lock(client_lock);
10788 Fh *f = get_filehandle(fd);
10789 if (!f)
10790 return -CEPHFS_EBADF;
10791 #if defined(__linux__) && defined(O_PATH)
10792 if (f->flags & O_PATH)
10793 return -CEPHFS_EBADF;
10794 #endif
10795 int r = _fsync(f, syncdataonly);
10796 if (r == 0) {
10797 // The IOs in this fsync were okay, but maybe something happened
10798 // in the background that we should be reporting?
10799 r = f->take_async_err();
10800 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10801 << ") = 0, async_err = " << r << dendl;
10802 } else {
10803 // Assume that an error we encountered during fsync, even reported
10804 // synchronously, would also have applied the error to the Fh, and we
10805 // should clear it here to avoid returning the same error again on next
10806 // call.
10807 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10808 << r << dendl;
10809 f->take_async_err();
10810 }
10811 return r;
10812 }
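
// Durability sketch (assumes a mounted Client* client and a writable fd): the
// fsync return value folds in any asynchronous writeback error recorded on
// the Fh, so it should be checked even when the preceding write succeeded.
//
//   if (client->write(fd, buf, len, -1) >= 0) {
//     int r = client->fsync(fd, true /* syncdataonly */);
//     if (r < 0)
//       ...  // e.g. an error surfaced from background flushes
//   }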
10813
10814 int Client::_fsync(Inode *in, bool syncdataonly)
10815 {
10816 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10817
10818 int r = 0;
10819 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10820 ceph_tid_t flush_tid = 0;
10821 InodeRef tmp_ref;
10822 utime_t lat;
10823 utime_t start = ceph_clock_now();
10824
10825 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10826
10827 if (cct->_conf->client_oc) {
10828 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10829 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10830 _flush(in, object_cacher_completion.get());
10831 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10832 }
10833
10834 if (!syncdataonly && in->dirty_caps) {
10835 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10836 if (in->flushing_caps)
10837 flush_tid = last_flush_tid;
10838 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10839
10840 if (!syncdataonly && !in->unsafe_ops.empty()) {
10841 flush_mdlog_sync(in);
10842
10843 MetaRequest *req = in->unsafe_ops.back();
10844 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10845
10846 req->get();
10847 wait_on_list(req->waitfor_safe);
10848 put_request(req);
10849 }
10850
10851 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10852 client_lock.unlock();
10853 ldout(cct, 15) << "waiting on data to flush" << dendl;
10854 r = object_cacher_completion->wait();
10855 client_lock.lock();
10856 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10857 } else {
10858 // FIXME: this can starve
10859 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10860 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10861 << " uncommitted, waiting" << dendl;
10862 wait_on_list(in->waitfor_commit);
10863 }
10864 }
10865
10866 if (!r) {
10867 if (flush_tid > 0)
10868 wait_sync_caps(in, flush_tid);
10869
10870 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10871 } else {
10872 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10873 << cpp_strerror(-r) << dendl;
10874 }
10875
10876 lat = ceph_clock_now();
10877 lat -= start;
10878 logger->tinc(l_c_fsync, lat);
10879
10880 return r;
10881 }
10882
10883 int Client::_fsync(Fh *f, bool syncdataonly)
10884 {
10885 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10886 return _fsync(f->inode.get(), syncdataonly);
10887 }
10888
10889 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10890 {
10891 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10892 if (!mref_reader.is_state_satisfied())
10893 return -CEPHFS_ENOTCONN;
10894
10895 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10896 tout(cct) << fd << std::endl;
10897
10898 std::scoped_lock lock(client_lock);
10899 Fh *f = get_filehandle(fd);
10900 if (!f)
10901 return -CEPHFS_EBADF;
10902 int r = _getattr(f->inode, mask, perms);
10903 if (r < 0)
10904 return r;
10905 fill_stat(f->inode, stbuf, NULL);
10906 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10907 return r;
10908 }
10909
10910 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10911 unsigned int want, unsigned int flags)
10912 {
10913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10914 if (!mref_reader.is_state_satisfied())
10915 return -CEPHFS_ENOTCONN;
10916
10917 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10918 tout(cct) << fd << std::endl;
10919
10920 std::scoped_lock lock(client_lock);
10921 Fh *f = get_filehandle(fd);
10922 if (!f)
10923 return -CEPHFS_EBADF;
10924
10925 unsigned mask = statx_to_mask(flags, want);
10926
10927 int r = 0;
10928 if (mask) {
10929 r = _getattr(f->inode, mask, perms);
10930 if (r < 0) {
10931 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10932 return r;
10933 }
10934 }
10935
10936 fill_statx(f->inode, mask, stx);
10937 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10938 return r;
10939 }
10940
10941 int Client::statxat(int dirfd, const char *relpath,
10942 struct ceph_statx *stx, const UserPerm& perms,
10943 unsigned int want, unsigned int flags) {
10944 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10945 if (!mref_reader.is_state_satisfied()) {
10946 return -CEPHFS_ENOTCONN;
10947 }
10948
10949 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10950 tout(cct) << dirfd << std::endl;
10951 tout(cct) << relpath << std::endl;
10952
10953 unsigned mask = statx_to_mask(flags, want);
10954
10955 InodeRef dirinode;
10956 std::scoped_lock lock(client_lock);
10957 int r = get_fd_inode(dirfd, &dirinode);
10958 if (r < 0) {
10959 return r;
10960 }
10961
10962 InodeRef in;
10963 filepath path(relpath);
10964 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10965 if (r < 0) {
10966 return r;
10967 }
10968 r = _getattr(in, mask, perms);
10969 if (r < 0) {
10970 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10971 return r;
10972 }
10973
10974 fill_statx(in, mask, stx);
10975 ldout(cct, 3) << __func__ << " dirfd " << dirfd << ", r = " << r << dendl;
10976 return r;
10977 }
10978
10979 // not written yet, but i want to link!
10980
10981 int Client::chdir(const char *relpath, std::string &new_cwd,
10982 const UserPerm& perms)
10983 {
10984 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10985 if (!mref_reader.is_state_satisfied())
10986 return -CEPHFS_ENOTCONN;
10987
10988 tout(cct) << "chdir" << std::endl;
10989 tout(cct) << relpath << std::endl;
10990
10991 filepath path(relpath);
10992 InodeRef in;
10993
10994 std::scoped_lock lock(client_lock);
10995 int r = path_walk(path, &in, perms);
10996 if (r < 0)
10997 return r;
10998
10999 if (!(in.get()->is_dir()))
11000 return -CEPHFS_ENOTDIR;
11001
11002 if (cwd != in)
11003 cwd.swap(in);
11004 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11005
11006 _getcwd(new_cwd, perms);
11007 return 0;
11008 }
11009
11010 void Client::_getcwd(string& dir, const UserPerm& perms)
11011 {
11012 filepath path;
11013 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
11014
11015 Inode *in = cwd.get();
11016 while (in != root.get()) {
11017 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
11018
11019 // The cwd or an ancestor is unlinked
11020 if (in->dentries.empty()) {
11021 return;
11022 }
11023
11024 Dentry *dn = in->get_first_parent();
11025
11026
11027 if (!dn) {
11028 // look it up
11029 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
11030 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
11031 filepath path(in->ino);
11032 req->set_filepath(path);
11033 req->set_inode(in);
11034 int res = make_request(req, perms);
11035 if (res < 0)
11036 break;
11037
11038 // start over
11039 path = filepath();
11040 in = cwd.get();
11041 continue;
11042 }
11043 path.push_front_dentry(dn->name);
11044 in = dn->dir->parent_inode;
11045 }
11046 dir = "/";
11047 dir += path.get_path();
11048 }
11049
11050 void Client::getcwd(string& dir, const UserPerm& perms)
11051 {
11052 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11053 if (!mref_reader.is_state_satisfied())
11054 return;
11055
11056 std::scoped_lock l(client_lock);
11057
11058 _getcwd(dir, perms);
11059 }
11060
11061 int Client::statfs(const char *path, struct statvfs *stbuf,
11062 const UserPerm& perms)
11063 {
11064 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11065 if (!mref_reader.is_state_satisfied())
11066 return -CEPHFS_ENOTCONN;
11067
11068 tout(cct) << __func__ << std::endl;
11069 unsigned long int total_files_on_fs;
11070
11071 ceph_statfs stats;
11072 C_SaferCond cond;
11073
11074 std::unique_lock lock(client_lock);
11075 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
11076 if (data_pools.size() == 1) {
11077 objecter->get_fs_stats(stats, data_pools[0], &cond);
11078 } else {
11079 objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
11080 }
11081
11082 lock.unlock();
11083 int rval = cond.wait();
11084 lock.lock();
11085
11086 ceph_assert(root);
11087 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
11088
11089 if (rval < 0) {
11090 ldout(cct, 1) << "underlying call to statfs returned error: "
11091 << cpp_strerror(rval)
11092 << dendl;
11093 return rval;
11094 }
11095
11096 memset(stbuf, 0, sizeof(*stbuf));
11097
11098 /*
11099 * We're going to set a block size of 4MB so we can represent larger
11100 * FSes without overflowing. Additionally, convert the space
11101 * measurements from KB into units of these 4MB blocks. We use 4MB
11102 * only because it is big enough, and because it actually *is* the
11103 * (ceph) default block size.
11104 */
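/*
 * Worked example (illustrative arithmetic, not upstream code): with
 * CEPH_BLOCK_SHIFT = 22 the block size is 1 << 22 = 4 MiB, and the KB
 * counters below are converted with kb >> (22 - 10). E.g. stats.kb =
 * 1073741824 (1 TiB expressed in KB) becomes 1073741824 >> 12 = 262144
 * blocks, and 262144 * 4 MiB = 1 TiB again.
 */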
11105 const int CEPH_BLOCK_SHIFT = 22;
11106 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
11107 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
11108 stbuf->f_files = total_files_on_fs;
11109 stbuf->f_ffree = -1;
11110 stbuf->f_favail = -1;
11111 stbuf->f_fsid = -1; // ??
11112 stbuf->f_flag = 0; // ??
11113 stbuf->f_namemax = NAME_MAX;
11114
11115 // Usually quota_root will equal root_ancestor, but if the mount root has
11116 // no quota while a parent of it that we can see does have one, we'll
11117 // respect that parent's quota instead.
11118 ceph_assert(root != nullptr);
11119 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
11120
11121 // get_quota_root should always give us something if client quotas are
11122 // enabled
11123 ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);
11124
11125 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
11126
11127 // Skip the getattr if any sessions are stale, as we don't want to
11128 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11129 // is unhealthy.
11130 if (!_any_stale_sessions()) {
11131 int r = _getattr(quota_root, 0, perms, true);
11132 if (r != 0) {
11133 // Ignore return value: error getting latest inode metadata is not a good
11134 // reason to break "df".
11135 lderr(cct) << "Error in getattr on quota root 0x"
11136 << std::hex << quota_root->ino << std::dec
11137 << " statfs result may be outdated" << dendl;
11138 }
11139 }
11140
11141 // Special case: if there is a size quota set on the Inode acting
11142 // as the root for this client mount, then report the quota status
11143 // as the filesystem statistics.
11144 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
11145 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
11146 // It is possible for a quota to be exceeded: arithmetic here must
11147 // handle case where used > total.
11148 const fsblkcnt_t free = total > used ? total - used : 0;
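// Worked example (illustrative): quota.max_bytes = 10 GiB gives
// total = (10 << 30) >> 22 = 2560 blocks; if rstat.rbytes = 12 GiB then
// used = 3072 > total, so free clamps to 0 rather than underflowing.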
11149
11150 stbuf->f_blocks = total;
11151 stbuf->f_bfree = free;
11152 stbuf->f_bavail = free;
11153 } else {
11154 // General case: report the cluster statistics returned from RADOS. Because
11155 // multiple pools may be used within one filesystem namespace via
11156 // layouts, this is the most correct thing we can do.
11157 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
11158 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11159 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11160 }
11161
11162 return rval;
11163 }
11164
11165 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
11166 struct flock *fl, uint64_t owner, bool removing)
11167 {
11168 ldout(cct, 10) << __func__ << " ino " << in->ino
11169 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
11170 << " type " << fl->l_type << " owner " << owner
11171 << " " << fl->l_start << "~" << fl->l_len << dendl;
11172
11173 if (in->flags & I_ERROR_FILELOCK)
11174 return -CEPHFS_EIO;
11175
11176 int lock_cmd;
11177 if (F_RDLCK == fl->l_type)
11178 lock_cmd = CEPH_LOCK_SHARED;
11179 else if (F_WRLCK == fl->l_type)
11180 lock_cmd = CEPH_LOCK_EXCL;
11181 else if (F_UNLCK == fl->l_type)
11182 lock_cmd = CEPH_LOCK_UNLOCK;
11183 else
11184 return -CEPHFS_EIO;
11185
11186 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
11187 sleep = 0;
11188
11189 /*
11190 * Set the most significant bit, so that the MDS knows the 'owner'
11191 * alone is sufficient to identify the owner of the lock. (Old code
11192 * used both 'owner' and 'pid'.)
11193 */
11194 owner |= (1ULL << 63);
11195
11196 MetaRequest *req = new MetaRequest(op);
11197 filepath path;
11198 in->make_nosnap_relative_path(path);
11199 req->set_filepath(path);
11200 req->set_inode(in);
11201
11202 req->head.args.filelock_change.rule = lock_type;
11203 req->head.args.filelock_change.type = lock_cmd;
11204 req->head.args.filelock_change.owner = owner;
11205 req->head.args.filelock_change.pid = fl->l_pid;
11206 req->head.args.filelock_change.start = fl->l_start;
11207 req->head.args.filelock_change.length = fl->l_len;
11208 req->head.args.filelock_change.wait = sleep;
11209
11210 int ret;
11211 bufferlist bl;
11212
11213 if (sleep && switch_interrupt_cb) {
11214 // enable interrupt
11215 switch_interrupt_cb(callback_handle, req->get());
11216 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11217 // disable interrupt
11218 switch_interrupt_cb(callback_handle, NULL);
11219 if (ret == 0 && req->aborted()) {
11220 // effect of this lock request has been revoked by the 'lock intr' request
11221 ret = req->get_abort_code();
11222 }
11223 put_request(req);
11224 } else {
11225 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11226 }
11227
11228 if (ret == 0) {
11229 if (op == CEPH_MDS_OP_GETFILELOCK) {
11230 ceph_filelock filelock;
11231 auto p = bl.cbegin();
11232 decode(filelock, p);
11233
11234 if (CEPH_LOCK_SHARED == filelock.type)
11235 fl->l_type = F_RDLCK;
11236 else if (CEPH_LOCK_EXCL == filelock.type)
11237 fl->l_type = F_WRLCK;
11238 else
11239 fl->l_type = F_UNLCK;
11240
11241 fl->l_whence = SEEK_SET;
11242 fl->l_start = filelock.start;
11243 fl->l_len = filelock.length;
11244 fl->l_pid = filelock.pid;
11245 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
11246 ceph_lock_state_t *lock_state;
11247 if (lock_type == CEPH_LOCK_FCNTL) {
11248 if (!in->fcntl_locks)
11249 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11250 lock_state = in->fcntl_locks.get();
11251 } else if (lock_type == CEPH_LOCK_FLOCK) {
11252 if (!in->flock_locks)
11253 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11254 lock_state = in->flock_locks.get();
11255 } else {
11256 ceph_abort();
11257 return -CEPHFS_EINVAL;
11258 }
11259 _update_lock_state(fl, owner, lock_state);
11260
11261 if (!removing) {
11262 if (lock_type == CEPH_LOCK_FCNTL) {
11263 if (!fh->fcntl_locks)
11264 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11265 lock_state = fh->fcntl_locks.get();
11266 } else {
11267 if (!fh->flock_locks)
11268 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11269 lock_state = fh->flock_locks.get();
11270 }
11271 _update_lock_state(fl, owner, lock_state);
11272 }
11273 } else
11274 ceph_abort();
11275 }
11276 return ret;
11277 }
11278
11279 int Client::_interrupt_filelock(MetaRequest *req)
11280 {
11281 // Set abort code, but do not kick. The abort code prevents the request
11282 // from being re-sent.
11283 req->abort(-CEPHFS_EINTR);
11284 if (req->mds < 0)
11285 return 0; // haven't sent the request
11286
11287 Inode *in = req->inode();
11288
11289 int lock_type;
11290 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11291 lock_type = CEPH_LOCK_FLOCK_INTR;
11292 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11293 lock_type = CEPH_LOCK_FCNTL_INTR;
11294 else {
11295 ceph_abort();
11296 return -CEPHFS_EINVAL;
11297 }
11298
11299 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11300 filepath path;
11301 in->make_nosnap_relative_path(path);
11302 intr_req->set_filepath(path);
11303 intr_req->set_inode(in);
11304 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11305 intr_req->head.args.filelock_change.rule = lock_type;
11306 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11307
11308 UserPerm perms(req->get_uid(), req->get_gid());
11309 return make_request(intr_req, perms, NULL, NULL, -1);
11310 }
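/*
 * Flow summary (descriptive sketch, not upstream code): when _do_filelock()
 * issues a blocking SETFILELOCK with switch_interrupt_cb registered, an
 * interrupt arrives here as _interrupt_filelock(), which aborts the pending
 * request with -CEPHFS_EINTR and, if it was already sent to an MDS, asks the
 * MDS to cancel it via a CEPH_LOCK_FLOCK_INTR / CEPH_LOCK_FCNTL_INTR unlock
 * of the same range.
 */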
11311
11312 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11313 {
11314 if (!in->fcntl_locks && !in->flock_locks)
11315 return;
11316
11317 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11318 encode(nr_fcntl_locks, bl);
11319 if (nr_fcntl_locks) {
11320 auto &lock_state = in->fcntl_locks;
11321 for(auto p = lock_state->held_locks.begin();
11322 p != lock_state->held_locks.end();
11323 ++p)
11324 encode(p->second, bl);
11325 }
11326
11327 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11328 encode(nr_flock_locks, bl);
11329 if (nr_flock_locks) {
11330 auto &lock_state = in->flock_locks;
11331 for(auto p = lock_state->held_locks.begin();
11332 p != lock_state->held_locks.end();
11333 ++p)
11334 encode(p->second, bl);
11335 }
11336
11337 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11338 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11339 }
11340
11341 void Client::_release_filelocks(Fh *fh)
11342 {
11343 if (!fh->fcntl_locks && !fh->flock_locks)
11344 return;
11345
11346 Inode *in = fh->inode.get();
11347 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
11348
11349 list<ceph_filelock> activated_locks;
11350
11351 list<pair<int, ceph_filelock> > to_release;
11352
11353 if (fh->fcntl_locks) {
11354 auto &lock_state = fh->fcntl_locks;
11355 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11356 auto q = p++;
11357 if (in->flags & I_ERROR_FILELOCK) {
11358 lock_state->remove_lock(q->second, activated_locks);
11359 } else {
11360 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11361 }
11362 }
11363 lock_state.reset();
11364 }
11365 if (fh->flock_locks) {
11366 auto &lock_state = fh->flock_locks;
11367 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11368 auto q = p++;
11369 if (in->flags & I_ERROR_FILELOCK) {
11370 lock_state->remove_lock(q->second, activated_locks);
11371 } else {
11372 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11373 }
11374 }
11375 lock_state.reset();
11376 }
11377
11378 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11379 in->flags &= ~I_ERROR_FILELOCK;
11380
11381 if (to_release.empty())
11382 return;
11383
11384 struct flock fl;
11385 memset(&fl, 0, sizeof(fl));
11386 fl.l_whence = SEEK_SET;
11387 fl.l_type = F_UNLCK;
11388
11389 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11390 p != to_release.end();
11391 ++p) {
11392 fl.l_start = p->second.start;
11393 fl.l_len = p->second.length;
11394 fl.l_pid = p->second.pid;
11395 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11396 p->second.owner, true);
11397 }
11398 }
11399
11400 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11401 ceph_lock_state_t *lock_state)
11402 {
11403 int lock_cmd;
11404 if (F_RDLCK == fl->l_type)
11405 lock_cmd = CEPH_LOCK_SHARED;
11406 else if (F_WRLCK == fl->l_type)
11407 lock_cmd = CEPH_LOCK_EXCL;
11408 else
11409 lock_cmd = CEPH_LOCK_UNLOCK;
11410
11411 ceph_filelock filelock;
11412 filelock.start = fl->l_start;
11413 filelock.length = fl->l_len;
11414 filelock.client = 0;
11415 // see comment in _do_filelock()
11416 filelock.owner = owner | (1ULL << 63);
11417 filelock.pid = fl->l_pid;
11418 filelock.type = lock_cmd;
11419
11420 if (filelock.type == CEPH_LOCK_UNLOCK) {
11421 list<ceph_filelock> activated_locks;
11422 lock_state->remove_lock(filelock, activated_locks);
11423 } else {
11424 bool r = lock_state->add_lock(filelock, false, false, NULL);
11425 ceph_assert(r);
11426 }
11427 }
11428
11429 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11430 {
11431 Inode *in = fh->inode.get();
11432 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11433 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11434 return ret;
11435 }
11436
11437 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11438 {
11439 Inode *in = fh->inode.get();
11440 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11441 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11442 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11443 return ret;
11444 }
11445
11446 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11447 {
11448 Inode *in = fh->inode.get();
11449 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11450
11451 int sleep = !(cmd & LOCK_NB);
11452 cmd &= ~LOCK_NB;
11453
11454 int type;
11455 switch (cmd) {
11456 case LOCK_SH:
11457 type = F_RDLCK;
11458 break;
11459 case LOCK_EX:
11460 type = F_WRLCK;
11461 break;
11462 case LOCK_UN:
11463 type = F_UNLCK;
11464 break;
11465 default:
11466 return -CEPHFS_EINVAL;
11467 }
11468
11469 struct flock fl;
11470 memset(&fl, 0, sizeof(fl));
11471 fl.l_type = type;
11472 fl.l_whence = SEEK_SET;
11473
11474 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11475 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11476 return ret;
11477 }
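/*
 * Illustrative mapping (sketch): a caller issuing a non-blocking exclusive
 * request passes cmd = LOCK_EX | LOCK_NB, which _flock() above turns into
 * sleep = 0 and type = F_WRLCK before handing it to _do_filelock() as a
 * CEPH_LOCK_FLOCK / CEPH_MDS_OP_SETFILELOCK request.
 */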
11478
11479 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11480 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11481 if (!mref_reader.is_state_satisfied()) {
11482 return -CEPHFS_ENOTCONN;
11483 }
11484
11485 std::scoped_lock lock(client_lock);
11486 InodeRef in;
11487 int r = Client::path_walk(path, &in, perms, true);
11488 if (r < 0) {
11489 return r;
11490 }
11491
11492 if (in->snapid == CEPH_NOSNAP) {
11493 return -CEPHFS_EINVAL;
11494 }
11495
11496 snap_info->id = in->snapid;
11497 snap_info->metadata = in->snap_metadata;
11498 return 0;
11499 }
11500
11501 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11502 {
11503 /* Since the only thing this does is wrap a call to statfs, and
11504 statfs takes a lock, it doesn't seem we have a need to split it
11505 out. */
11506 return statfs(0, stbuf, perms);
11507 }
11508
11509 void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
11510 {
11511 if (!args)
11512 return;
11513
11514 ldout(cct, 10) << __func__ << " cb " << args->handle
11515 << " invalidate_ino_cb " << args->ino_cb
11516 << " invalidate_dentry_cb " << args->dentry_cb
11517 << " switch_interrupt_cb " << args->switch_intr_cb
11518 << " remount_cb " << args->remount_cb
11519 << dendl;
11520 callback_handle = args->handle;
11521 if (args->ino_cb) {
11522 ino_invalidate_cb = args->ino_cb;
11523 async_ino_invalidator.start();
11524 }
11525 if (args->dentry_cb) {
11526 dentry_invalidate_cb = args->dentry_cb;
11527 async_dentry_invalidator.start();
11528 }
11529 if (args->switch_intr_cb) {
11530 switch_interrupt_cb = args->switch_intr_cb;
11531 interrupt_finisher.start();
11532 }
11533 if (args->remount_cb) {
11534 remount_cb = args->remount_cb;
11535 remount_finisher.start();
11536 }
11537 if (args->ino_release_cb) {
11538 ino_release_cb = args->ino_release_cb;
11539 async_ino_releasor.start();
11540 }
11541 if (args->umask_cb)
11542 umask_cb = args->umask_cb;
11543 }
11544
11545 // This is deprecated, use ll_register_callbacks2() instead.
11546 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11547 {
11548 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11549
11550 _ll_register_callbacks(args);
11551 }
11552
11553 int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11554 {
11555 if (is_mounting() || is_mounted() || is_unmounting())
11556 return -CEPHFS_EBUSY;
11557
11558 _ll_register_callbacks(args);
11559 return 0;
11560 }
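/*
 * Illustrative usage sketch (the callback implementations named here are
 * hypothetical): callbacks must be registered before mounting, e.g.
 *
 *   struct ceph_client_callback_args args = {};
 *   args.handle = my_handle;           // opaque, handed back to every callback
 *   args.ino_cb = my_ino_invalidate;   // hypothetical implementations
 *   args.dentry_cb = my_dentry_invalidate;
 *   if (client->ll_register_callbacks2(&args) == -CEPHFS_EBUSY) {
 *     // too late: the client is already mounting, mounted, or unmounting
 *   }
 */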
11561
11562 std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
11563 {
11564 std::pair <int, bool> r(0, false);
11565
11566 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11567 if (!iref_reader.is_state_satisfied())
11568 return std::make_pair(-CEPHFS_ENOTCONN, false);
11569
11570 can_invalidate_dentries = can_invalidate;
11571
11572 if (can_invalidate_dentries) {
11573 ceph_assert(dentry_invalidate_cb);
11574 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11575 } else {
11576 ceph_assert(remount_cb);
11577 ldout(cct, 1) << "using remount_cb" << dendl;
11578 r = _do_remount(false);
11579 }
11580
11581 return r;
11582 }
11583
11584 int Client::_sync_fs()
11585 {
11586 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11587
11588 ldout(cct, 10) << __func__ << dendl;
11589
11590 // flush file data
11591 std::unique_ptr<C_SaferCond> cond = nullptr;
11592 if (cct->_conf->client_oc) {
11593 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11594 objectcacher->flush_all(cond.get());
11595 }
11596
11597 // flush caps
11598 flush_caps_sync();
11599 ceph_tid_t flush_tid = last_flush_tid;
11600
11601 // wait for unsafe mds requests
11602 wait_unsafe_requests();
11603
11604 wait_sync_caps(flush_tid);
11605
11606 if (nullptr != cond) {
11607 client_lock.unlock();
11608 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11609 cond->wait();
11610 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11611 client_lock.lock();
11612 }
11613
11614 return 0;
11615 }
11616
11617 int Client::sync_fs()
11618 {
11619 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11620 if (!mref_reader.is_state_satisfied())
11621 return -CEPHFS_ENOTCONN;
11622
11623 std::scoped_lock l(client_lock);
11624
11625 return _sync_fs();
11626 }
11627
11628 int64_t Client::drop_caches()
11629 {
11630 std::scoped_lock l(client_lock);
11631 return objectcacher->release_all();
11632 }
11633
11634 int Client::_lazyio(Fh *fh, int enable)
11635 {
11636 Inode *in = fh->inode.get();
11637 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11638
11639 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11640 return 0;
11641
11642 int orig_mode = fh->mode;
11643 if (enable) {
11644 fh->mode |= CEPH_FILE_MODE_LAZY;
11645 in->get_open_ref(fh->mode);
11646 in->put_open_ref(orig_mode);
11647 check_caps(in, CHECK_CAPS_NODELAY);
11648 } else {
11649 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11650 in->get_open_ref(fh->mode);
11651 in->put_open_ref(orig_mode);
11652 check_caps(in, 0);
11653 }
11654
11655 return 0;
11656 }
11657
11658 int Client::lazyio(int fd, int enable)
11659 {
11660 std::scoped_lock l(client_lock);
11661 Fh *f = get_filehandle(fd);
11662 if (!f)
11663 return -CEPHFS_EBADF;
11664
11665 return _lazyio(f, enable);
11666 }
11667
11668 int Client::ll_lazyio(Fh *fh, int enable)
11669 {
11670 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11671 tout(cct) << __func__ << std::endl;
11672
11673 std::scoped_lock lock(client_lock);
11674 return _lazyio(fh, enable);
11675 }
11676
11677 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11678 {
11679 std::scoped_lock l(client_lock);
11680 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11681 << ", " << offset << ", " << count << ")" << dendl;
11682
11683 Fh *f = get_filehandle(fd);
11684 if (!f)
11685 return -CEPHFS_EBADF;
11686
11687 // for now
11688 _fsync(f, true);
11689
11690 return 0;
11691 }
11692
11693 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11694 {
11695 std::scoped_lock l(client_lock);
11696 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11697 << ", " << offset << ", " << count << ")" << dendl;
11698
11699 Fh *f = get_filehandle(fd);
11700 if (!f)
11701 return -CEPHFS_EBADF;
11702 Inode *in = f->inode.get();
11703
11704 _fsync(f, true);
11705 if (_release(in)) {
11706 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11707 if (r < 0)
11708 return r;
11709 }
11710 return 0;
11711 }
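/*
 * Note on the lazyio helpers above (sketch of intended use): a writer calls
 * lazyio_propagate(fd, off, len) to force its dirty data out (implemented
 * here, for now, as a full _fsync), and a reader calls lazyio_synchronize()
 * to drop cached data via _release() and refresh the file size with
 * _getattr() so it can observe the writer's changes.
 */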
11712
11713
11714 // =============================
11715 // snaps
11716
11717 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11718 mode_t mode, const std::map<std::string, std::string> &metadata)
11719 {
11720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11721 if (!mref_reader.is_state_satisfied())
11722 return -CEPHFS_ENOTCONN;
11723
11724 std::scoped_lock l(client_lock);
11725
11726 filepath path(relpath);
11727 InodeRef in;
11728 int r = path_walk(path, &in, perm);
11729 if (r < 0)
11730 return r;
11731 if (cct->_conf->client_permissions) {
11732 r = may_create(in.get(), perm);
11733 if (r < 0)
11734 return r;
11735 }
11736 Inode *snapdir = open_snapdir(in.get());
11737 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11738 }
11739
11740 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11741 {
11742 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11743 if (!mref_reader.is_state_satisfied())
11744 return -CEPHFS_ENOTCONN;
11745
11746 std::scoped_lock l(client_lock);
11747
11748 filepath path(relpath);
11749 InodeRef in;
11750 int r = path_walk(path, &in, perms);
11751 if (r < 0)
11752 return r;
11753 Inode *snapdir = open_snapdir(in.get());
11754 if (cct->_conf->client_permissions) {
11755 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11756 if (r < 0)
11757 return r;
11758 }
11759 return _rmdir(snapdir, name, perms);
11760 }
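/*
 * Illustrative usage (sketch; assumes the default ".snap" snapdir name):
 *   client->mksnap("/some/dir", "mysnap", perms, 0755, {});
 * behaves like "mkdir /some/dir/.snap/mysnap" -- both calls above resolve
 * the target directory, open its snapdir inode, and then reuse the ordinary
 * _mkdir()/_rmdir() paths on it.
 */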
11761
11762 // =============================
11763 // expose caps
11764
11765 int Client::get_caps_issued(int fd)
11766 {
11767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11768 if (!mref_reader.is_state_satisfied())
11769 return -CEPHFS_ENOTCONN;
11770
11771 std::scoped_lock lock(client_lock);
11772
11773 Fh *f = get_filehandle(fd);
11774 if (!f)
11775 return -CEPHFS_EBADF;
11776
11777 return f->inode->caps_issued();
11778 }
11779
11780 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11781 {
11782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11783 if (!mref_reader.is_state_satisfied())
11784 return -CEPHFS_ENOTCONN;
11785
11786 std::scoped_lock lock(client_lock);
11787
11788 filepath p(path);
11789 InodeRef in;
11790 int r = path_walk(p, &in, perms, true);
11791 if (r < 0)
11792 return r;
11793 return in->caps_issued();
11794 }
11795
11796 // =========================================
11797 // low level
11798
11799 Inode *Client::open_snapdir(Inode *diri)
11800 {
11801 Inode *in;
11802 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11803 if (!inode_map.count(vino)) {
11804 in = new Inode(this, vino, &diri->layout);
11805
11806 in->ino = diri->ino;
11807 in->snapid = CEPH_SNAPDIR;
11808 in->mode = diri->mode;
11809 in->uid = diri->uid;
11810 in->gid = diri->gid;
11811 in->nlink = 1;
11812 in->mtime = diri->mtime;
11813 in->ctime = diri->ctime;
11814 in->btime = diri->btime;
11815 in->atime = diri->atime;
11816 in->size = diri->size;
11817 in->change_attr = diri->change_attr;
11818
11819 in->dirfragtree.clear();
11820 in->snapdir_parent = diri;
11821 diri->flags |= I_SNAPDIR_OPEN;
11822 inode_map[vino] = in;
11823 if (use_faked_inos())
11824 _assign_faked_ino(in);
11825 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11826 } else {
11827 in = inode_map[vino];
11828 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11829 }
11830 return in;
11831 }
11832
11833 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11834 Inode **out, const UserPerm& perms)
11835 {
11836 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11837 if (!mref_reader.is_state_satisfied())
11838 return -CEPHFS_ENOTCONN;
11839
11840 vinodeno_t vparent = _get_vino(parent);
11841 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11842 tout(cct) << __func__ << std::endl;
11843 tout(cct) << name << std::endl;
11844
11845 std::scoped_lock lock(client_lock);
11846
11847 int r = 0;
11848 if (!fuse_default_permissions) {
11849 if (strcmp(name, ".") && strcmp(name, "..")) {
11850 r = may_lookup(parent, perms);
11851 if (r < 0)
11852 return r;
11853 }
11854 }
11855
11856 string dname(name);
11857 InodeRef in;
11858
11859 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11860 if (r < 0) {
11861 attr->st_ino = 0;
11862 goto out;
11863 }
11864
11865 ceph_assert(in);
11866 fill_stat(in, attr);
11867 _ll_get(in.get());
11868
11869 out:
11870 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11871 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11872 tout(cct) << attr->st_ino << std::endl;
11873 *out = in.get();
11874 return r;
11875 }
11876
11877 int Client::ll_lookup_vino(
11878 vinodeno_t vino,
11879 const UserPerm& perms,
11880 Inode **inode)
11881 {
11882 ceph_assert(inode != NULL);
11883 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11884 if (!mref_reader.is_state_satisfied())
11885 return -CEPHFS_ENOTCONN;
11886
11887 if (is_reserved_vino(vino))
11888 return -CEPHFS_ESTALE;
11889
11890 std::scoped_lock lock(client_lock);
11891 ldout(cct, 3) << __func__ << " " << vino << dendl;
11892
11893 // Check the cache first
11894 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11895 if (p != inode_map.end()) {
11896 *inode = p->second;
11897 _ll_get(*inode);
11898 return 0;
11899 }
11900
11901 uint64_t snapid = vino.snapid;
11902
11903 // for snapdir, find the non-snapped dir inode
11904 if (snapid == CEPH_SNAPDIR)
11905 vino.snapid = CEPH_NOSNAP;
11906
11907 int r = _lookup_vino(vino, perms, inode);
11908 if (r)
11909 return r;
11910 ceph_assert(*inode != NULL);
11911
11912 if (snapid == CEPH_SNAPDIR) {
11913 Inode *tmp = *inode;
11914
11915 // open the snapdir and put the inode ref
11916 *inode = open_snapdir(tmp);
11917 _ll_forget(tmp, 1);
11918 _ll_get(*inode);
11919 }
11920 return 0;
11921 }
11922
11923 int Client::ll_lookup_inode(
11924 struct inodeno_t ino,
11925 const UserPerm& perms,
11926 Inode **inode)
11927 {
11928 vinodeno_t vino(ino, CEPH_NOSNAP);
11929 return ll_lookup_vino(vino, perms, inode);
11930 }
11931
11932 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11933 struct ceph_statx *stx, unsigned want, unsigned flags,
11934 const UserPerm& perms)
11935 {
11936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11937 if (!mref_reader.is_state_satisfied())
11938 return -CEPHFS_ENOTCONN;
11939
11940 vinodeno_t vparent = _get_vino(parent);
11941 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11942 tout(cct) << "ll_lookupx" << std::endl;
11943 tout(cct) << name << std::endl;
11944
11945 std::scoped_lock lock(client_lock);
11946
11947 int r = 0;
11948 if (!fuse_default_permissions) {
11949 r = may_lookup(parent, perms);
11950 if (r < 0)
11951 return r;
11952 }
11953
11954 string dname(name);
11955 InodeRef in;
11956
11957 unsigned mask = statx_to_mask(flags, want);
11958 r = _lookup(parent, dname, mask, &in, perms);
11959 if (r < 0) {
11960 stx->stx_ino = 0;
11961 stx->stx_mask = 0;
11962 } else {
11963 ceph_assert(in);
11964 fill_statx(in, mask, stx);
11965 _ll_get(in.get());
11966 }
11967
11968 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11969 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11970 tout(cct) << stx->stx_ino << std::endl;
11971 *out = in.get();
11972 return r;
11973 }
11974
11975 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11976 unsigned int want, unsigned int flags, const UserPerm& perms)
11977 {
11978 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11979 if (!mref_reader.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN;
11981
11982 filepath fp(name, 0);
11983 InodeRef in;
11984 int rc;
11985 unsigned mask = statx_to_mask(flags, want);
11986
11987 ldout(cct, 3) << __func__ << " " << name << dendl;
11988 tout(cct) << __func__ << std::endl;
11989 tout(cct) << name << std::endl;
11990
11991 std::scoped_lock lock(client_lock);
11992 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11993 if (rc < 0) {
11994 /* zero out mask, just in case... */
11995 stx->stx_mask = 0;
11996 stx->stx_ino = 0;
11997 *out = NULL;
11998 return rc;
11999 } else {
12000 ceph_assert(in);
12001 fill_statx(in, mask, stx);
12002 _ll_get(in.get());
12003 *out = in.get();
12004 return 0;
12005 }
12006 }
12007
12008 void Client::_ll_get(Inode *in)
12009 {
12010 if (in->ll_ref == 0) {
12011 in->iget();
12012 if (in->is_dir() && !in->dentries.empty()) {
12013 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
12014 in->get_first_parent()->get(); // pin dentry
12015 }
12016 if (in->snapid != CEPH_NOSNAP)
12017 ll_snap_ref[in->snapid]++;
12018 }
12019 in->ll_get();
12020 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
12021 }
12022
12023 int Client::_ll_put(Inode *in, uint64_t num)
12024 {
12025 in->ll_put(num);
12026 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
12027 if (in->ll_ref == 0) {
12028 if (in->is_dir() && !in->dentries.empty()) {
12029 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
12030 in->get_first_parent()->put(); // unpin dentry
12031 }
12032 if (in->snapid != CEPH_NOSNAP) {
12033 auto p = ll_snap_ref.find(in->snapid);
12034 ceph_assert(p != ll_snap_ref.end());
12035 ceph_assert(p->second > 0);
12036 if (--p->second == 0)
12037 ll_snap_ref.erase(p);
12038 }
12039 put_inode(in);
12040 return 0;
12041 } else {
12042 return in->ll_ref;
12043 }
12044 }
12045
12046 void Client::_ll_drop_pins()
12047 {
12048 ldout(cct, 10) << __func__ << dendl;
12049 std::set<InodeRef> to_be_put; // this set will be destructed item by item on function exit
12050 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
12051 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
12052 it != inode_map.end();
12053 it = next) {
12054 Inode *in = it->second;
12055 next = it;
12056 ++next;
12057 if (in->ll_ref){
12058 to_be_put.insert(in);
12059 _ll_put(in, in->ll_ref);
12060 }
12061 }
12062 }
12063
12064 bool Client::_ll_forget(Inode *in, uint64_t count)
12065 {
12066 inodeno_t ino = in->ino;
12067
12068 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
12069 tout(cct) << __func__ << std::endl;
12070 tout(cct) << ino.val << std::endl;
12071 tout(cct) << count << std::endl;
12072
12073 // Ignore forget if we're no longer mounted
12074 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12075 if (!mref_reader.is_state_satisfied())
12076 return true;
12077
12078 if (ino == 1) return true; // ignore forget on root.
12079
12080 bool last = false;
12081 if (in->ll_ref < count) {
12082 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
12083 << ", which only has ll_ref=" << in->ll_ref << dendl;
12084 _ll_put(in, in->ll_ref);
12085 last = true;
12086 } else {
12087 if (_ll_put(in, count) == 0)
12088 last = true;
12089 }
12090
12091 return last;
12092 }
12093
12094 bool Client::ll_forget(Inode *in, uint64_t count)
12095 {
12096 std::scoped_lock lock(client_lock);
12097 return _ll_forget(in, count);
12098 }
12099
12100 bool Client::ll_put(Inode *in)
12101 {
12102 /* ll_forget already takes the lock */
12103 return ll_forget(in, 1);
12104 }
12105
12106 int Client::ll_get_snap_ref(snapid_t snap)
12107 {
12108 std::scoped_lock lock(client_lock);
12109 auto p = ll_snap_ref.find(snap);
12110 if (p != ll_snap_ref.end())
12111 return p->second;
12112 return 0;
12113 }
12114
12115 snapid_t Client::ll_get_snapid(Inode *in)
12116 {
12117 std::scoped_lock lock(client_lock);
12118 return in->snapid;
12119 }
12120
12121 Inode *Client::ll_get_inode(ino_t ino)
12122 {
12123 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12124 if (!mref_reader.is_state_satisfied())
12125 return NULL;
12126
12127 std::scoped_lock lock(client_lock);
12128
12129 vinodeno_t vino = _map_faked_ino(ino);
12130 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12131 if (p == inode_map.end())
12132 return NULL;
12133 Inode *in = p->second;
12134 _ll_get(in);
12135 return in;
12136 }
12137
12138 Inode *Client::ll_get_inode(vinodeno_t vino)
12139 {
12140 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12141 if (!mref_reader.is_state_satisfied())
12142 return NULL;
12143
12144 if (is_reserved_vino(vino))
12145 return NULL;
12146
12147 std::scoped_lock lock(client_lock);
12148
12149 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12150 if (p == inode_map.end())
12151 return NULL;
12152 Inode *in = p->second;
12153 _ll_get(in);
12154 return in;
12155 }
12156
12157 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12158 {
12159 vinodeno_t vino = _get_vino(in);
12160
12161 ldout(cct, 8) << __func__ << " " << vino << dendl;
12162 tout(cct) << __func__ << std::endl;
12163 tout(cct) << vino.ino.val << std::endl;
12164
12165 if (vino.snapid < CEPH_NOSNAP)
12166 return 0;
12167 else
12168 return _getattr(in, caps, perms);
12169 }
12170
12171 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12172 {
12173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12174 if (!mref_reader.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN;
12176
12177 std::scoped_lock lock(client_lock);
12178
12179 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12180
12181 if (res == 0)
12182 fill_stat(in, attr);
12183 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12184 return res;
12185 }
12186
12187 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12188 unsigned int flags, const UserPerm& perms)
12189 {
12190 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12191 if (!mref_reader.is_state_satisfied())
12192 return -CEPHFS_ENOTCONN;
12193
12194 std::scoped_lock lock(client_lock);
12195
12196 int res = 0;
12197 unsigned mask = statx_to_mask(flags, want);
12198
12199 if (mask && !in->caps_issued_mask(mask, true))
12200 res = _ll_getattr(in, mask, perms);
12201
12202 if (res == 0)
12203 fill_statx(in, mask, stx);
12204 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12205 return res;
12206 }
12207
12208 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12209 const UserPerm& perms, InodeRef *inp)
12210 {
12211 vinodeno_t vino = _get_vino(in);
12212
12213 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
12214 << dendl;
12215 tout(cct) << __func__ << std::endl;
12216 tout(cct) << vino.ino.val << std::endl;
12217 tout(cct) << stx->stx_mode << std::endl;
12218 tout(cct) << stx->stx_uid << std::endl;
12219 tout(cct) << stx->stx_gid << std::endl;
12220 tout(cct) << stx->stx_size << std::endl;
12221 tout(cct) << stx->stx_mtime << std::endl;
12222 tout(cct) << stx->stx_atime << std::endl;
12223 tout(cct) << stx->stx_btime << std::endl;
12224 tout(cct) << mask << std::endl;
12225
12226 if (!fuse_default_permissions) {
12227 int res = may_setattr(in, stx, mask, perms);
12228 if (res < 0)
12229 return res;
12230 }
12231
12232 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
12233
12234 return __setattrx(in, stx, mask, perms, inp);
12235 }
12236
12237 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12238 const UserPerm& perms)
12239 {
12240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12241 if (!mref_reader.is_state_satisfied())
12242 return -CEPHFS_ENOTCONN;
12243
12244 std::scoped_lock lock(client_lock);
12245
12246 InodeRef target(in);
12247 int res = _ll_setattrx(in, stx, mask, perms, &target);
12248 if (res == 0) {
12249 ceph_assert(in == target.get());
12250 fill_statx(in, in->caps_issued(), stx);
12251 }
12252
12253 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12254 return res;
12255 }
12256
12257 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12258 const UserPerm& perms)
12259 {
12260 struct ceph_statx stx;
12261 stat_to_statx(attr, &stx);
12262
12263 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12264 if (!mref_reader.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN;
12266
12267 std::scoped_lock lock(client_lock);
12268
12269 InodeRef target(in);
12270 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12271 if (res == 0) {
12272 ceph_assert(in == target.get());
12273 fill_stat(in, attr);
12274 }
12275
12276 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12277 return res;
12278 }
12279
12280
12281 // ----------
12282 // xattrs
12283
12284 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12285 const UserPerm& perms)
12286 {
12287 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12288 if (!mref_reader.is_state_satisfied())
12289 return -CEPHFS_ENOTCONN;
12290
12291 std::scoped_lock lock(client_lock);
12292
12293 InodeRef in;
12294 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12295 if (r < 0)
12296 return r;
12297 return _getxattr(in, name, value, size, perms);
12298 }
12299
12300 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12301 const UserPerm& perms)
12302 {
12303 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12304 if (!mref_reader.is_state_satisfied())
12305 return -CEPHFS_ENOTCONN;
12306
12307 std::scoped_lock lock(client_lock);
12308
12309 InodeRef in;
12310 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12311 if (r < 0)
12312 return r;
12313 return _getxattr(in, name, value, size, perms);
12314 }
12315
12316 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12317 const UserPerm& perms)
12318 {
12319 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12320 if (!mref_reader.is_state_satisfied())
12321 return -CEPHFS_ENOTCONN;
12322
12323 std::scoped_lock lock(client_lock);
12324
12325 Fh *f = get_filehandle(fd);
12326 if (!f)
12327 return -CEPHFS_EBADF;
12328 return _getxattr(f->inode, name, value, size, perms);
12329 }
12330
12331 int Client::listxattr(const char *path, char *list, size_t size,
12332 const UserPerm& perms)
12333 {
12334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12335 if (!mref_reader.is_state_satisfied())
12336 return -CEPHFS_ENOTCONN;
12337
12338 std::scoped_lock lock(client_lock);
12339
12340 InodeRef in;
12341 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12342 if (r < 0)
12343 return r;
12344 return Client::_listxattr(in.get(), list, size, perms);
12345 }
12346
12347 int Client::llistxattr(const char *path, char *list, size_t size,
12348 const UserPerm& perms)
12349 {
12350 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12351 if (!mref_reader.is_state_satisfied())
12352 return -CEPHFS_ENOTCONN;
12353
12354 std::scoped_lock lock(client_lock);
12355
12356 InodeRef in;
12357 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12358 if (r < 0)
12359 return r;
12360 return Client::_listxattr(in.get(), list, size, perms);
12361 }
12362
12363 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12364 {
12365 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12366 if (!mref_reader.is_state_satisfied())
12367 return -CEPHFS_ENOTCONN;
12368
12369 std::scoped_lock lock(client_lock);
12370
12371 Fh *f = get_filehandle(fd);
12372 if (!f)
12373 return -CEPHFS_EBADF;
12374 return Client::_listxattr(f->inode.get(), list, size, perms);
12375 }
12376
12377 int Client::removexattr(const char *path, const char *name,
12378 const UserPerm& perms)
12379 {
12380 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12381 if (!mref_reader.is_state_satisfied())
12382 return -CEPHFS_ENOTCONN;
12383
12384 std::scoped_lock lock(client_lock);
12385
12386 InodeRef in;
12387 int r = Client::path_walk(path, &in, perms, true);
12388 if (r < 0)
12389 return r;
12390 return _removexattr(in, name, perms);
12391 }
12392
12393 int Client::lremovexattr(const char *path, const char *name,
12394 const UserPerm& perms)
12395 {
12396 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12397 if (!mref_reader.is_state_satisfied())
12398 return -CEPHFS_ENOTCONN;
12399
12400 std::scoped_lock lock(client_lock);
12401
12402 InodeRef in;
12403 int r = Client::path_walk(path, &in, perms, false);
12404 if (r < 0)
12405 return r;
12406 return _removexattr(in, name, perms);
12407 }
12408
12409 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12410 {
12411 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12412 if (!mref_reader.is_state_satisfied())
12413 return -CEPHFS_ENOTCONN;
12414
12415 std::scoped_lock lock(client_lock);
12416
12417 Fh *f = get_filehandle(fd);
12418 if (!f)
12419 return -CEPHFS_EBADF;
12420 return _removexattr(f->inode, name, perms);
12421 }
12422
12423 int Client::setxattr(const char *path, const char *name, const void *value,
12424 size_t size, int flags, const UserPerm& perms)
12425 {
12426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12427 if (!mref_reader.is_state_satisfied())
12428 return -CEPHFS_ENOTCONN;
12429
12430 _setxattr_maybe_wait_for_osdmap(name, value, size);
12431
12432 std::scoped_lock lock(client_lock);
12433
12434 InodeRef in;
12435 int r = Client::path_walk(path, &in, perms, true);
12436 if (r < 0)
12437 return r;
12438 return _setxattr(in, name, value, size, flags, perms);
12439 }
12440
12441 int Client::lsetxattr(const char *path, const char *name, const void *value,
12442 size_t size, int flags, const UserPerm& perms)
12443 {
12444 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12445 if (!mref_reader.is_state_satisfied())
12446 return -CEPHFS_ENOTCONN;
12447
12448 _setxattr_maybe_wait_for_osdmap(name, value, size);
12449
12450 std::scoped_lock lock(client_lock);
12451
12452 InodeRef in;
12453 int r = Client::path_walk(path, &in, perms, false);
12454 if (r < 0)
12455 return r;
12456 return _setxattr(in, name, value, size, flags, perms);
12457 }
12458
12459 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12460 int flags, const UserPerm& perms)
12461 {
12462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12463 if (!mref_reader.is_state_satisfied())
12464 return -CEPHFS_ENOTCONN;
12465
12466 _setxattr_maybe_wait_for_osdmap(name, value, size);
12467
12468 std::scoped_lock lock(client_lock);
12469
12470 Fh *f = get_filehandle(fd);
12471 if (!f)
12472 return -CEPHFS_EBADF;
12473 return _setxattr(f->inode, name, value, size, flags, perms);
12474 }
12475
12476 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12477 const UserPerm& perms)
12478 {
12479 int r;
12480 const VXattr *vxattr = nullptr;
12481
12482 vxattr = _match_vxattr(in, name);
12483 if (vxattr) {
12484 r = -CEPHFS_ENODATA;
12485
12486 // Do a forced getattr to get the latest quota before returning
12487 // a value to userspace.
12488 int flags = 0;
12489 if (vxattr->flags & VXATTR_RSTAT) {
12490 flags |= CEPH_STAT_RSTAT;
12491 }
12492 if (vxattr->flags & VXATTR_DIRSTAT) {
12493 flags |= CEPH_CAP_FILE_SHARED;
12494 }
12495 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12496 if (r != 0) {
12497 // Error from getattr!
12498 return r;
12499 }
12500
12501 // call pointer-to-member function
12502 char buf[256];
12503 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12504 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12505 } else {
12506 r = -CEPHFS_ENODATA;
12507 }
12508
12509 if (size != 0) {
12510 if (r > (int)size) {
12511 r = -CEPHFS_ERANGE;
12512 } else if (r > 0) {
12513 memcpy(value, buf, r);
12514 }
12515 }
12516 goto out;
12517 }
12518
12519 if (!strncmp(name, "ceph.", 5)) {
12520 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12521 goto out;
12522 }
12523
12524 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12525 r = -CEPHFS_EOPNOTSUPP;
12526 goto out;
12527 }
12528
12529 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12530 if (r == 0) {
12531 string n(name);
12532 r = -CEPHFS_ENODATA;
12533 if (in->xattrs.count(n)) {
12534 r = in->xattrs[n].length();
12535 if (r > 0 && size != 0) {
12536 if (size >= (unsigned)r)
12537 memcpy(value, in->xattrs[n].c_str(), r);
12538 else
12539 r = -CEPHFS_ERANGE;
12540 }
12541 }
12542 }
12543 out:
12544 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12545 return r;
12546 }
12547
12548 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12549 const UserPerm& perms)
12550 {
12551 if (cct->_conf->client_permissions) {
12552 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12553 if (r < 0)
12554 return r;
12555 }
12556 return _getxattr(in.get(), name, value, size, perms);
12557 }
12558
12559 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12560 size_t size, const UserPerm& perms)
12561 {
12562 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12563 if (!mref_reader.is_state_satisfied())
12564 return -CEPHFS_ENOTCONN;
12565
12566 vinodeno_t vino = _get_vino(in);
12567
12568 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12569 tout(cct) << __func__ << std::endl;
12570 tout(cct) << vino.ino.val << std::endl;
12571 tout(cct) << name << std::endl;
12572
12573 std::scoped_lock lock(client_lock);
12574 if (!fuse_default_permissions) {
12575 int r = xattr_permission(in, name, MAY_READ, perms);
12576 if (r < 0)
12577 return r;
12578 }
12579
12580 return _getxattr(in, name, value, size, perms);
12581 }
12582
12583 int Client::_listxattr(Inode *in, char *name, size_t size,
12584 const UserPerm& perms)
12585 {
12586 bool len_only = (size == 0);
12587 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12588 if (r != 0) {
12589 goto out;
12590 }
12591
12592 r = 0;
12593 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12594 if (xattr_name.rfind("ceph.", 0) == 0) {
12595 continue;
12596 }
12597
12598 size_t this_len = xattr_name.length() + 1;
12599 r += this_len;
12600 if (len_only)
12601 continue;
12602
12603 if (this_len > size) {
12604 r = -CEPHFS_ERANGE;
12605 goto out;
12606 }
12607
12608 memcpy(name, xattr_name.c_str(), this_len);
12609 name += this_len;
12610 size -= this_len;
12611 }
12612 out:
12613 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12614 return r;
12615 }
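/*
 * Illustrative two-pass usage (sketch; 'path' is a hypothetical caller
 * variable): size == 0 asks only for the required length, names are returned
 * NUL-terminated back to back, and a too-small buffer yields -CEPHFS_ERANGE:
 *
 *   int len = client->listxattr(path, nullptr, 0, perms);
 *   std::vector<char> buf(len > 0 ? len : 0);
 *   if (len > 0)
 *     len = client->listxattr(path, buf.data(), buf.size(), perms);
 */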
12616
12617 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12618 const UserPerm& perms)
12619 {
12620 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12621 if (!mref_reader.is_state_satisfied())
12622 return -CEPHFS_ENOTCONN;
12623
12624 vinodeno_t vino = _get_vino(in);
12625
12626 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12627 tout(cct) << __func__ << std::endl;
12628 tout(cct) << vino.ino.val << std::endl;
12629 tout(cct) << size << std::endl;
12630
12631 std::scoped_lock lock(client_lock);
12632 return _listxattr(in, names, size, perms);
12633 }
12634
12635 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12636 size_t size, int flags, const UserPerm& perms)
12637 {
12638
12639 int xattr_flags = 0;
12640 if (!value)
12641 xattr_flags |= CEPH_XATTR_REMOVE;
12642 if (flags & XATTR_CREATE)
12643 xattr_flags |= CEPH_XATTR_CREATE;
12644 if (flags & XATTR_REPLACE)
12645 xattr_flags |= CEPH_XATTR_REPLACE;
12646
12647 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12648 filepath path;
12649 in->make_nosnap_relative_path(path);
12650 req->set_filepath(path);
12651 req->set_string2(name);
12652 req->set_inode(in);
12653 req->head.args.setxattr.flags = xattr_flags;
12654
12655 bufferlist bl;
12656 ceph_assert(value || size == 0);
12657 bl.append((const char*)value, size);
12658 req->set_data(bl);
12659
12660 int res = make_request(req, perms);
12661
12662 trim_cache();
12663 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12664 res << dendl;
12665 return res;
12666 }
12667
12668 int Client::_setxattr(Inode *in, const char *name, const void *value,
12669 size_t size, int flags, const UserPerm& perms)
12670 {
12671 if (in->snapid != CEPH_NOSNAP) {
12672 return -CEPHFS_EROFS;
12673 }
12674
12675 if (size == 0) {
12676 value = "";
12677 } else if (value == NULL) {
12678 return -CEPHFS_EINVAL;
12679 }
12680
12681 bool posix_acl_xattr = false;
12682 if (acl_type == POSIX_ACL)
12683 posix_acl_xattr = !strncmp(name, "system.", 7);
12684
12685 if (strncmp(name, "user.", 5) &&
12686 strncmp(name, "security.", 9) &&
12687 strncmp(name, "trusted.", 8) &&
12688 strncmp(name, "ceph.", 5) &&
12689 !posix_acl_xattr)
12690 return -CEPHFS_EOPNOTSUPP;
12691
12692 bool check_realm = false;
12693
12694 if (posix_acl_xattr) {
12695 if (!strcmp(name, ACL_EA_ACCESS)) {
12696 mode_t new_mode = in->mode;
12697 if (value) {
12698 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12699 if (ret < 0)
12700 return ret;
12701 if (ret == 0) {
12702 value = NULL;
12703 size = 0;
12704 }
12705 if (new_mode != in->mode) {
12706 struct ceph_statx stx;
12707 stx.stx_mode = new_mode;
12708 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12709 if (ret < 0)
12710 return ret;
12711 }
12712 }
12713 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12714 if (value) {
12715 if (!S_ISDIR(in->mode))
12716 return -CEPHFS_EACCES;
12717 int ret = posix_acl_check(value, size);
12718 if (ret < 0)
12719 return -CEPHFS_EINVAL;
12720 if (ret == 0) {
12721 value = NULL;
12722 size = 0;
12723 }
12724 }
12725 } else {
12726 return -CEPHFS_EOPNOTSUPP;
12727 }
12728 } else {
12729 const VXattr *vxattr = _match_vxattr(in, name);
12730 if (vxattr) {
12731 if (vxattr->readonly)
12732 return -CEPHFS_EOPNOTSUPP;
12733 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12734 check_realm = true;
12735 }
12736 }
12737
12738 int ret = _do_setxattr(in, name, value, size, flags, perms);
12739 if (ret >= 0 && check_realm) {
12740 // check if snaprealm was created for quota inode
12741 if (in->quota.is_enable() &&
12742 !(in->snaprealm && in->snaprealm->ino == in->ino))
12743 ret = -CEPHFS_EOPNOTSUPP;
12744 }
12745
12746 return ret;
12747 }
12748
12749 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12750 size_t size, int flags, const UserPerm& perms)
12751 {
12752 if (cct->_conf->client_permissions) {
12753 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12754 if (r < 0)
12755 return r;
12756 }
12757 return _setxattr(in.get(), name, value, size, flags, perms);
12758 }
12759
12760 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12761 {
12762 string tmp;
12763 if (name == "layout") {
12764 string::iterator begin = value.begin();
12765 string::iterator end = value.end();
12766 keys_and_values<string::iterator> p; // create instance of parser
12767 std::map<string, string> m; // map to receive results
12768 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12769 return -CEPHFS_EINVAL;
12770 }
12771 if (begin != end)
12772 return -CEPHFS_EINVAL;
12773 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12774 if (q->first == "pool") {
12775 tmp = q->second;
12776 break;
12777 }
12778 }
12779 } else if (name == "layout.pool") {
12780 tmp = value;
12781 }
12782
12783 if (tmp.length()) {
12784 int64_t pool;
12785 try {
12786 pool = boost::lexical_cast<unsigned>(tmp);
12787 if (!osdmap->have_pg_pool(pool))
12788 return -CEPHFS_ENOENT;
12789 } catch (boost::bad_lexical_cast const&) {
12790 pool = osdmap->lookup_pg_pool_name(tmp);
12791 if (pool < 0) {
12792 return -CEPHFS_ENOENT;
12793 }
12794 }
12795 }
12796
12797 return 0;
12798 }
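/*
 * Illustrative inputs (sketch): name = "layout.pool" with value =
 * "cephfs_data" fails the lexical_cast and is resolved via
 * lookup_pg_pool_name(); value = "3" parses as a pool id and is checked with
 * have_pg_pool(). Unknown pools return -CEPHFS_ENOENT either way.
 */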
12799
12800 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12801 {
12802 // For setting the pool of a layout, the MetaRequest needs an osdmap epoch.
12803 // There is a race where a newly created data pool is not yet known to the client or the MDS.
12804 // Having the client fetch the latest osdmap lets the MDS quickly judge whether it needs a newer one.
12805 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12806 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12807 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12808 string rest(strstr(name, "layout"));
12809 string v((const char*)value, size);
12810 int r = objecter->with_osdmap([&](const OSDMap& o) {
12811 return _setxattr_check_data_pool(rest, v, &o);
12812 });
12813
12814 if (r == -CEPHFS_ENOENT) {
12815 bs::error_code ec;
12816 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12817 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12818 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12819 }
12820 }
12821 }
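/*
 * Example of the race this handles (illustrative): a pool is created with
 * "ceph osd pool create" and immediately named in "ceph.dir.layout.pool";
 * the check above returns -CEPHFS_ENOENT against the cached osdmap, so we
 * block for the latest map once and then let the setxattr proceed.
 */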
12822
12823 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12824 size_t size, int flags, const UserPerm& perms)
12825 {
12826 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12827 if (!mref_reader.is_state_satisfied())
12828 return -CEPHFS_ENOTCONN;
12829
12830 _setxattr_maybe_wait_for_osdmap(name, value, size);
12831
12832 vinodeno_t vino = _get_vino(in);
12833
12834 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12835 tout(cct) << __func__ << std::endl;
12836 tout(cct) << vino.ino.val << std::endl;
12837 tout(cct) << name << std::endl;
12838
12839 std::scoped_lock lock(client_lock);
12840 if (!fuse_default_permissions) {
12841 int r = xattr_permission(in, name, MAY_WRITE, perms);
12842 if (r < 0)
12843 return r;
12844 }
12845 return _setxattr(in, name, value, size, flags, perms);
12846 }
12847
12848 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12849 {
12850 if (in->snapid != CEPH_NOSNAP) {
12851 return -CEPHFS_EROFS;
12852 }
12853
12854 // same xattr namespaces as are supported by the kernel client
12855 if (strncmp(name, "user.", 5) &&
12856 strncmp(name, "system.", 7) &&
12857 strncmp(name, "security.", 9) &&
12858 strncmp(name, "trusted.", 8) &&
12859 strncmp(name, "ceph.", 5))
12860 return -CEPHFS_EOPNOTSUPP;
12861
12862 const VXattr *vxattr = _match_vxattr(in, name);
12863 if (vxattr && vxattr->readonly)
12864 return -CEPHFS_EOPNOTSUPP;
12865
12866 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12867 filepath path;
12868 in->make_nosnap_relative_path(path);
12869 req->set_filepath(path);
12870 req->set_filepath2(name);
12871 req->set_inode(in);
12872
12873 int res = make_request(req, perms);
12874
12875 trim_cache();
12876 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12877 return res;
12878 }
12879
12880 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12881 {
12882 if (cct->_conf->client_permissions) {
12883 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12884 if (r < 0)
12885 return r;
12886 }
12887 return _removexattr(in.get(), name, perms);
12888 }
12889
12890 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12891 {
12892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12893 if (!mref_reader.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN;
12895
12896 vinodeno_t vino = _get_vino(in);
12897
12898 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12899 tout(cct) << "ll_removexattr" << std::endl;
12900 tout(cct) << vino.ino.val << std::endl;
12901 tout(cct) << name << std::endl;
12902
12903 std::scoped_lock lock(client_lock);
12904 if (!fuse_default_permissions) {
12905 int r = xattr_permission(in, name, MAY_WRITE, perms);
12906 if (r < 0)
12907 return r;
12908 }
12909
12910 return _removexattr(in, name, perms);
12911 }
12912
12913 bool Client::_vxattrcb_quota_exists(Inode *in)
12914 {
12915 return in->quota.is_enable() &&
12916 (in->snapid != CEPH_NOSNAP ||
12917 (in->snaprealm && in->snaprealm->ino == in->ino));
12918 }
12919 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12920 {
12921 return snprintf(val, size,
12922 "max_bytes=%lld max_files=%lld",
12923 (long long int)in->quota.max_bytes,
12924 (long long int)in->quota.max_files);
12925 }
12926 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12927 {
12928 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12929 }
12930 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12931 {
12932 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12933 }
12934
12935 bool Client::_vxattrcb_layout_exists(Inode *in)
12936 {
12937 return in->layout != file_layout_t();
12938 }
12939 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12940 {
12941 int r = snprintf(val, size,
12942 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12943 (unsigned long long)in->layout.stripe_unit,
12944 (unsigned long long)in->layout.stripe_count,
12945 (unsigned long long)in->layout.object_size);
12946 objecter->with_osdmap([&](const OSDMap& o) {
12947 if (o.have_pg_pool(in->layout.pool_id))
12948 r += snprintf(val + r, size - r, "%s",
12949 o.get_pool_name(in->layout.pool_id).c_str());
12950 else
12951 r += snprintf(val + r, size - r, "%" PRIu64,
12952 (uint64_t)in->layout.pool_id);
12953 });
12954 if (in->layout.pool_ns.length())
12955 r += snprintf(val + r, size - r, " pool_namespace=%s",
12956 in->layout.pool_ns.c_str());
12957 return r;
12958 }
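// Sample output for a default layout on a pool named "cephfs_data" (pool
// name assumed for illustration; an unknown pool id is printed numerically):
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"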
12959 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12960 {
12961 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12962 }
12963 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12964 {
12965 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12966 }
12967 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12968 {
12969 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12970 }
12971 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12972 {
12973 size_t r;
12974 objecter->with_osdmap([&](const OSDMap& o) {
12975 if (o.have_pg_pool(in->layout.pool_id))
12976 r = snprintf(val, size, "%s", o.get_pool_name(
12977 in->layout.pool_id).c_str());
12978 else
12979 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12980 });
12981 return r;
12982 }
12983 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12984 {
12985 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12986 }
12987 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12988 {
12989 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12990 }
12991 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12992 {
12993 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12994 }
12995 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12996 {
12997 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12998 }
12999 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
13000 {
13001 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
13002 }
13003 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
13004 {
13005 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
13006 }
13007 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
13008 {
13009 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
13010 }
13011 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
13012 {
13013 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
13014 }
13015 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
13016 {
13017 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
13018 }
13019 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
13020 {
13021 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
13022 (long)in->rstat.rctime.nsec());
13023 }
13024 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13025 {
13026 return in->dir_pin != -CEPHFS_ENODATA;
13027 }
13028 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13029 {
13030 return snprintf(val, size, "%ld", (long)in->dir_pin);
13031 }
13032
13033 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13034 {
13035 return !in->snap_btime.is_zero();
13036 }
13037
13038 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13039 {
13040 return snprintf(val, size, "%llu.%09lu",
13041 (long long unsigned)in->snap_btime.sec(),
13042 (long unsigned)in->snap_btime.nsec());
13043 }
13044
13045 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13046 {
13047 int issued;
13048
13049 in->caps_issued(&issued);
13050 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13051 }
13052
13053 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13054 {
13055 // checking one of the xattrs would suffice
13056 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13057 }
13058
13059 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
13060 {
13061 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
13062 in->xattrs["ceph.mirror.info.cluster_id"].length(),
13063 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
13064 in->xattrs["ceph.mirror.info.fs_id"].length(),
13065 in->xattrs["ceph.mirror.info.fs_id"].c_str());
13066 }
13067
13068 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13069 {
13070 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13071 }
13072
13073 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13074 {
13075 auto name = messenger->get_myname();
13076 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
13077 }
13078
13079 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13080 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13081
13082 #define XATTR_NAME_CEPH(_type, _name, _flags) \
13083 { \
13084 name: CEPH_XATTR_NAME(_type, _name), \
13085 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13086 readonly: true, \
13087 exists_cb: NULL, \
13088 flags: _flags, \
13089 }
13090 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13091 { \
13092 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13093 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13094 readonly: false, \
13095 exists_cb: &Client::_vxattrcb_layout_exists, \
13096 flags: 0, \
13097 }
13098 #define XATTR_QUOTA_FIELD(_type, _name) \
13099 { \
13100 name: CEPH_XATTR_NAME(_type, _name), \
13101 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13102 readonly: false, \
13103 exists_cb: &Client::_vxattrcb_quota_exists, \
13104 flags: 0, \
13105 }
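// For illustration, XATTR_QUOTA_FIELD(quota, max_bytes) expands to:
//
//   {
//     name: "ceph.quota.max_bytes",
//     getxattr_cb: &Client::_vxattrcb_quota_max_bytes,
//     readonly: false,
//     exists_cb: &Client::_vxattrcb_quota_exists,
//     flags: 0,
//   }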
13106
13107 const Client::VXattr Client::_dir_vxattrs[] = {
13108 {
13109 name: "ceph.dir.layout",
13110 getxattr_cb: &Client::_vxattrcb_layout,
13111 readonly: false,
13112 exists_cb: &Client::_vxattrcb_layout_exists,
13113 flags: 0,
13114 },
13115 // FIXME
13116 // Delete the following dir layout field definitions for release "S"
13117 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
13118 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
13119 XATTR_LAYOUT_FIELD(dir, layout, object_size),
13120 XATTR_LAYOUT_FIELD(dir, layout, pool),
13121 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
13122 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
13123 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
13124 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
13125 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
13126 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
13127 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
13128 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
13129 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
13130 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
13131 {
13132 name: "ceph.quota",
13133 getxattr_cb: &Client::_vxattrcb_quota,
13134 readonly: false,
13135 exists_cb: &Client::_vxattrcb_quota_exists,
13136 flags: 0,
13137 },
13138 XATTR_QUOTA_FIELD(quota, max_bytes),
13139 XATTR_QUOTA_FIELD(quota, max_files),
13140 // FIXME
13141 // Delete the following dir pin field definitions for release "S"
13142 {
13143 name: "ceph.dir.pin",
13144 getxattr_cb: &Client::_vxattrcb_dir_pin,
13145 readonly: false,
13146 exists_cb: &Client::_vxattrcb_dir_pin_exists,
13147 flags: 0,
13148 },
13149 {
13150 name: "ceph.snap.btime",
13151 getxattr_cb: &Client::_vxattrcb_snap_btime,
13152 readonly: true,
13153 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13154 flags: 0,
13155 },
13156 {
13157 name: "ceph.mirror.info",
13158 getxattr_cb: &Client::_vxattrcb_mirror_info,
13159 readonly: false,
13160 exists_cb: &Client::_vxattrcb_mirror_info_exists,
13161 flags: 0,
13162 },
13163 {
13164 name: "ceph.caps",
13165 getxattr_cb: &Client::_vxattrcb_caps,
13166 readonly: true,
13167 exists_cb: NULL,
13168 flags: 0,
13169 },
13170 { name: "" } /* Required table terminator */
13171 };
13172
13173 const Client::VXattr Client::_file_vxattrs[] = {
13174 {
13175 name: "ceph.file.layout",
13176 getxattr_cb: &Client::_vxattrcb_layout,
13177 readonly: false,
13178 exists_cb: &Client::_vxattrcb_layout_exists,
13179 flags: 0,
13180 },
13181 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
13182 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
13183 XATTR_LAYOUT_FIELD(file, layout, object_size),
13184 XATTR_LAYOUT_FIELD(file, layout, pool),
13185 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
13186 {
13187 name: "ceph.snap.btime",
13188 getxattr_cb: &Client::_vxattrcb_snap_btime,
13189 readonly: true,
13190 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13191 flags: 0,
13192 },
13193 {
13194 name: "ceph.caps",
13195 getxattr_cb: &Client::_vxattrcb_caps,
13196 readonly: true,
13197 exists_cb: NULL,
13198 flags: 0,
13199 },
13200 { name: "" } /* Required table terminator */
13201 };
13202
13203 const Client::VXattr Client::_common_vxattrs[] = {
13204 {
13205 name: "ceph.cluster_fsid",
13206 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
13207 readonly: true,
13208 exists_cb: nullptr,
13209 flags: 0,
13210 },
13211 {
13212 name: "ceph.client_id",
13213 getxattr_cb: &Client::_vxattrcb_client_id,
13214 readonly: true,
13215 exists_cb: nullptr,
13216 flags: 0,
13217 },
13218 { name: "" } /* Required table terminator */
13219 };
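// These virtual xattrs are visible to ordinary xattr tooling; on a mounted
// filesystem they would typically be queried with something like
// (mount point paths illustrative):
//
//   getfattr -n ceph.dir.rbytes /mnt/cephfs/some/dir
//   getfattr -n ceph.cluster_fsid /mnt/cephfs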
13220
13221 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13222 {
13223 if (in->is_dir())
13224 return _dir_vxattrs;
13225 else if (in->is_file())
13226 return _file_vxattrs;
13227 return NULL;
13228 }
13229
13230 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13231 {
13232 if (strncmp(name, "ceph.", 5) == 0) {
13233 const VXattr *vxattr = _get_vxattrs(in);
13234 if (vxattr) {
13235 while (!vxattr->name.empty()) {
13236 if (vxattr->name == name)
13237 return vxattr;
13238 vxattr++;
13239 }
13240 }
13241
13242 // for common vxattrs
13243 vxattr = _common_vxattrs;
13244 while (!vxattr->name.empty()) {
13245 if (vxattr->name == name)
13246 return vxattr;
13247 vxattr++;
13248 }
13249 }
13250
13251 return NULL;
13252 }
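// Typical caller pattern (cf. _removexattr() above): look the name up and
// honor the readonly flag before issuing an MDS request, e.g.
//
//   const VXattr *vxattr = _match_vxattr(in, "ceph.dir.pin");
//   if (vxattr && vxattr->readonly)
//     return -CEPHFS_EOPNOTSUPP;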
13253
13254 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
13255 {
13256 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13257 if (!mref_reader.is_state_satisfied())
13258 return -CEPHFS_ENOTCONN;
13259
13260 vinodeno_t vino = _get_vino(in);
13261
13262 ldout(cct, 3) << "ll_readlink " << vino << dendl;
13263 tout(cct) << "ll_readlink" << std::endl;
13264 tout(cct) << vino.ino.val << std::endl;
13265
13266 std::scoped_lock lock(client_lock);
13267 for (auto dn : in->dentries) {
13268 touch_dn(dn);
13269 }
13270
13271 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
13272 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
13273 return r;
13274 }
13275
13276 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
13277 const UserPerm& perms, InodeRef *inp)
13278 {
13279 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
13280 << mode << dec << ", " << rdev << ", uid " << perms.uid()
13281 << ", gid " << perms.gid() << ")" << dendl;
13282
13283 if (strlen(name) > NAME_MAX)
13284 return -CEPHFS_ENAMETOOLONG;
13285
13286 if (dir->snapid != CEPH_NOSNAP) {
13287 return -CEPHFS_EROFS;
13288 }
13289 if (is_quota_files_exceeded(dir, perms)) {
13290 return -CEPHFS_EDQUOT;
13291 }
13292
13293 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
13294
13295 filepath path;
13296 dir->make_nosnap_relative_path(path);
13297 path.push_dentry(name);
13298 req->set_filepath(path);
13299 req->set_inode(dir);
13300 req->head.args.mknod.rdev = rdev;
13301 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13302 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13303
13304 bufferlist xattrs_bl;
13305 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13306 if (res < 0)
13307 goto fail;
13308 req->head.args.mknod.mode = mode;
13309 if (xattrs_bl.length() > 0)
13310 req->set_data(xattrs_bl);
13311
13312 Dentry *de;
13313 res = get_or_create(dir, name, &de);
13314 if (res < 0)
13315 goto fail;
13316 req->set_dentry(de);
13317
13318 res = make_request(req, perms, inp);
13319
13320 trim_cache();
13321
13322 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13323 return res;
13324
13325 fail:
13326 put_request(req);
13327 return res;
13328 }
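// _mknod() follows the same request pattern used by the other namespace
// operations below: build a MetaRequest for the op, attach the
// parent-relative filepath and a Dentry from get_or_create(), send it with
// make_request(), then trim_cache(). Roughly:
//
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_...);
//   req->set_filepath(path);   // parent dir + new dentry name
//   req->set_dentry(de);       // from get_or_create(dir, name, &de)
//   res = make_request(req, perms, inp);
//   trim_cache();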
13329
13330 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13331 dev_t rdev, struct stat *attr, Inode **out,
13332 const UserPerm& perms)
13333 {
13334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13335 if (!mref_reader.is_state_satisfied())
13336 return -CEPHFS_ENOTCONN;
13337
13338 vinodeno_t vparent = _get_vino(parent);
13339
13340 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13341 tout(cct) << "ll_mknod" << std::endl;
13342 tout(cct) << vparent.ino.val << std::endl;
13343 tout(cct) << name << std::endl;
13344 tout(cct) << mode << std::endl;
13345 tout(cct) << rdev << std::endl;
13346
13347 std::scoped_lock lock(client_lock);
13348 if (!fuse_default_permissions) {
13349 int r = may_create(parent, perms);
13350 if (r < 0)
13351 return r;
13352 }
13353
13354 InodeRef in;
13355 int r = _mknod(parent, name, mode, rdev, perms, &in);
13356 if (r == 0) {
13357 fill_stat(in, attr);
13358 _ll_get(in.get());
13359 }
13360 tout(cct) << attr->st_ino << std::endl;
13361 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13362 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13363 *out = in.get();
13364 return r;
13365 }
13366
13367 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13368 dev_t rdev, Inode **out,
13369 struct ceph_statx *stx, unsigned want, unsigned flags,
13370 const UserPerm& perms)
13371 {
13372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13373 if (!mref_reader.is_state_satisfied())
13374 return -CEPHFS_ENOTCONN;
13375
13376 unsigned caps = statx_to_mask(flags, want);
13377
13378 vinodeno_t vparent = _get_vino(parent);
13379
13380 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13381 tout(cct) << "ll_mknodx" << std::endl;
13382 tout(cct) << vparent.ino.val << std::endl;
13383 tout(cct) << name << std::endl;
13384 tout(cct) << mode << std::endl;
13385 tout(cct) << rdev << std::endl;
13386
13387 std::scoped_lock lock(client_lock);
13388
13389 if (!fuse_default_permissions) {
13390 int r = may_create(parent, perms);
13391 if (r < 0)
13392 return r;
13393 }
13394
13395 InodeRef in;
13396 int r = _mknod(parent, name, mode, rdev, perms, &in);
13397 if (r == 0) {
13398 fill_statx(in, caps, stx);
13399 _ll_get(in.get());
13400 }
13401 tout(cct) << stx->stx_ino << std::endl;
13402 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13403 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13404 *out = in.get();
13405 return r;
13406 }
13407
13408 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
13409 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
13410 int object_size, const char *data_pool, bool *created,
13411 const UserPerm& perms, std::string alternate_name)
13412 {
13413 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
13414 mode << dec << ")" << dendl;
13415
13416 if (strlen(name) > NAME_MAX)
13417 return -CEPHFS_ENAMETOOLONG;
13418 if (dir->snapid != CEPH_NOSNAP) {
13419 return -CEPHFS_EROFS;
13420 }
13421 if (is_quota_files_exceeded(dir, perms)) {
13422 return -CEPHFS_EDQUOT;
13423 }
13424
13425 // use normalized flags to generate cmode
13426 int cflags = ceph_flags_sys2wire(flags);
13427 if (cct->_conf.get_val<bool>("client_force_lazyio"))
13428 cflags |= CEPH_O_LAZY;
13429
13430 int cmode = ceph_flags_to_mode(cflags);
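  // ceph_flags_sys2wire() converts host open(2) flags to the wire encoding,
  // and ceph_flags_to_mode() reduces them to a cap-oriented file mode; e.g.
  // an O_RDWR open normally yields CEPH_FILE_MODE_RDWR, which is what
  // get_open_ref()/_create_fh() below count open references against.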
13431
13432 int64_t pool_id = -1;
13433 if (data_pool && *data_pool) {
13434 pool_id = objecter->with_osdmap(
13435 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
13436 if (pool_id < 0)
13437 return -CEPHFS_EINVAL;
13438 if (pool_id > 0xffffffffll)
13439 return -CEPHFS_ERANGE; // bummer!
13440 }
13441
13442 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
13443
13444 filepath path;
13445 dir->make_nosnap_relative_path(path);
13446 path.push_dentry(name);
13447 req->set_filepath(path);
13448 req->set_alternate_name(std::move(alternate_name));
13449 req->set_inode(dir);
13450 req->head.args.open.flags = cflags | CEPH_O_CREAT;
13451
13452 req->head.args.open.stripe_unit = stripe_unit;
13453 req->head.args.open.stripe_count = stripe_count;
13454 req->head.args.open.object_size = object_size;
13455 if (cct->_conf->client_debug_getattr_caps)
13456 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
13457 else
13458 req->head.args.open.mask = 0;
13459 req->head.args.open.pool = pool_id;
13460 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13461 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13462
13463 mode |= S_IFREG;
13464 bufferlist xattrs_bl;
13465 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13466 if (res < 0)
13467 goto fail;
13468 req->head.args.open.mode = mode;
13469 if (xattrs_bl.length() > 0)
13470 req->set_data(xattrs_bl);
13471
13472 Dentry *de;
13473 res = get_or_create(dir, name, &de);
13474 if (res < 0)
13475 goto fail;
13476 req->set_dentry(de);
13477
13478 res = make_request(req, perms, inp, created);
13479 if (res < 0) {
13480 goto reply_error;
13481 }
13482
13483 /* If the caller passed a value in fhp, do the open */
13484 if (fhp) {
13485 (*inp)->get_open_ref(cmode);
13486 *fhp = _create_fh(inp->get(), flags, cmode, perms);
13487 }
13488
13489 reply_error:
13490 trim_cache();
13491
13492 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
13493 << " layout " << stripe_unit
13494 << ' ' << stripe_count
13495 << ' ' << object_size
13496 <<") = " << res << dendl;
13497 return res;
13498
13499 fail:
13500 put_request(req);
13501 return res;
13502 }
13503
13504 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
13505 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13506 std::string alternate_name)
13507 {
13508 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
13509 << mode << dec << ", uid " << perm.uid()
13510 << ", gid " << perm.gid() << ")" << dendl;
13511
13512 if (strlen(name) > NAME_MAX)
13513 return -CEPHFS_ENAMETOOLONG;
13514
13515 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13516 return -CEPHFS_EROFS;
13517 }
13518 if (is_quota_files_exceeded(dir, perm)) {
13519 return -CEPHFS_EDQUOT;
13520 }
13521
13522 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13523 MetaRequest *req = new MetaRequest(is_snap_op ?
13524 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
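  // A mkdir inside a directory's ".snap" pseudo-directory is how snapshots
  // are created, so when dir->snapid == CEPH_SNAPDIR this issues a
  // CEPH_MDS_OP_MKSNAP instead; e.g. "mkdir /mnt/cephfs/dir/.snap/mysnap"
  // (path illustrative) ends up here as a snap op.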
13525
13526 filepath path;
13527 dir->make_nosnap_relative_path(path);
13528 path.push_dentry(name);
13529 req->set_filepath(path);
13530 req->set_inode(dir);
13531 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13532 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13533 req->set_alternate_name(std::move(alternate_name));
13534
13535 mode |= S_IFDIR;
13536 bufferlist bl;
13537 int res = _posix_acl_create(dir, &mode, bl, perm);
13538 if (res < 0)
13539 goto fail;
13540 req->head.args.mkdir.mode = mode;
13541 if (is_snap_op) {
13542 SnapPayload payload;
13543 // clear the bufferlist that may have been populated by the call
13544 // to _posix_acl_create(). MDS mksnap does not make use of it.
13545 // So, reuse it to pass metadata payload.
13546 bl.clear();
13547 payload.metadata = metadata;
13548 encode(payload, bl);
13549 }
13550 if (bl.length() > 0) {
13551 req->set_data(bl);
13552 }
13553
13554 Dentry *de;
13555 res = get_or_create(dir, name, &de);
13556 if (res < 0)
13557 goto fail;
13558 req->set_dentry(de);
13559
13560 ldout(cct, 10) << "_mkdir: making request" << dendl;
13561 res = make_request(req, perm, inp);
13562 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13563
13564 trim_cache();
13565
13566 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13567 return res;
13568
13569 fail:
13570 put_request(req);
13571 return res;
13572 }
13573
13574 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13575 struct stat *attr, Inode **out, const UserPerm& perm)
13576 {
13577 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13578 if (!mref_reader.is_state_satisfied())
13579 return -CEPHFS_ENOTCONN;
13580
13581 vinodeno_t vparent = _get_vino(parent);
13582
13583 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13584 tout(cct) << "ll_mkdir" << std::endl;
13585 tout(cct) << vparent.ino.val << std::endl;
13586 tout(cct) << name << std::endl;
13587 tout(cct) << mode << std::endl;
13588
13589 std::scoped_lock lock(client_lock);
13590
13591 if (!fuse_default_permissions) {
13592 int r = may_create(parent, perm);
13593 if (r < 0)
13594 return r;
13595 }
13596
13597 InodeRef in;
13598 int r = _mkdir(parent, name, mode, perm, &in);
13599 if (r == 0) {
13600 fill_stat(in, attr);
13601 _ll_get(in.get());
13602 }
13603 tout(cct) << attr->st_ino << std::endl;
13604 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13605 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13606 *out = in.get();
13607 return r;
13608 }
13609
13610 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13611 struct ceph_statx *stx, unsigned want, unsigned flags,
13612 const UserPerm& perms)
13613 {
13614 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13615 if (!mref_reader.is_state_satisfied())
13616 return -CEPHFS_ENOTCONN;
13617
13618 vinodeno_t vparent = _get_vino(parent);
13619
13620 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13621 tout(cct) << "ll_mkdirx" << std::endl;
13622 tout(cct) << vparent.ino.val << std::endl;
13623 tout(cct) << name << std::endl;
13624 tout(cct) << mode << std::endl;
13625
13626 std::scoped_lock lock(client_lock);
13627
13628 if (!fuse_default_permissions) {
13629 int r = may_create(parent, perms);
13630 if (r < 0)
13631 return r;
13632 }
13633
13634 InodeRef in;
13635 int r = _mkdir(parent, name, mode, perms, &in);
13636 if (r == 0) {
13637 fill_statx(in, statx_to_mask(flags, want), stx);
13638 _ll_get(in.get());
13639 } else {
13640 stx->stx_ino = 0;
13641 stx->stx_mask = 0;
13642 }
13643 tout(cct) << stx->stx_ino << std::endl;
13644 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13645 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13646 *out = in.get();
13647 return r;
13648 }
13649
13650 int Client::_symlink(Inode *dir, const char *name, const char *target,
13651 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13652 {
13653 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13654 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13655 << dendl;
13656
13657 if (strlen(name) > NAME_MAX)
13658 return -CEPHFS_ENAMETOOLONG;
13659
13660 if (dir->snapid != CEPH_NOSNAP) {
13661 return -CEPHFS_EROFS;
13662 }
13663 if (is_quota_files_exceeded(dir, perms)) {
13664 return -CEPHFS_EDQUOT;
13665 }
13666
13667 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13668
13669 filepath path;
13670 dir->make_nosnap_relative_path(path);
13671 path.push_dentry(name);
13672 req->set_filepath(path);
13673 req->set_alternate_name(std::move(alternate_name));
13674 req->set_inode(dir);
13675 req->set_string2(target);
13676 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13677 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13678
13679 Dentry *de;
13680 int res = get_or_create(dir, name, &de);
13681 if (res < 0)
13682 goto fail;
13683 req->set_dentry(de);
13684
13685 res = make_request(req, perms, inp);
13686
13687 trim_cache();
13688 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13689 res << dendl;
13690 return res;
13691
13692 fail:
13693 put_request(req);
13694 return res;
13695 }
13696
13697 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13698 struct stat *attr, Inode **out, const UserPerm& perms)
13699 {
13700 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13701 if (!mref_reader.is_state_satisfied())
13702 return -CEPHFS_ENOTCONN;
13703
13704 vinodeno_t vparent = _get_vino(parent);
13705
13706 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13707 << dendl;
13708 tout(cct) << "ll_symlink" << std::endl;
13709 tout(cct) << vparent.ino.val << std::endl;
13710 tout(cct) << name << std::endl;
13711 tout(cct) << value << std::endl;
13712
13713 std::scoped_lock lock(client_lock);
13714
13715 if (!fuse_default_permissions) {
13716 int r = may_create(parent, perms);
13717 if (r < 0)
13718 return r;
13719 }
13720
13721 InodeRef in;
13722 int r = _symlink(parent, name, value, perms, "", &in);
13723 if (r == 0) {
13724 fill_stat(in, attr);
13725 _ll_get(in.get());
13726 }
13727 tout(cct) << attr->st_ino << std::endl;
13728 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13729 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13730 *out = in.get();
13731 return r;
13732 }
13733
13734 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13735 Inode **out, struct ceph_statx *stx, unsigned want,
13736 unsigned flags, const UserPerm& perms)
13737 {
13738 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13739 if (!mref_reader.is_state_satisfied())
13740 return -CEPHFS_ENOTCONN;
13741
13742 vinodeno_t vparent = _get_vino(parent);
13743
13744 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13745 << dendl;
13746 tout(cct) << "ll_symlinkx" << std::endl;
13747 tout(cct) << vparent.ino.val << std::endl;
13748 tout(cct) << name << std::endl;
13749 tout(cct) << value << std::endl;
13750
13751 std::scoped_lock lock(client_lock);
13752
13753 if (!fuse_default_permissions) {
13754 int r = may_create(parent, perms);
13755 if (r < 0)
13756 return r;
13757 }
13758
13759 InodeRef in;
13760 int r = _symlink(parent, name, value, perms, "", &in);
13761 if (r == 0) {
13762 fill_statx(in, statx_to_mask(flags, want), stx);
13763 _ll_get(in.get());
13764 }
13765 tout(cct) << stx->stx_ino << std::endl;
13766 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13767 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13768 *out = in.get();
13769 return r;
13770 }
13771
13772 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13773 {
13774 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13775 << " uid " << perm.uid() << " gid " << perm.gid()
13776 << ")" << dendl;
13777
13778 if (dir->snapid != CEPH_NOSNAP) {
13779 return -CEPHFS_EROFS;
13780 }
13781
13782 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13783
13784 filepath path;
13785 dir->make_nosnap_relative_path(path);
13786 path.push_dentry(name);
13787 req->set_filepath(path);
13788
13789 InodeRef otherin;
13790 Inode *in;
13791 Dentry *de;
13792
13793 int res = get_or_create(dir, name, &de);
13794 if (res < 0)
13795 goto fail;
13796 req->set_dentry(de);
13797 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13798 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13799
13800 res = _lookup(dir, name, 0, &otherin, perm);
13801 if (res < 0)
13802 goto fail;
13803
13804 in = otherin.get();
13805 req->set_other_inode(in);
13806 in->break_all_delegs();
13807 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13808
13809 req->set_inode(dir);
13810
13811 res = make_request(req, perm);
13812
13813 trim_cache();
13814 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13815 return res;
13816
13817 fail:
13818 put_request(req);
13819 return res;
13820 }
13821
13822 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13823 {
13824 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13825 if (!mref_reader.is_state_satisfied())
13826 return -CEPHFS_ENOTCONN;
13827
13828 vinodeno_t vino = _get_vino(in);
13829
13830 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13831 tout(cct) << "ll_unlink" << std::endl;
13832 tout(cct) << vino.ino.val << std::endl;
13833 tout(cct) << name << std::endl;
13834
13835 std::scoped_lock lock(client_lock);
13836
13837 if (!fuse_default_permissions) {
13838 int r = may_delete(in, name, perm);
13839 if (r < 0)
13840 return r;
13841 }
13842 return _unlink(in, name, perm);
13843 }
13844
13845 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13846 {
13847 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13848 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13849
13850 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13851 return -CEPHFS_EROFS;
13852 }
13853
13854 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13855 MetaRequest *req = new MetaRequest(op);
13856 filepath path;
13857 dir->make_nosnap_relative_path(path);
13858 path.push_dentry(name);
13859 req->set_filepath(path);
13860 req->set_inode(dir);
13861
13862 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13863 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13864 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13865
13866 InodeRef in;
13867
13868 Dentry *de;
13869 int res = get_or_create(dir, name, &de);
13870 if (res < 0)
13871 goto fail;
13872 if (op == CEPH_MDS_OP_RMDIR)
13873 req->set_dentry(de);
13874 else
13875 de->get();
13876
13877 res = _lookup(dir, name, 0, &in, perms);
13878 if (res < 0)
13879 goto fail;
13880
13881 if (op == CEPH_MDS_OP_RMSNAP) {
13882 unlink(de, true, true);
13883 de->put();
13884 }
13885 req->set_other_inode(in.get());
13886
13887 res = make_request(req, perms);
13888
13889 trim_cache();
13890 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13891 return res;
13892
13893 fail:
13894 put_request(req);
13895 return res;
13896 }
13897
13898 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13899 {
13900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13901 if (!mref_reader.is_state_satisfied())
13902 return -CEPHFS_ENOTCONN;
13903
13904 vinodeno_t vino = _get_vino(in);
13905
13906 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13907 tout(cct) << "ll_rmdir" << std::endl;
13908 tout(cct) << vino.ino.val << std::endl;
13909 tout(cct) << name << std::endl;
13910
13911 std::scoped_lock lock(client_lock);
13912
13913 if (!fuse_default_permissions) {
13914 int r = may_delete(in, name, perms);
13915 if (r < 0)
13916 return r;
13917 }
13918
13919 return _rmdir(in, name, perms);
13920 }
13921
13922 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13923 {
13924 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13925 << todir->ino << " " << toname
13926 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13927 << dendl;
13928
13929 if (fromdir->snapid != todir->snapid)
13930 return -CEPHFS_EXDEV;
13931
13932 int op = CEPH_MDS_OP_RENAME;
13933 if (fromdir->snapid != CEPH_NOSNAP) {
13934 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13935 op = CEPH_MDS_OP_RENAMESNAP;
13936 else
13937 return -CEPHFS_EROFS;
13938 }
13939 if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
13940 Inode *fromdir_root =
13941 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13942 Inode *todir_root =
13943 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13944 if (fromdir_root != todir_root) {
13945 return -CEPHFS_EXDEV;
13946 }
13947 }
13948
13949 InodeRef target;
13950 MetaRequest *req = new MetaRequest(op);
13951
13952 filepath from;
13953 fromdir->make_nosnap_relative_path(from);
13954 from.push_dentry(fromname);
13955 filepath to;
13956 todir->make_nosnap_relative_path(to);
13957 to.push_dentry(toname);
13958 req->set_filepath(to);
13959 req->set_filepath2(from);
13960 req->set_alternate_name(std::move(alternate_name));
13961
13962 Dentry *oldde;
13963 int res = get_or_create(fromdir, fromname, &oldde);
13964 if (res < 0)
13965 goto fail;
13966 Dentry *de;
13967 res = get_or_create(todir, toname, &de);
13968 if (res < 0)
13969 goto fail;
13970
13971 if (op == CEPH_MDS_OP_RENAME) {
13972 req->set_old_dentry(oldde);
13973 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13974 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13975
13976 req->set_dentry(de);
13977 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13978 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13979
13980 InodeRef oldin, otherin;
13981 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13982 if (res < 0)
13983 goto fail;
13984
13985 Inode *oldinode = oldin.get();
13986 oldinode->break_all_delegs();
13987 req->set_old_inode(oldinode);
13988 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13989
13990 res = _lookup(todir, toname, 0, &otherin, perm);
13991 switch (res) {
13992 case 0:
13993 {
13994 Inode *in = otherin.get();
13995 req->set_other_inode(in);
13996 in->break_all_delegs();
13997 }
13998 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13999 break;
14000 case -CEPHFS_ENOENT:
14001 break;
14002 default:
14003 goto fail;
14004 }
14005
14006 req->set_inode(todir);
14007 } else {
14008 // renamesnap reply contains no tracedn, so we need to invalidate
14009 // dentry manually
14010 unlink(oldde, true, true);
14011 unlink(de, true, true);
14012
14013 req->set_inode(todir);
14014 }
14015
14016 res = make_request(req, perm, &target);
14017 ldout(cct, 10) << "rename result is " << res << dendl;
14018
14019 // the renamed item is updated in our cache via the reply trace
14020
14021 trim_cache();
14022 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
14023 return res;
14024
14025 fail:
14026 put_request(req);
14027 return res;
14028 }
14029
14030 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14031 const char *newname, const UserPerm& perm)
14032 {
14033 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14034 if (!mref_reader.is_state_satisfied())
14035 return -CEPHFS_ENOTCONN;
14036
14037 vinodeno_t vparent = _get_vino(parent);
14038 vinodeno_t vnewparent = _get_vino(newparent);
14039
14040 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14041 << vnewparent << " " << newname << dendl;
14042 tout(cct) << "ll_rename" << std::endl;
14043 tout(cct) << vparent.ino.val << std::endl;
14044 tout(cct) << name << std::endl;
14045 tout(cct) << vnewparent.ino.val << std::endl;
14046 tout(cct) << newname << std::endl;
14047
14048 std::scoped_lock lock(client_lock);
14049
14050 if (!fuse_default_permissions) {
14051 int r = may_delete(parent, name, perm);
14052 if (r < 0)
14053 return r;
14054 r = may_delete(newparent, newname, perm);
14055 if (r < 0 && r != -CEPHFS_ENOENT)
14056 return r;
14057 }
14058
14059 return _rename(parent, name, newparent, newname, perm, "");
14060 }
14061
14062 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
14063 {
14064 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
14065 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
14066
14067 if (strlen(newname) > NAME_MAX)
14068 return -CEPHFS_ENAMETOOLONG;
14069
14070 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
14071 return -CEPHFS_EROFS;
14072 }
14073 if (is_quota_files_exceeded(dir, perm)) {
14074 return -CEPHFS_EDQUOT;
14075 }
14076
14077 in->break_all_delegs();
14078 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
14079
14080 filepath path(newname, dir->ino);
14081 req->set_filepath(path);
14082 req->set_alternate_name(std::move(alternate_name));
14083 filepath existing(in->ino);
14084 req->set_filepath2(existing);
14085
14086 req->set_inode(dir);
14087 req->inode_drop = CEPH_CAP_FILE_SHARED;
14088 req->inode_unless = CEPH_CAP_FILE_EXCL;
14089
14090 Dentry *de;
14091 int res = get_or_create(dir, newname, &de);
14092 if (res < 0)
14093 goto fail;
14094 req->set_dentry(de);
14095
14096 res = make_request(req, perm, inp);
14097 ldout(cct, 10) << "link result is " << res << dendl;
14098
14099 trim_cache();
14100 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
14101 return res;
14102
14103 fail:
14104 put_request(req);
14105 return res;
14106 }
14107
14108 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14109 const UserPerm& perm)
14110 {
14111 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14112 if (!mref_reader.is_state_satisfied())
14113 return -CEPHFS_ENOTCONN;
14114
14115 vinodeno_t vino = _get_vino(in);
14116 vinodeno_t vnewparent = _get_vino(newparent);
14117
14118 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
14119 newname << dendl;
14120 tout(cct) << "ll_link" << std::endl;
14121 tout(cct) << vino.ino.val << std::endl;
14122 tout(cct) << vnewparent << std::endl;
14123 tout(cct) << newname << std::endl;
14124
14125 InodeRef target;
14126
14127 std::scoped_lock lock(client_lock);
14128
14129 if (!fuse_default_permissions) {
14130 if (S_ISDIR(in->mode))
14131 return -CEPHFS_EPERM;
14132
14133 int r = may_hardlink(in, perm);
14134 if (r < 0)
14135 return r;
14136
14137 r = may_create(newparent, perm);
14138 if (r < 0)
14139 return r;
14140 }
14141
14142 return _link(in, newparent, newname, perm, "", &target);
14143 }
14144
14145 int Client::ll_num_osds(void)
14146 {
14147 std::scoped_lock lock(client_lock);
14148 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14149 }
14150
14151 int Client::ll_osdaddr(int osd, uint32_t *addr)
14152 {
14153 std::scoped_lock lock(client_lock);
14154
14155 entity_addr_t g;
14156 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14157 if (!o.exists(osd))
14158 return false;
14159 g = o.get_addrs(osd).front();
14160 return true;
14161 });
14162 if (!exists)
14163 return -1;
14164 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14165 *addr = ntohl(nb_addr);
14166 return 0;
14167 }
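// ll_osdaddr() returns the OSD's IPv4 address in *host* byte order; a caller
// wanting a dotted quad would convert back, e.g. (sketch, needs
// <arpa/inet.h>):
//
//   uint32_t addr;
//   if (client->ll_osdaddr(osd, &addr) == 0) {
//     struct in_addr a;
//     a.s_addr = htonl(addr);   // back to network byte order
//     printf("osd.%d at %s\n", osd, inet_ntoa(a));
//   }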
14168
14169 uint32_t Client::ll_stripe_unit(Inode *in)
14170 {
14171 std::scoped_lock lock(client_lock);
14172 return in->layout.stripe_unit;
14173 }
14174
14175 uint64_t Client::ll_snap_seq(Inode *in)
14176 {
14177 std::scoped_lock lock(client_lock);
14178 return in->snaprealm->seq;
14179 }
14180
14181 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14182 {
14183 std::scoped_lock lock(client_lock);
14184 *layout = in->layout;
14185 return 0;
14186 }
14187
14188 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14189 {
14190 return ll_file_layout(fh->inode.get(), layout);
14191 }
14192
14193 /* Currently we cannot take advantage of redundancy in reads, since we
14194 would have to go through all possible placement groups (a
14195 potentially quite large number determined by a hash), and use CRUSH
14196 to calculate the appropriate set of OSDs for each placement group,
14197 then index into that. An array with one entry per OSD is much more
14198 tractable and works for demonstration purposes. */
14199
14200 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
14201 file_layout_t* layout)
14202 {
14203 std::scoped_lock lock(client_lock);
14204
14205 inodeno_t ino = in->ino;
14206 uint32_t object_size = layout->object_size;
14207 uint32_t su = layout->stripe_unit;
14208 uint32_t stripe_count = layout->stripe_count;
14209 uint64_t stripes_per_object = object_size / su;
14210 uint64_t stripeno = 0, stripepos = 0;
14211
14212 if (stripe_count) {
14213 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
14214 stripepos = blockno % stripe_count; // which object in the object set (X)
14215 }
14216 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
14217 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
14218
14219 object_t oid = file_object_t(ino, objectno);
14220 return objecter->with_osdmap([&](const OSDMap& o) {
14221 ceph_object_layout olayout =
14222 o.file_to_object_layout(oid, *layout);
14223 pg_t pg = (pg_t)olayout.ol_pgid;
14224 vector<int> osds;
14225 int primary;
14226 o.pg_to_acting_osds(pg, &osds, &primary);
14227 return primary;
14228 });
14229 }
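// Worked example (values assumed): object_size = 4 MiB, stripe_unit = 1 MiB,
// stripe_count = 3 gives stripes_per_object = 4. For blockno = 10:
//   stripeno    = 10 / 3 = 3   (horizontal stripe)
//   stripepos   = 10 % 3 = 1   (object within the stripe)
//   objectsetno = 3 / 4  = 0
//   objectno    = 0 * 3 + 1 = 1
// so block 10 lives in object 1 of the file, and the primary OSD of that
// object's PG is returned.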
14230
14231 /* Return the offset of the block, internal to the object */
14232
14233 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14234 {
14235 std::scoped_lock lock(client_lock);
14236 file_layout_t *layout = &(in->layout);
14237 uint32_t object_size = layout->object_size;
14238 uint32_t su = layout->stripe_unit;
14239 uint64_t stripes_per_object = object_size / su;
14240
14241 return (blockno % stripes_per_object) * su;
14242 }
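// Continuing the example above: with stripes_per_object = 4 and
// stripe_unit = 1 MiB, blockno = 10 maps to (10 % 4) * 1 MiB = 2 MiB into
// its object.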
14243
14244 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14245 const UserPerm& perms)
14246 {
14247 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14248 if (!mref_reader.is_state_satisfied())
14249 return -CEPHFS_ENOTCONN;
14250
14251 vinodeno_t vino = _get_vino(in);
14252
14253 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14254 tout(cct) << "ll_opendir" << std::endl;
14255 tout(cct) << vino.ino.val << std::endl;
14256
14257 std::scoped_lock lock(client_lock);
14258
14259 if (!fuse_default_permissions) {
14260 int r = may_open(in, flags, perms);
14261 if (r < 0)
14262 return r;
14263 }
14264
14265 int r = _opendir(in, dirpp, perms);
14266 tout(cct) << (uintptr_t)*dirpp << std::endl;
14267
14268 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14269 << dendl;
14270 return r;
14271 }
14272
14273 int Client::ll_releasedir(dir_result_t *dirp)
14274 {
14275 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14276 if (!mref_reader.is_state_satisfied())
14277 return -CEPHFS_ENOTCONN;
14278
14279 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14280 tout(cct) << "ll_releasedir" << std::endl;
14281 tout(cct) << (uintptr_t)dirp << std::endl;
14282
14283 std::scoped_lock lock(client_lock);
14284
14285 _closedir(dirp);
14286 return 0;
14287 }
14288
14289 int Client::ll_fsyncdir(dir_result_t *dirp)
14290 {
14291 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14292 if (!mref_reader.is_state_satisfied())
14293 return -CEPHFS_ENOTCONN;
14294
14295 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14296 tout(cct) << "ll_fsyncdir" << std::endl;
14297 tout(cct) << (uintptr_t)dirp << std::endl;
14298
14299 std::scoped_lock lock(client_lock);
14300 return _fsync(dirp->inode.get(), false);
14301 }
14302
14303 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14304 {
14305 ceph_assert(!(flags & O_CREAT));
14306
14307 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14308 if (!mref_reader.is_state_satisfied())
14309 return -CEPHFS_ENOTCONN;
14310
14311 vinodeno_t vino = _get_vino(in);
14312
14313 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14314 tout(cct) << "ll_open" << std::endl;
14315 tout(cct) << vino.ino.val << std::endl;
14316 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14317
14318 std::scoped_lock lock(client_lock);
14319
14320 int r;
14321 if (!fuse_default_permissions) {
14322 r = may_open(in, flags, perms);
14323 if (r < 0)
14324 goto out;
14325 }
14326
14327 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14328
14329 out:
14330 Fh *fhptr = fhp ? *fhp : NULL;
14331 if (fhptr) {
14332 ll_unclosed_fh_set.insert(fhptr);
14333 }
14334 tout(cct) << (uintptr_t)fhptr << std::endl;
14335 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14336 " = " << r << " (" << fhptr << ")" << dendl;
14337 return r;
14338 }
14339
14340 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14341 int flags, InodeRef *in, int caps, Fh **fhp,
14342 const UserPerm& perms)
14343 {
14344 *fhp = NULL;
14345
14346 vinodeno_t vparent = _get_vino(parent);
14347
14348 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14349 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14350 << ", gid " << perms.gid() << dendl;
14351 tout(cct) << "ll_create" << std::endl;
14352 tout(cct) << vparent.ino.val << std::endl;
14353 tout(cct) << name << std::endl;
14354 tout(cct) << mode << std::endl;
14355 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14356
14357 bool created = false;
14358 int r = _lookup(parent, name, caps, in, perms);
14359
14360 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
14361 return -CEPHFS_EEXIST;
14362
14363 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
14364 if (!fuse_default_permissions) {
14365 r = may_create(parent, perms);
14366 if (r < 0)
14367 goto out;
14368 }
14369 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
14370 perms, "");
14371 if (r < 0)
14372 goto out;
14373 }
14374
14375 if (r < 0)
14376 goto out;
14377
14378 ceph_assert(*in);
14379
14380 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14381 if (!created) {
14382 if (!fuse_default_permissions) {
14383 r = may_open(in->get(), flags, perms);
14384 if (r < 0) {
14385 if (*fhp) {
14386 int release_r = _release_fh(*fhp);
14387 ceph_assert(release_r == 0); // during create, no async data ops should have happened
14388 }
14389 goto out;
14390 }
14391 }
14392 if (*fhp == NULL) {
14393 r = _open(in->get(), flags, mode, fhp, perms);
14394 if (r < 0)
14395 goto out;
14396 }
14397 }
14398
14399 out:
14400 if (*fhp) {
14401 ll_unclosed_fh_set.insert(*fhp);
14402 }
14403
14404 ino_t ino = 0;
14405 if (r >= 0) {
14406 Inode *inode = in->get();
14407 if (use_faked_inos())
14408 ino = inode->faked_ino;
14409 else
14410 ino = inode->ino;
14411 }
14412
14413 tout(cct) << (uintptr_t)*fhp << std::endl;
14414 tout(cct) << ino << std::endl;
14415 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14416 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14417 *fhp << " " << hex << ino << dec << ")" << dendl;
14418
14419 return r;
14420 }
14421
14422 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14423 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14424 const UserPerm& perms)
14425 {
14426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14427 if (!mref_reader.is_state_satisfied())
14428 return -CEPHFS_ENOTCONN;
14429
14430 std::scoped_lock lock(client_lock);
14431 InodeRef in;
14432
14433 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14434 fhp, perms);
14435 if (r >= 0) {
14436 ceph_assert(in);
14437
14438 // passing an Inode in outp requires an additional ref
14439 if (outp) {
14440 _ll_get(in.get());
14441 *outp = in.get();
14442 }
14443 fill_stat(in, attr);
14444 } else {
14445 attr->st_ino = 0;
14446 }
14447
14448 return r;
14449 }
14450
14451 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14452 int oflags, Inode **outp, Fh **fhp,
14453 struct ceph_statx *stx, unsigned want, unsigned lflags,
14454 const UserPerm& perms)
14455 {
14456 unsigned caps = statx_to_mask(lflags, want);
14457 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14458 if (!mref_reader.is_state_satisfied())
14459 return -CEPHFS_ENOTCONN;
14460
14461 std::scoped_lock lock(client_lock);
14462 InodeRef in;
14463
14464 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14465 if (r >= 0) {
14466 ceph_assert(in);
14467
14468 // passing an Inode in outp requires an additional ref
14469 if (outp) {
14470 _ll_get(in.get());
14471 *outp = in.get();
14472 }
14473 fill_statx(in, caps, stx);
14474 } else {
14475 stx->stx_ino = 0;
14476 stx->stx_mask = 0;
14477 }
14478
14479 return r;
14480 }
14481
14482 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14483 {
14484 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14485 if (!mref_reader.is_state_satisfied())
14486 return -CEPHFS_ENOTCONN;
14487
14488 tout(cct) << "ll_lseek" << std::endl;
14489 tout(cct) << offset << std::endl;
14490 tout(cct) << whence << std::endl;
14491
14492 std::scoped_lock lock(client_lock);
14493 return _lseek(fh, offset, whence);
14494 }
14495
14496 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14497 {
14498 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14499 if (!mref_reader.is_state_satisfied())
14500 return -CEPHFS_ENOTCONN;
14501
14502 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
14503 tout(cct) << "ll_read" << std::endl;
14504 tout(cct) << (uintptr_t)fh << std::endl;
14505 tout(cct) << off << std::endl;
14506 tout(cct) << len << std::endl;
14507
14508 /* We can't return a read count larger than INT_MAX, so clamp len to that */
14509 len = std::min(len, (loff_t)INT_MAX);
14510 std::scoped_lock lock(client_lock);
14511
14512 int r = _read(fh, off, len, bl);
14513 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14514 << dendl;
14515 return r;
14516 }
14517
14518 int Client::ll_read_block(Inode *in, uint64_t blockid,
14519 char *buf,
14520 uint64_t offset,
14521 uint64_t length,
14522 file_layout_t* layout)
14523 {
14524 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14525 if (!mref_reader.is_state_satisfied())
14526 return -CEPHFS_ENOTCONN;
14527
14528 vinodeno_t vino = _get_vino(in);
14529 object_t oid = file_object_t(vino.ino, blockid);
14530 C_SaferCond onfinish;
14531 bufferlist bl;
14532
14533 objecter->read(oid,
14534 object_locator_t(layout->pool_id),
14535 offset,
14536 length,
14537 vino.snapid,
14538 &bl,
14539 CEPH_OSD_FLAG_READ,
14540 &onfinish);
14541
14542 int r = onfinish.wait();
14543 if (r >= 0) {
14544 bl.begin().copy(bl.length(), buf);
14545 r = bl.length();
14546 }
14547
14548 return r;
14549 }
14550
14551 /* The OSD doesn't appear to return success unless the entire
14552 buffer was written; return the write length on success. */
14553
14554 int Client::ll_write_block(Inode *in, uint64_t blockid,
14555 char* buf, uint64_t offset,
14556 uint64_t length, file_layout_t* layout,
14557 uint64_t snapseq, uint32_t sync)
14558 {
14559 vinodeno_t vino = ll_get_vino(in);
14560 int r = 0;
14561 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14562
14563 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14564 if (!mref_reader.is_state_satisfied())
14565 return -CEPHFS_ENOTCONN;
14566
14567 if (length == 0) {
14568 return -CEPHFS_EINVAL;
14569 }
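  /* Note: because of the "true ||" below, the sync flag is effectively
   * ignored and every write waits for the OSD commit. */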
14570 if (true || sync) {
14571 /* if write is stable, the epilogue is waiting on
14572 * flock */
14573 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
14574 }
14575 object_t oid = file_object_t(vino.ino, blockid);
14576 SnapContext fakesnap;
14577 ceph::bufferlist bl;
14578 if (length > 0) {
14579 bl.push_back(buffer::copy(buf, length));
14580 }
14581
14582 ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
14583 << dendl;
14584
14585 fakesnap.seq = snapseq;
14586
14587 /* lock just in time */
14588 objecter->write(oid,
14589 object_locator_t(layout->pool_id),
14590 offset,
14591 length,
14592 fakesnap,
14593 bl,
14594 ceph::real_clock::now(),
14595 0,
14596 onsafe.get());
14597
14598 if (nullptr != onsafe) {
14599 r = onsafe->wait();
14600 }
14601
14602 if (r < 0) {
14603 return r;
14604 } else {
14605 return length;
14606 }
14607 }
14608
14609 int Client::ll_commit_blocks(Inode *in,
14610 uint64_t offset,
14611 uint64_t length)
14612 {
14613 /*
14614 BarrierContext *bctx;
14615 vinodeno_t vino = _get_vino(in);
14616 uint64_t ino = vino.ino;
14617
14618 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14619 << offset << " to " << length << dendl;
14620
14621 if (length == 0) {
14622 return -CEPHFS_EINVAL;
14623 }
14624
14625 std::scoped_lock lock(client_lock);
14626 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14627 if (p != barriers.end()) {
14628 barrier_interval civ(offset, offset + length);
14629 p->second->commit_barrier(civ);
14630 }
14631 */
14632 return 0;
14633 }
14634
14635 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14636 {
14637 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14638 "~" << len << dendl;
14639 tout(cct) << "ll_write" << std::endl;
14640 tout(cct) << (uintptr_t)fh << std::endl;
14641 tout(cct) << off << std::endl;
14642 tout(cct) << len << std::endl;
14643
14644 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14645 if (!mref_reader.is_state_satisfied())
14646 return -CEPHFS_ENOTCONN;
14647
14648 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14649 len = std::min(len, (loff_t)INT_MAX);
14650 std::scoped_lock lock(client_lock);
14651
14652 int r = _write(fh, off, len, data, NULL, 0);
14653 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14654 << dendl;
14655 return r;
14656 }
14657
14658 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14659 {
14660 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14661 if (!mref_reader.is_state_satisfied())
14662 return -CEPHFS_ENOTCONN;
14663
14664 std::scoped_lock cl(client_lock);
14665 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
14666 }
14667
14668 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14669 {
14670 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14671 if (!mref_reader.is_state_satisfied())
14672 return -CEPHFS_ENOTCONN;
14673
14674 std::scoped_lock cl(client_lock);
14675 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
14676 }
14677
14678 int Client::ll_flush(Fh *fh)
14679 {
14680 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14681 if (!mref_reader.is_state_satisfied())
14682 return -CEPHFS_ENOTCONN;
14683
14684 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14685 tout(cct) << "ll_flush" << std::endl;
14686 tout(cct) << (uintptr_t)fh << std::endl;
14687
14688 std::scoped_lock lock(client_lock);
14689 return _flush(fh);
14690 }
14691
14692 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14693 {
14694 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14695 if (!mref_reader.is_state_satisfied())
14696 return -CEPHFS_ENOTCONN;
14697
14698 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14699 tout(cct) << "ll_fsync" << std::endl;
14700 tout(cct) << (uintptr_t)fh << std::endl;
14701
14702 std::scoped_lock lock(client_lock);
14703 int r = _fsync(fh, syncdataonly);
14704 if (r) {
14705 // If we're returning an error, clear it from the FH
14706 fh->take_async_err();
14707 }
14708 return r;
14709 }
14710
14711 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14712 {
14713 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14714 if (!mref_reader.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN;
14716
14717 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14718 tout(cct) << "ll_sync_inode" << std::endl;
14719 tout(cct) << (uintptr_t)in << std::endl;
14720
14721 std::scoped_lock lock(client_lock);
14722 return _fsync(in, syncdataonly);
14723 }
14724
14725 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14726 {
14727 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14728
14729 if (offset < 0 || length <= 0)
14730 return -CEPHFS_EINVAL;
14731
14732 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14733 return -CEPHFS_EOPNOTSUPP;
14734
14735 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14736 return -CEPHFS_EOPNOTSUPP;
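  // Net effect of the two checks above: the only modes accepted here are 0,
  // FALLOC_FL_KEEP_SIZE, and FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE.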
14737
14738 Inode *in = fh->inode.get();
14739
14740 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14741 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14742 return -CEPHFS_ENOSPC;
14743 }
14744
14745 if (in->snapid != CEPH_NOSNAP)
14746 return -CEPHFS_EROFS;
14747
14748 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14749 return -CEPHFS_EBADF;
14750
14751 uint64_t size = offset + length;
14752 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14753 size > in->size &&
14754 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14755 return -CEPHFS_EDQUOT;
14756 }
14757
14758 int have;
14759 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14760 if (r < 0)
14761 return r;
14762
14763 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14764 if (mode & FALLOC_FL_PUNCH_HOLE) {
14765 if (in->inline_version < CEPH_INLINE_NONE &&
14766 (have & CEPH_CAP_FILE_BUFFER)) {
14767 bufferlist bl;
14768 auto inline_iter = in->inline_data.cbegin();
14769 int len = in->inline_data.length();
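// Splice a zeroed hole into the inline blob: e.g. with len = 10,
// offset = 2 and length = 4, bl becomes bytes [0,2) of the old data,
// then 4 zero bytes, then bytes [6,10) of the old data.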
14770 if (offset < len) {
14771 if (offset > 0)
14772 inline_iter.copy(offset, bl);
14773 int size = length;
14774 if (offset + size > len)
14775 size = len - offset;
14776 if (size > 0)
14777 bl.append_zero(size);
14778 if (offset + size < len) {
14779 inline_iter += size;
14780 inline_iter.copy(len - offset - size, bl);
14781 }
14782 in->inline_data = bl;
14783 in->inline_version++;
14784 }
14785 in->mtime = in->ctime = ceph_clock_now();
14786 in->change_attr++;
14787 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14788 } else {
14789 if (in->inline_version < CEPH_INLINE_NONE) {
14790 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14791 uninline_data(in, onuninline.get());
14792 }
14793
14794 C_SaferCond onfinish("Client::_punch_hole flock");
14795
14796 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14797
14798 _invalidate_inode_cache(in, offset, length);
14799 filer->zero(in->ino, &in->layout,
14800 in->snaprealm->get_snap_context(),
14801 offset, length,
14802 ceph::real_clock::now(),
14803 0, true, &onfinish);
14804 in->mtime = in->ctime = ceph_clock_now();
14805 in->change_attr++;
14806 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14807
14808 client_lock.unlock();
14809 onfinish.wait();
14810 client_lock.lock();
14811 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14812 }
14813 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14814 uint64_t size = offset + length;
14815 if (size > in->size) {
14816 in->size = size;
14817 in->mtime = in->ctime = ceph_clock_now();
14818 in->change_attr++;
14819 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14820
14821 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14822 check_caps(in, CHECK_CAPS_NODELAY);
14823 } else if (is_max_size_approaching(in)) {
14824 check_caps(in, 0);
14825 }
14826 }
14827 }
14828
14829 if (nullptr != onuninline) {
14830 client_lock.unlock();
14831 int ret = onuninline->wait();
14832 client_lock.lock();
14833
14834 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14835 in->inline_data.clear();
14836 in->inline_version = CEPH_INLINE_NONE;
14837 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14838 check_caps(in, 0);
14839 } else
14840 r = ret;
14841 }
14842
14843 put_cap_ref(in, CEPH_CAP_FILE_WR);
14844 return r;
14845 }
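
// The accepted fallocate(2) mode combinations, illustrated (flag names as in
// <linux/falloc.h>):
//
//   _fallocate(fh, 0, off, len);                       // may grow the file size
//   _fallocate(fh, FALLOC_FL_KEEP_SIZE, off, len);     // size unchanged
//   _fallocate(fh, FALLOC_FL_PUNCH_HOLE |
//                  FALLOC_FL_KEEP_SIZE, off, len);     // punch a hole
//   _fallocate(fh, FALLOC_FL_PUNCH_HOLE, off, len);    // -CEPHFS_EOPNOTSUPP
//   _fallocate(fh, FALLOC_FL_ZERO_RANGE, off, len);    // -CEPHFS_EOPNOTSUPP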
14846
14847 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14848 {
14849 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14850 if (!mref_reader.is_state_satisfied())
14851 return -CEPHFS_ENOTCONN;
14852
14853 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14854 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14855 tout(cct) << (uintptr_t)fh << std::endl;
14856
14857 std::scoped_lock lock(client_lock);
14858 return _fallocate(fh, mode, offset, length);
14859 }
14860
14861 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14862 {
14863 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14864 if (!mref_reader.is_state_satisfied())
14865 return -CEPHFS_ENOTCONN;
14866
14867 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14868
14869 std::scoped_lock lock(client_lock);
14870 Fh *fh = get_filehandle(fd);
14871 if (!fh)
14872 return -CEPHFS_EBADF;
14873 #if defined(__linux__) && defined(O_PATH)
14874 if (fh->flags & O_PATH)
14875 return -CEPHFS_EBADF;
14876 #endif
14877 return _fallocate(fh, mode, offset, length);
14878 }
14879
14880 int Client::ll_release(Fh *fh)
14881 {
14882 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14883 if (!mref_reader.is_state_satisfied())
14884 return -CEPHFS_ENOTCONN;
14885
14886 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << dendl;
14888 tout(cct) << __func__ << " (fh)" << std::endl;
14889 tout(cct) << (uintptr_t)fh << std::endl;
14890
14891 std::scoped_lock lock(client_lock);
14892
14893 if (ll_unclosed_fh_set.count(fh))
14894 ll_unclosed_fh_set.erase(fh);
14895 return _release_fh(fh);
14896 }
14897
14898 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14899 {
14900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14901 if (!mref_reader.is_state_satisfied())
14902 return -CEPHFS_ENOTCONN;
14903
14904 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
14905 tout(cct) << "ll_getlk (fh)" << (uintptr_t)fh << std::endl;
14906
14907 std::scoped_lock lock(client_lock);
14908 return _getlk(fh, fl, owner);
14909 }
14910
14911 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14912 {
14913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14914 if (!mref_reader.is_state_satisfied())
14915 return -CEPHFS_ENOTCONN;
14916
14917 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14918 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14919
14920 std::scoped_lock lock(client_lock);
14921 return _setlk(fh, fl, owner, sleep);
14922 }
14923
14924 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14925 {
14926 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14927 if (!mref_reader.is_state_satisfied())
14928 return -CEPHFS_ENOTCONN;
14929
14930 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14931 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14932
14933 std::scoped_lock lock(client_lock);
14934 return _flock(fh, cmd, owner);
14935 }
14936
14937 int Client::set_deleg_timeout(uint32_t timeout)
14938 {
14939 std::scoped_lock lock(client_lock);
14940
14941 /*
14942 * The whole point is to prevent blocklisting so we must time out the
14943 * delegation before the session autoclose timeout kicks in.
14944 */
14945 if (timeout >= mdsmap->get_session_autoclose())
14946 return -CEPHFS_EINVAL;
14947
14948 deleg_timeout = timeout;
14949 return 0;
14950 }
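
// Illustrative example: with the default session_autoclose of 300 seconds,
// set_deleg_timeout(300) is rejected with -CEPHFS_EINVAL, while
// set_deleg_timeout(60) succeeds and leaves ample headroom before the MDS
// would evict (and blocklist) the client.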
14951
14952 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14953 {
14954 int ret = -CEPHFS_EINVAL;
14955
14956 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14957 if (!mref_reader.is_state_satisfied())
14958 return -CEPHFS_ENOTCONN;
14959
14960 std::scoped_lock lock(client_lock);
14961
14962 Inode *inode = fh->inode.get();
14963
14964 switch(cmd) {
14965 case CEPH_DELEGATION_NONE:
14966 inode->unset_deleg(fh);
14967 ret = 0;
14968 break;
14969 default:
14970 try {
14971 ret = inode->set_deleg(fh, cmd, cb, priv);
14972 } catch (std::bad_alloc&) {
14973 ret = -CEPHFS_ENOMEM;
14974 }
14975 break;
14976 }
14977 return ret;
14978 }
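
// A minimal sketch of taking a read delegation via this entry point; the
// recall callback shape follows ceph_deleg_cb_t from
// include/cephfs/ceph_ll_client.h, and `my_priv` is a hypothetical cookie:
//
//   static void my_recall(Fh *fh, void *priv) {
//     // The MDS wants the delegation back: finish local work quickly, then
//     // drop it (before deleg_timeout expires) with
//     // ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr).
//   }
//
//   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, my_recall, my_priv);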
14979
14980 class C_Client_RequestInterrupt : public Context {
14981 private:
14982 Client *client;
14983 MetaRequest *req;
14984 public:
14985 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14986 req->get();
14987 }
14988 void finish(int r) override {
14989 std::scoped_lock l(client->client_lock);
14990 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14991 client->_interrupt_filelock(req);
14992 client->put_request(req);
14993 }
14994 };
14995
14996 void Client::ll_interrupt(void *d)
14997 {
14998 MetaRequest *req = static_cast<MetaRequest*>(d);
14999 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15000 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
15001 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15002 }
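
// The `d` cookie is the MetaRequest itself and must be handed back verbatim.
// A hedged sketch of the FUSE-side wiring (libfuse lowlevel API; the actual
// registration lives in the fuse client glue):
//
//   static void interrupt_cb(fuse_req_t req, void *data) {
//     client->ll_interrupt(data);   // data == MetaRequest*
//   }
//   fuse_req_interrupt_func(req, interrupt_cb, mreq);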
15003
15004 // =========================================
15005 // layout
15006
15007 // expose file layouts
15008
15009 int Client::describe_layout(const char *relpath, file_layout_t *lp,
15010 const UserPerm& perms)
15011 {
15012 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15013 if (!mref_reader.is_state_satisfied())
15014 return -CEPHFS_ENOTCONN;
15015
15016 std::scoped_lock lock(client_lock);
15017
15018 filepath path(relpath);
15019 InodeRef in;
15020 int r = path_walk(path, &in, perms);
15021 if (r < 0)
15022 return r;
15023
15024 *lp = in->layout;
15025
15026 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
15027 return 0;
15028 }
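
// Illustrative usage (`perms` is assumed to be a previously constructed
// UserPerm; file_layout_t is defined in include/fs_types.h):
//
//   file_layout_t layout;
//   int r = client->describe_layout("/dir/file", &layout, perms);
//   if (r == 0)
//     printf("su=%u sc=%u os=%u pool=%lld\n", layout.stripe_unit,
//            layout.stripe_count, layout.object_size,
//            (long long)layout.pool_id);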
15029
15030 int Client::fdescribe_layout(int fd, file_layout_t *lp)
15031 {
15032 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15033 if (!mref_reader.is_state_satisfied())
15034 return -CEPHFS_ENOTCONN;
15035
15036 std::scoped_lock lock(client_lock);
15037
15038 Fh *f = get_filehandle(fd);
15039 if (!f)
15040 return -CEPHFS_EBADF;
15041 Inode *in = f->inode.get();
15042
15043 *lp = in->layout;
15044
15045 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
15046 return 0;
15047 }
15048
15049 int64_t Client::get_default_pool_id()
15050 {
15051 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15052 if (!mref_reader.is_state_satisfied())
15053 return -CEPHFS_ENOTCONN;
15054
15055 std::scoped_lock lock(client_lock);
15056
15057 /* first data pool is the default */
15058 return mdsmap->get_first_data_pool();
15059 }
15060
15061 // expose osdmap
15062
15063 int64_t Client::get_pool_id(const char *pool_name)
15064 {
15065 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15066 if (!mref_reader.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN;
15068
15069 std::scoped_lock lock(client_lock);
15070
15071 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15072 pool_name);
15073 }
15074
15075 string Client::get_pool_name(int64_t pool)
15076 {
15077 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15078 if (!mref_reader.is_state_satisfied())
15079 return string();
15080
15081 std::scoped_lock lock(client_lock);
15082
15083 return objecter->with_osdmap([pool](const OSDMap& o) {
15084 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15085 });
15086 }
15087
15088 int Client::get_pool_replication(int64_t pool)
15089 {
15090 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15091 if (!mref_reader.is_state_satisfied())
15092 return -CEPHFS_ENOTCONN;
15093
15094 std::scoped_lock lock(client_lock);
15095
15096 return objecter->with_osdmap([pool](const OSDMap& o) {
15097 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
15098 });
15099 }
15100
15101 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
15102 {
15103 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15104 if (!mref_reader.is_state_satisfied())
15105 return -CEPHFS_ENOTCONN;
15106
15107 std::scoped_lock lock(client_lock);
15108
15109 Fh *f = get_filehandle(fd);
15110 if (!f)
15111 return -CEPHFS_EBADF;
15112 Inode *in = f->inode.get();
15113
15114 vector<ObjectExtent> extents;
15115 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
15116 ceph_assert(extents.size() == 1);
15117
15118 objecter->with_osdmap([&](const OSDMap& o) {
15119 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15120 o.pg_to_acting_osds(pg, osds);
15121 });
15122
15123 if (osds.empty())
15124 return -CEPHFS_EINVAL;
15125
15126 /*
15127 * Return the remainder of the extent (stripe unit)
15128 *
15129 * If length = 1 is passed to Striper::file_to_extents we get a single
15130 * extent back, but its length is one so we still need to compute the length
15131 * to the end of the stripe unit.
15132 *
15133 * If length = su then we may get 1 or 2 objects back in the extents vector
15134 * which would have to be examined. Even then, the offsets are local to the
15135 * object, so matching up to the file offset is extra work.
15136 *
15137 * It seems simpler to stick with length = 1 and manually compute the
15138 * remainder.
15139 */
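/*
 * Worked example: with stripe_unit = 4 MiB and off = 5 MiB,
 * off % su = 1 MiB, so *len = 3 MiB -- the distance from off to the end
 * of its stripe unit.
 */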
15140 if (len) {
15141 uint64_t su = in->layout.stripe_unit;
15142 *len = su - (off % su);
15143 }
15144
15145 return 0;
15146 }
15147
15148 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15149 {
15150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15151 if (!mref_reader.is_state_satisfied())
15152 return -CEPHFS_ENOTCONN;
15153
15154 std::scoped_lock lock(client_lock);
15155
15156 if (id < 0)
15157 return -CEPHFS_EINVAL;
15158 return objecter->with_osdmap([&](const OSDMap& o) {
15159 return o.crush->get_full_location_ordered(id, path);
15160 });
15161 }
15162
15163 int Client::get_file_stripe_address(int fd, loff_t offset,
15164 vector<entity_addr_t>& address)
15165 {
15166 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15167 if (!mref_reader.is_state_satisfied())
15168 return -CEPHFS_ENOTCONN;
15169
15170 std::scoped_lock lock(client_lock);
15171
15172 Fh *f = get_filehandle(fd);
15173 if (!f)
15174 return -CEPHFS_EBADF;
15175 Inode *in = f->inode.get();
15176
15177 // which object?
15178 vector<ObjectExtent> extents;
15179 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15180 in->truncate_size, extents);
15181 ceph_assert(extents.size() == 1);
15182
15183 // now we have the object and its 'layout'
15184 return objecter->with_osdmap([&](const OSDMap& o) {
15185 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15186 vector<int> osds;
15187 o.pg_to_acting_osds(pg, osds);
15188 if (osds.empty())
15189 return -CEPHFS_EINVAL;
15190 for (unsigned i = 0; i < osds.size(); i++) {
15191 entity_addr_t addr = o.get_addrs(osds[i]).front();
15192 address.push_back(addr);
15193 }
15194 return 0;
15195 });
15196 }
15197
15198 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15199 {
15200 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15201 if (!mref_reader.is_state_satisfied())
15202 return -CEPHFS_ENOTCONN;
15203
15204 std::scoped_lock lock(client_lock);
15205
15206 return objecter->with_osdmap([&](const OSDMap& o) {
15207 if (!o.exists(osd))
15208 return -CEPHFS_ENOENT;
15209
15210 addr = o.get_addrs(osd).front();
15211 return 0;
15212 });
15213 }
15214
15215 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15216 loff_t length, loff_t offset)
15217 {
15218 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15219 if (!mref_reader.is_state_satisfied())
15220 return -CEPHFS_ENOTCONN;
15221
15222 std::scoped_lock lock(client_lock);
15223
15224 Fh *f = get_filehandle(fd);
15225 if (!f)
15226 return -CEPHFS_EBADF;
15227 Inode *in = f->inode.get();
15228
15229 // map to a list of extents
15230 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15231
15232 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15233 return 0;
15234 }
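
// Illustrative example: with the default layout (stripe_unit = object_size =
// 4 MiB, stripe_count = 1), calling enumerate_layout() with length = 6 MiB
// and offset = 2 MiB covers file bytes [2 MiB, 8 MiB) and yields two extents:
// the last 2 MiB of object 0 and all 4 MiB of object 1.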
15235
15236
15237 /* Find an OSD with the same IP address; returns -CEPHFS_ENXIO if none. */
15238 int Client::get_local_osd()
15239 {
15240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15241 if (!mref_reader.is_state_satisfied())
15242 return -CEPHFS_ENOTCONN;
15243
15244 std::scoped_lock lock(client_lock);
15245
15246 objecter->with_osdmap([this](const OSDMap& o) {
15247 if (o.get_epoch() != local_osd_epoch) {
15248 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
15249 local_osd_epoch = o.get_epoch();
15250 }
15251 });
15252 return local_osd;
15253 }
15254
15255
15256
15257
15258
15259
15260 // ===============================
15261
15262 void Client::ms_handle_connect(Connection *con)
15263 {
15264 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
15265 }
15266
15267 bool Client::ms_handle_reset(Connection *con)
15268 {
15269 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15270 return false;
15271 }
15272
15273 void Client::ms_handle_remote_reset(Connection *con)
15274 {
15275 std::scoped_lock lock(client_lock);
15276 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15277 switch (con->get_peer_type()) {
15278 case CEPH_ENTITY_TYPE_MDS:
15279 {
15280 // kludge to figure out which mds this is; fixme with a Connection* state
15281 mds_rank_t mds = MDS_RANK_NONE;
15282 MetaSessionRef s = NULL;
15283 for (auto &p : mds_sessions) {
15284 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
15285 mds = p.first;
15286 s = p.second;
15287 }
15288 }
15289 if (mds >= 0) {
15290 ceph_assert(s != NULL);
15291 switch (s->state) {
15292 case MetaSession::STATE_CLOSING:
15293 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
15294 _closed_mds_session(s.get());
15295 break;
15296
15297 case MetaSession::STATE_OPENING:
15298 {
15299 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15300 list<Context*> waiters;
15301 waiters.swap(s->waiting_for_open);
15302 _closed_mds_session(s.get());
15303 auto news = _get_or_open_mds_session(mds);
15304 news->waiting_for_open.swap(waiters);
15305 }
15306 break;
15307
15308 case MetaSession::STATE_OPEN:
15309 {
15310 objecter->maybe_request_map(); /* to check if we are blocklisted */
15311 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
15312 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
15313 _closed_mds_session(s.get());
15314 } else {
15315 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15316 s->state = MetaSession::STATE_STALE;
15317 }
15318 }
15319 break;
15320
15321 case MetaSession::STATE_NEW:
15322 case MetaSession::STATE_CLOSED:
15323 default:
15324 break;
15325 }
15326 }
15327 }
15328 break;
15329 }
15330 }
15331
15332 bool Client::ms_handle_refused(Connection *con)
15333 {
15334 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
15335 return false;
15336 }
15337
15338 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15339 {
15340 Inode *quota_in = root_ancestor;
15341 SnapRealm *realm = in->snaprealm;
15342
15343 if (!cct->_conf.get_val<bool>("client_quota"))
15344 return NULL;
15345
15346 while (realm) {
15347 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15348 if (realm->ino != in->ino) {
15349 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15350 if (p == inode_map.end())
15351 break;
15352
15353 if (p->second->quota.is_enable()) {
15354 quota_in = p->second;
15355 break;
15356 }
15357 }
15358 realm = realm->pparent;
15359 }
15360 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15361 return quota_in;
15362 }
15363
15364 /**
15365 * Traverse the quota ancestors of the Inode; return true
15366 * if any of them satisfies the passed predicate
15367 */
15368 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15369 std::function<bool (const Inode &in)> test)
15370 {
15371 if (!cct->_conf.get_val<bool>("client_quota"))
15372 return false;
15373
15374 while (true) {
15375 ceph_assert(in != NULL);
15376 if (test(*in)) {
15377 return true;
15378 }
15379
15380 if (in == root_ancestor) {
15381 // We're done traversing, drop out
15382 return false;
15383 } else {
15384 // Continue up the tree
15385 in = get_quota_root(in, perms);
15386 }
15387 }
15388
15389 return false;
15390 }
15391
15392 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15393 {
15394 return check_quota_condition(in, perms,
15395 [](const Inode &in) {
15396 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15397 });
15398 }
15399
15400 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15401 const UserPerm& perms)
15402 {
15403 return check_quota_condition(in, perms,
15404 [&new_bytes](const Inode &in) {
15405 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15406 > in.quota.max_bytes;
15407 });
15408 }
15409
15410 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15411 {
15412 ceph_assert(in->size >= in->reported_size);
15413 const uint64_t size = in->size - in->reported_size;
15414 return check_quota_condition(in, perms,
15415 [&size](const Inode &in) {
15416 if (in.quota.max_bytes) {
15417 if (in.rstat.rbytes >= in.quota.max_bytes) {
15418 return true;
15419 }
15420
15421 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15422 return (space >> 4) < size;
15423 } else {
15424 return false;
15425 }
15426 });
15427 }
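
// Worked example of the (space >> 4) heuristic above: with quota.max_bytes =
// 16 GiB and rstat.rbytes = 15 GiB, space = 1 GiB and space >> 4 = 64 MiB, so
// the quota counts as "approaching" once more than 64 MiB of local writes
// have not yet been reported to the MDS (in->size - in->reported_size).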
15428
15429 enum {
15430 POOL_CHECKED = 1,
15431 POOL_CHECKING = 2,
15432 POOL_READ = 4,
15433 POOL_WRITE = 8,
15434 };
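
// The pool_perms map below acts as a small per-(pool, namespace) state
// machine: the first caller stores POOL_CHECKING and probes the pool, later
// callers block on waiting_for_pool_perm, and the final cached value is
// POOL_CHECKED | (POOL_READ and/or POOL_WRITE). An indeterminate probe
// erases the entry so the next caller retries.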
15435
15436 int Client::check_pool_perm(Inode *in, int need)
15437 {
15438 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15439
15440 if (!cct->_conf->client_check_pool_perm)
15441 return 0;
15442
15443 /* Only need to do this for regular files */
15444 if (!in->is_file())
15445 return 0;
15446
15447 int64_t pool_id = in->layout.pool_id;
15448 std::string pool_ns = in->layout.pool_ns;
15449 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
15450 int have = 0;
15451 while (true) {
15452 auto it = pool_perms.find(perm_key);
15453 if (it == pool_perms.end())
15454 break;
15455 if (it->second == POOL_CHECKING) {
15456 // avoid concurrent checks
15457 wait_on_list(waiting_for_pool_perm);
15458 } else {
15459 have = it->second;
15460 ceph_assert(have & POOL_CHECKED);
15461 break;
15462 }
15463 }
15464
15465 if (!have) {
15466 if (in->snapid != CEPH_NOSNAP) {
15467 // The pool permission check needs to write to the first object. But for a
15468 // snapshot, the head of the first object may already have been deleted. To
15469 // avoid creating an orphan object, skip the check for now.
15470 return 0;
15471 }
15472
15473 pool_perms[perm_key] = POOL_CHECKING;
15474
15475 char oid_buf[32];
15476 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
15477 object_t oid = oid_buf;
15478
15479 SnapContext nullsnapc;
15480
15481 C_SaferCond rd_cond;
15482 ObjectOperation rd_op;
15483 rd_op.stat(nullptr, nullptr, nullptr);
15484
15485 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
15486 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
15487
15488 C_SaferCond wr_cond;
15489 ObjectOperation wr_op;
15490 wr_op.create(true);
15491
15492 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
15493 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
15494
15495 client_lock.unlock();
15496 int rd_ret = rd_cond.wait();
15497 int wr_ret = wr_cond.wait();
15498 client_lock.lock();
15499
15500 bool errored = false;
15501
15502 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
15503 have |= POOL_READ;
15504 else if (rd_ret != -CEPHFS_EPERM) {
15505 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15506 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15507 errored = true;
15508 }
15509
15510 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
15511 have |= POOL_WRITE;
15512 else if (wr_ret != -CEPHFS_EPERM) {
15513 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15514 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15515 errored = true;
15516 }
15517
15518 if (errored) {
15519 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15520 // Raise EIO because actual error code might be misleading for
15521 // userspace filesystem user.
15522 pool_perms.erase(perm_key);
15523 signal_cond_list(waiting_for_pool_perm);
15524 return -CEPHFS_EIO;
15525 }
15526
15527 pool_perms[perm_key] = have | POOL_CHECKED;
15528 signal_cond_list(waiting_for_pool_perm);
15529 }
15530
15531 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15532 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15533 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15534 return -CEPHFS_EPERM;
15535 }
15536 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15537 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15538 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15539 return -CEPHFS_EPERM;
15540 }
15541
15542 return 0;
15543 }
15544
15545 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15546 {
15547 if (acl_type == POSIX_ACL) {
15548 if (in->xattrs.count(ACL_EA_ACCESS)) {
15549 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15550
15551 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15552 }
15553 }
15554 return -CEPHFS_EAGAIN;
15555 }
15556
15557 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15558 {
15559 if (acl_type == NO_ACL)
15560 return 0;
15561
15562 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15563 if (r < 0)
15564 goto out;
15565
15566 if (acl_type == POSIX_ACL) {
15567 if (in->xattrs.count(ACL_EA_ACCESS)) {
15568 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15569 bufferptr acl(access_acl.c_str(), access_acl.length());
15570 r = posix_acl_access_chmod(acl, mode);
15571 if (r < 0)
15572 goto out;
15573 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15574 } else {
15575 r = 0;
15576 }
15577 }
15578 out:
15579 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15580 return r;
15581 }
15582
15583 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15584 const UserPerm& perms)
15585 {
15586 if (acl_type == NO_ACL)
15587 return 0;
15588
15589 if (S_ISLNK(*mode))
15590 return 0;
15591
15592 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15593 if (r < 0)
15594 goto out;
15595
15596 if (acl_type == POSIX_ACL) {
15597 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15598 map<string, bufferptr> xattrs;
15599
15600 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15601 bufferptr acl(default_acl.c_str(), default_acl.length());
15602 r = posix_acl_inherit_mode(acl, mode);
15603 if (r < 0)
15604 goto out;
15605
15606 if (r > 0) {
15607 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15608 if (r < 0)
15609 goto out;
15610 if (r > 0)
15611 xattrs[ACL_EA_ACCESS] = acl;
15612 }
15613
15614 if (S_ISDIR(*mode))
15615 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15616
15617 r = xattrs.size();
15618 if (r > 0)
15619 encode(xattrs, xattrs_bl);
15620 } else {
15621 if (umask_cb)
15622 *mode &= ~umask_cb(callback_handle);
15623 r = 0;
15624 }
15625 }
15626 out:
15627 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15628 return r;
15629 }
15630
15631 void Client::set_filer_flags(int flags)
15632 {
15633 std::scoped_lock l(client_lock);
15634 ceph_assert(flags == 0 ||
15635 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15636 objecter->add_global_op_flags(flags);
15637 }
15638
15639 void Client::clear_filer_flags(int flags)
15640 {
15641 std::scoped_lock l(client_lock);
15642 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15643 objecter->clear_global_op_flag(flags);
15644 }
15645
15646 // called before mount
15647 void Client::set_uuid(const std::string& uuid)
15648 {
15649 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15650 ceph_assert(iref_reader.is_state_satisfied());
15651
15652 std::scoped_lock l(client_lock);
15653 ceph_assert(!uuid.empty());
15654
15655 metadata["uuid"] = uuid;
15656 _close_sessions();
15657 }
15658
15659 // called before mount. 0 means infinite
15660 void Client::set_session_timeout(unsigned timeout)
15661 {
15662 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15663 ceph_assert(iref_reader.is_state_satisfied());
15664
15665 std::scoped_lock l(client_lock);
15666
15667 metadata["timeout"] = stringify(timeout);
15668 }
15669
15670 // called before mount
15671 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15672 const std::string& fs_name)
15673 {
15674 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15675 if (!iref_reader.is_state_satisfied())
15676 return -CEPHFS_ENOTCONN;
15677
15678 if (uuid.empty())
15679 return -CEPHFS_EINVAL;
15680
15681 std::unique_lock l(client_lock);
15682 {
15683 auto it = metadata.find("uuid");
15684 if (it != metadata.end() && it->second == uuid)
15685 return -CEPHFS_EINVAL;
15686 }
15687
15688 int r = subscribe_mdsmap(fs_name);
15689 if (r < 0) {
15690 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15691 return r;
15692 }
15693
15694 if (metadata.empty())
15695 populate_metadata("");
15696
15697 while (mdsmap->get_epoch() == 0)
15698 wait_on_list(waiting_for_mdsmap);
15699
15700 reclaim_errno = 0;
15701 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15702 if (!mdsmap->is_up(mds)) {
15703 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15704 wait_on_list(waiting_for_mdsmap);
15705 continue;
15706 }
15707
15708 MetaSessionRef session;
15709 if (!have_open_session(mds)) {
15710 session = _get_or_open_mds_session(mds);
15711 if (session->state == MetaSession::STATE_REJECTED)
15712 return -CEPHFS_EPERM;
15713 if (session->state != MetaSession::STATE_OPENING) {
15714 // umounting?
15715 return -CEPHFS_EINVAL;
15716 }
15717 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15718 wait_on_context_list(session->waiting_for_open);
15719 continue;
15720 }
15721
15722 session = mds_sessions.at(mds);
15723 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15724 return -CEPHFS_EOPNOTSUPP;
15725
15726 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15727 session->reclaim_state == MetaSession::RECLAIMING) {
15728 session->reclaim_state = MetaSession::RECLAIMING;
15729 auto m = make_message<MClientReclaim>(uuid, flags);
15730 session->con->send_message2(std::move(m));
15731 wait_on_list(waiting_for_reclaim);
15732 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15733 return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
15734 } else {
15735 mds++;
15736 }
15737 }
15738
15739 // didn't find target session in any mds
15740 if (reclaim_target_addrs.empty()) {
15741 if (flags & CEPH_RECLAIM_RESET)
15742 return -CEPHFS_ENOENT;
15743 return -CEPHFS_ENOTRECOVERABLE;
15744 }
15745
15746 if (flags & CEPH_RECLAIM_RESET)
15747 return 0;
15748
15749 // use blocklist to check if target session was killed
15750 // (config option mds_session_blocklist_on_evict needs to be true)
15751 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15752 bs::error_code ec;
15753 l.unlock();
15754 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15755 l.lock();
15756
15757 if (ec)
15758 return ceph::from_error_code(ec);
15759
15760 bool blocklisted = objecter->with_osdmap(
15761 [this](const OSDMap &osd_map) -> bool {
15762 return osd_map.is_blocklisted(reclaim_target_addrs);
15763 });
15764 if (blocklisted)
15765 return -CEPHFS_ENOTRECOVERABLE;
15766
15767 metadata["reclaiming_uuid"] = uuid;
15768 return 0;
15769 }
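
// A hedged sketch of the intended sequence for a restarted client instance
// (assuming the application, e.g. an NFS gateway, remembers its previous
// uuid; `old_uuid`, `new_uuid` and `fs_name` are placeholders):
//
//   client->set_uuid(new_uuid);                        // before mount
//   int r = client->start_reclaim(old_uuid, CEPH_RECLAIM_RESET, fs_name);
//   if (r == 0 || r == -CEPHFS_ENOENT)                 // reclaimed, or nothing left
//     client->finish_reclaim();
//   // ... then proceed to mount as usual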
15770
15771 void Client::finish_reclaim()
15772 {
15773 auto it = metadata.find("reclaiming_uuid");
15774 if (it == metadata.end()) {
15775 for (auto &p : mds_sessions)
15776 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15777 return;
15778 }
15779
15780 for (auto &p : mds_sessions) {
15781 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15782 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15783 p.second->con->send_message2(std::move(m));
15784 }
15785
15786 metadata["uuid"] = it->second;
15787 metadata.erase(it);
15788 }
15789
15790 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15791 {
15792 mds_rank_t from = mds_rank_t(reply->get_source().num());
15793 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15794
15795 std::scoped_lock cl(client_lock);
15796 auto session = _get_mds_session(from, reply->get_connection().get());
15797 if (!session) {
15798 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15799 return;
15800 }
15801
15802 if (reply->get_result() >= 0) {
15803 session->reclaim_state = MetaSession::RECLAIM_OK;
15804 if (reply->get_epoch() > reclaim_osd_epoch)
15805 reclaim_osd_epoch = reply->get_epoch();
15806 if (!reply->get_addrs().empty())
15807 reclaim_target_addrs = reply->get_addrs();
15808 } else {
15809 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15810 reclaim_errno = reply->get_result();
15811 }
15812
15813 signal_cond_list(waiting_for_reclaim);
15814 }
15815
15816 /**
15817 * This is included in cap release messages, to cause
15818 * the MDS to wait until this OSD map epoch. It is necessary
15819 * in corner cases where we cancel RADOS ops, so that
15820 * nobody else tries to do IO to the same objects in
15821 * the same epoch as the cancelled ops.
15822 */
15823 void Client::set_cap_epoch_barrier(epoch_t e)
15824 {
15825 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15826 cap_epoch_barrier = e;
15827 }
15828
15829 const char** Client::get_tracked_conf_keys() const
15830 {
15831 static const char* keys[] = {
15832 "client_cache_size",
15833 "client_cache_mid",
15834 "client_acl_type",
15835 "client_deleg_timeout",
15836 "client_deleg_break_on_open",
15837 "client_oc_size",
15838 "client_oc_max_objects",
15839 "client_oc_max_dirty",
15840 "client_oc_target_dirty",
15841 "client_oc_max_dirty_age",
15842 "client_caps_release_delay",
15843 "client_mount_timeout",
"client_collect_and_send_global_metrics",
15844 NULL
15845 };
15846 return keys;
15847 }
15848
15849 void Client::handle_conf_change(const ConfigProxy& conf,
15850 const std::set <std::string> &changed)
15851 {
15852 std::scoped_lock lock(client_lock);
15853
15854 if (changed.count("client_cache_mid")) {
15855 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15856 }
15857 if (changed.count("client_acl_type")) {
15858 acl_type = NO_ACL;
15859 if (cct->_conf->client_acl_type == "posix_acl")
15860 acl_type = POSIX_ACL;
15861 }
15862 if (changed.count("client_oc_size")) {
15863 objectcacher->set_max_size(cct->_conf->client_oc_size);
15864 }
15865 if (changed.count("client_oc_max_objects")) {
15866 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15867 }
15868 if (changed.count("client_oc_max_dirty")) {
15869 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15870 }
15871 if (changed.count("client_oc_target_dirty")) {
15872 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15873 }
15874 if (changed.count("client_oc_max_dirty_age")) {
15875 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15876 }
15877 if (changed.count("client_collect_and_send_global_metrics")) {
15878 _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
15879 "client_collect_and_send_global_metrics");
15880 }
15881 if (changed.count("client_caps_release_delay")) {
15882 caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
15883 "client_caps_release_delay");
15884 }
15885 if (changed.count("client_mount_timeout")) {
15886 mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
15887 "client_mount_timeout");
15888 }
15889 }
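
// Because these keys are tracked, they take effect on a live client. For
// example (value illustrative), raising the object cacher size at runtime:
//
//   ceph config set client client_oc_size 268435456
//
// is delivered through the config observer machinery and ends up in the
// objectcacher->set_max_size() call above.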
15890
15891 void intrusive_ptr_add_ref(Inode *in)
15892 {
15893 in->iget();
15894 }
15895
15896 void intrusive_ptr_release(Inode *in)
15897 {
15898 in->client->put_inode(in);
15899 }
15900
15901 mds_rank_t Client::_get_random_up_mds() const
15902 {
15903 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15904
15905 std::set<mds_rank_t> up;
15906 mdsmap->get_up_mds_set(up);
15907
15908 if (up.empty())
15909 return MDS_RANK_NONE;
15910 std::set<mds_rank_t>::const_iterator p = up.begin();
15911 for (int n = rand() % up.size(); n; n--)
15912 ++p;
15913 return *p;
15914 }
15915
15916
15917 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15918 boost::asio::io_context& ictx)
15919 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15920 {
15921 monclient->set_messenger(m);
15922 objecter->set_client_incarnation(0);
15923 }
15924
15925 StandaloneClient::~StandaloneClient()
15926 {
15927 delete objecter;
15928 objecter = nullptr;
15929 }
15930
15931 int StandaloneClient::init()
15932 {
15933 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15934 ceph_assert(iref_writer.is_first_writer());
15935
15936 _pre_init();
15937 objecter->init();
15938
15939 client_lock.lock();
15940
15941 messenger->add_dispatcher_tail(objecter);
15942 messenger->add_dispatcher_tail(this);
15943
15944 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15945 int r = monclient->init();
15946 if (r < 0) {
15947 // need to do cleanup because we're in an intermediate init state
15948 {
15949 std::scoped_lock l(timer_lock);
15950 timer.shutdown();
15951 }
15952
15953 client_lock.unlock();
15954 objecter->shutdown();
15955 objectcacher->stop();
15956 monclient->shutdown();
15957 return r;
15958 }
15959 objecter->start();
15960
15961 client_lock.unlock();
15962 _finish_init();
15963 iref_writer.update_state(CLIENT_INITIALIZED);
15964
15965 return 0;
15966 }
15967
15968 void StandaloneClient::shutdown()
15969 {
15970 Client::shutdown();
15971 objecter->shutdown();
15972 monclient->shutdown();
15973 }