]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import ceph quincy 17.2.6
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #ifndef _WIN32
27 #include <sys/utsname.h>
28 #endif
29 #include <sys/uio.h>
30
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
33
34 #include "common/async/waiter.h"
35
36 #if defined(__FreeBSD__) || defined(_WIN32)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
39 #else
40 #include <sys/xattr.h>
41 #endif
42
43 #if defined(__linux__)
44 #include <linux/falloc.h>
45 #endif
46
47 #include <sys/statvfs.h>
48
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
52
53 #include "mon/MonClient.h"
54
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
72
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "osd/OSDMap.h"
76 #include "osdc/Filer.h"
77
78 #include "common/Cond.h"
79 #include "common/perf_counters.h"
80 #include "common/admin_socket.h"
81 #include "common/errno.h"
82 #include "include/str_list.h"
83
84 #define dout_subsys ceph_subsys_client
85
86 #include "include/lru.h"
87 #include "include/compat.h"
88 #include "include/stringify.h"
89 #include "include/random.h"
90
91 #include "Client.h"
92 #include "Inode.h"
93 #include "Dentry.h"
94 #include "Delegation.h"
95 #include "Dir.h"
96 #include "ClientSnapRealm.h"
97 #include "Fh.h"
98 #include "MetaSession.h"
99 #include "MetaRequest.h"
100 #include "ObjecterWriteback.h"
101 #include "posix_acl.h"
102
103 #include "include/ceph_assert.h"
104 #include "include/stat.h"
105
106 #include "include/cephfs/ceph_ll_client.h"
107
108 #if HAVE_GETGROUPLIST
109 #include <grp.h>
110 #include <pwd.h>
111 #include <unistd.h>
112 #endif
113
114 #undef dout_prefix
115 #define dout_prefix *_dout << "client." << whoami << " "
116
117 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119 // FreeBSD fails to define this
120 #ifndef O_DSYNC
121 #define O_DSYNC 0x0
122 #endif
123 // Darwin fails to define this
124 #ifndef O_RSYNC
125 #define O_RSYNC 0x0
126 #endif
127
128 #ifndef O_DIRECT
129 #define O_DIRECT 0x0
130 #endif
131
// Windows doesn't define those values. While the Posix compatibility layer
133 // doesn't support those values, the Windows native functions do provide
134 // similar flags. Special care should be taken if we're going to use those
135 // flags in ceph-dokan. The current values are no-ops, while propagating
136 // them to the rest of the code might cause the Windows functions to reject
137 // them as invalid.
138 #ifndef O_NOFOLLOW
139 #define O_NOFOLLOW 0x0
140 #endif
141
142 #ifndef O_SYNC
143 #define O_SYNC 0x0
144 #endif
145
146 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
148 #ifndef S_IXUGO
149 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150 #endif
151
152 using std::dec;
153 using std::hex;
154 using std::list;
155 using std::oct;
156 using std::pair;
157 using std::string;
158 using std::vector;
159
160 using namespace TOPNSPC::common;
161
162 namespace bs = boost::system;
163 namespace ca = ceph::async;
164
165 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
166 {
167 Client *client = static_cast<Client*>(p);
168 client->flush_set_callback(oset);
169 }
170
171 bool Client::is_reserved_vino(vinodeno_t &vino) {
172 if (MDS_IS_PRIVATE_INO(vino.ino)) {
173 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
174 return true;
175 }
176 return false;
177 }
178
179 // running average and standard deviation -- presented in
180 // Donald Knuth's TAoCP, Volume II.
// Incremental (running) mean, per Knuth TAoCP vol. II: the first sample
// seeds the mean, later samples nudge it by (value - mean) / count.
double calc_average(double old_avg, double value, uint64_t count) {
  if (count == 1)
    return value;
  return old_avg + (value - old_avg) / count;
}
191
// Running sum of squared deviations (Welford / Knuth TAoCP vol. II);
// divide by (count - 1) later to obtain the sample variance.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  if (count == 1)
    return 0.0;
  return old_sq_sum + (value - old_mean) * (value - new_mean);
}
203
204 // -------------
205
// Admin-socket command hook: remembers the owning Client so that call()
// can forward registered commands to it.
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
210
// Dispatch an admin-socket command to the matching Client dump/kick
// helper, wrapping the output in a "result" object section.
// Commands are registered in Client::_finish_init(); hitting the abort
// branch means a command was registered without a handler here.
int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  const bufferlist&,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    // every handler reads client state, so hold the client lock
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      // optional flag: also dump per-session capability details
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
240
241
242 // -------------
243
244 int Client::get_fd_inode(int fd, InodeRef *in) {
245 int r = 0;
246 if (fd == CEPHFS_AT_FDCWD) {
247 *in = cwd;
248 } else {
249 Fh *f = get_filehandle(fd);
250 if (!f) {
251 r = -CEPHFS_EBADF;
252 } else {
253 *in = f->inode;
254 }
255 }
256 return r;
257 }
258
// Start a fresh directory read; next_offset begins at 2 (the offsets
// below it are used for the synthetic "." and ".." entries).
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
264
// Reset the fake-ino allocator: everything from 1024 up to the top of
// the 32-bit range becomes free again, and the allocation cursors are
// rewound.  Also decides whether fake inos are used at all.
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, are using 64B identifiers.
  _use_faked_inos = false;
#else
  // fake inos are needed when the platform ino_t cannot hold a full
  // 64-bit ceph ino, or when explicitly enabled via config
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}
280
// Allocate the next free fake inode number for 'in' from the
// free_faked_inos interval set, advancing a cursor and wrapping when
// the top of the range is reached.  Ids 1024~2048+1024 are reserved for
// _assign_faked_root, so allocation starts above them.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped past the top of the range; restart the scan just above
    // the reserved root ids
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // the next free interval starts beyond the cursor; jump to its start
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor sits inside the free interval; take the next id in it
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // mark the id allocated and remember the mapping back to the real vino
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
302
303 /*
304 * In the faked mode, if you export multiple subdirectories,
305 * you will see that the inode numbers of the exported subdirectories
306 * are the same. so we distinguish the mount point by reserving
307 * the "fake ids" between "1024~2048" and combining the last
308 * 10bits(0x3ff) of the "root inodes".
309 */
310 void Client::_assign_faked_root(Inode *in)
311 {
312 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
313 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
314 last_used_faked_root = 0;
315 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
316 }
317 ceph_assert(it != free_faked_inos.end());
318 vinodeno_t inode_info = in->vino();
319 uint64_t inode_num = (uint64_t)inode_info.ino;
320 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
321 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
322 ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);
323
324 in->faked_ino = last_used_faked_root;
325 free_faked_inos.erase(in->faked_ino);
326 faked_ino_map[in->faked_ino] = in->vino();
327 }
328
// Return an inode's fake ino to the free pool and drop its mapping to
// the real vino.
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}
334
335 vinodeno_t Client::_map_faked_ino(ino_t ino)
336 {
337 vinodeno_t vino;
338 if (ino == 1)
339 vino = root->vino();
340 else if (faked_ino_map.count(ino))
341 vino = faked_ino_map[ino];
342 else
343 vino = vinodeno_t(0, CEPH_NOSNAP);
344 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
345 return vino;
346 }
347
// Public (locking) wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}
353
354 // cons/des
355
// Construct an unmounted client bound to the given messenger, monitor
// client and objecter.  Resets the fake-ino pool, pulls the relevant
// config values, and creates the writeback handler plus object cacher
// used for buffered file I/O.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles; allocation starts at fd 10 (low fds left unused —
  // presumably to avoid colliding with stdio-style descriptors; confirm)
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}
414
415
// Destructor: stop the upkeep (tick) thread, then tear down the inode
// cache under client_lock.  Must not be called with client_lock held.
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or was aborted and never got a chance to run
  // umount/shutdown, make sure the tick thread is stopped here.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
437
// Forcibly release every cached file handle, open directory and cached
// dentry, then drop the root inode.  Called from the destructor with
// client_lock held; asserts that the cache really is empty at the end.
void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // open directory iterators; _closedir removes the dir from
  // opened_dirs, so keep taking the first element until empty
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino; only root (and its synthetic parents) may remain
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
468
469 inodeno_t Client::get_root_ino()
470 {
471 std::scoped_lock l(client_lock);
472 if (use_faked_inos())
473 return root->faked_ino;
474 else
475 return root->ino;
476 }
477
// Return the root inode with an extra ll reference taken; the caller is
// responsible for releasing it.
Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root.get();
}
484
485
486 // debug crapola
487
488 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
489 {
490 filepath path;
491 in->make_long_path(path);
492 ldout(cct, 1) << "dump_inode: "
493 << (disconnected ? "DISCONNECTED ":"")
494 << "inode " << in->ino
495 << " " << path
496 << " ref " << in->get_nref()
497 << " " << *in << dendl;
498
499 if (f) {
500 f->open_object_section("inode");
501 f->dump_stream("path") << path;
502 if (disconnected)
503 f->dump_int("disconnected", 1);
504 in->dump(f);
505 f->close_section();
506 }
507
508 did.insert(in);
509 if (in->dir) {
510 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
511 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
512 it != in->dir->dentries.end();
513 ++it) {
514 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
515 if (f) {
516 f->open_object_section("dentry");
517 it->second->dump(f);
518 f->close_section();
519 }
520 if (it->second->inode)
521 dump_inode(f, it->second->inode.get(), did, false);
522 }
523 }
524 }
525
526 void Client::dump_cache(Formatter *f)
527 {
528 set<Inode*> did;
529
530 ldout(cct, 1) << __func__ << dendl;
531
532 if (f)
533 f->open_array_section("cache");
534
535 if (root)
536 dump_inode(f, root.get(), did, true);
537
538 // make a second pass to catch anything disconnected
539 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
540 it != inode_map.end();
541 ++it) {
542 if (did.count(it->second))
543 continue;
544 dump_inode(f, it->second, did, true);
545 }
546
547 if (f)
548 f->close_section();
549 }
550
// Dump overall client status (session metadata, cache counters, map
// epochs, addresses) for the "status" admin-socket command.
// Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
582
// Common pre-initialization: start the timer and objecter finisher,
// create the Filer, and start the object cacher threads.
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}
592
// One-time initialization: transition NEW -> INITIALIZING, hook this
// client into the messenger, finish setup (perf counters, admin socket
// commands) and mark the client INITIALIZED.  Always returns 0.
int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
607
608 void Client::_finish_init()
609 {
610 {
611 std::scoped_lock l{client_lock};
612 // logger
613 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
614 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
615 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
616 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
617 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
618 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
619 // average, standard deviation mds/r/w/ latencies
620 plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
621 plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
622 plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
623 plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
624 plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
625 plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
626 plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
627 plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
628 plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
629 logger.reset(plb.create_perf_counters());
630 cct->get_perfcounters_collection()->add(logger.get());
631 }
632
633 cct->_conf.add_observer(this);
634
635 AdminSocket* admin_socket = cct->get_admin_socket();
636 int ret = admin_socket->register_command("mds_requests",
637 &m_command_hook,
638 "show in-progress mds requests");
639 if (ret < 0) {
640 lderr(cct) << "error registering admin socket command: "
641 << cpp_strerror(-ret) << dendl;
642 }
643 ret = admin_socket->register_command("mds_sessions "
644 "name=cap_dump,type=CephBool,req=false",
645 &m_command_hook,
646 "show mds session state");
647 if (ret < 0) {
648 lderr(cct) << "error registering admin socket command: "
649 << cpp_strerror(-ret) << dendl;
650 }
651 ret = admin_socket->register_command("dump_cache",
652 &m_command_hook,
653 "show in-memory metadata cache contents");
654 if (ret < 0) {
655 lderr(cct) << "error registering admin socket command: "
656 << cpp_strerror(-ret) << dendl;
657 }
658 ret = admin_socket->register_command("kick_stale_sessions",
659 &m_command_hook,
660 "kick sessions that were remote reset");
661 if (ret < 0) {
662 lderr(cct) << "error registering admin socket command: "
663 << cpp_strerror(-ret) << dendl;
664 }
665 ret = admin_socket->register_command("status",
666 &m_command_hook,
667 "show overall client status");
668 if (ret < 0) {
669 lderr(cct) << "error registering admin socket command: "
670 << cpp_strerror(-ret) << dendl;
671 }
672 }
673
// Tear the client down: stop the tick thread, close mds sessions, stop
// every callback finisher and the object cacher, then block new readers
// of initialize_state and wait for in-flight ones before shutting down
// the timer, objecter finisher and perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // To make sure the tick thread will be stopped before
    // destructing the Client, just in case the _mount()
    // failed but didn't get a chance to stop the tick thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // each finisher is only running if its callback was ever installed;
  // drain pending work before stopping
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new incoming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
753
// Fold the latency of one completed metadata request into the perf
// counters: the rolling latency averages plus the running mean and
// sum-of-squares used to derive the stdev.
void Client::update_io_stat_metadata(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_md_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_md_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_metadata_request);

  logger->tinc(l_c_lat, latency);
  logger->tinc(l_c_reply, latency);

  // store the running average back in seconds (nsec -> sec)
  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_md_avg, avg);
  logger->set(l_c_md_sqsum, n_sqsum);
  logger->set(l_c_md_ops, nr_metadata_request);
}
773
774 void Client::update_io_stat_read(utime_t latency) {
775 auto lat_nsec = latency.to_nsec();
776 // old values are used to compute new ones
777 auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
778 auto o_sqsum = logger->get(l_c_rd_sqsum);
779
780 auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
781 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
782 nr_read_request);
783
784 logger->tinc(l_c_read, latency);
785
786 utime_t avg;
787 avg.set_from_double(n_avg / 1000000000);
788 logger->tset(l_c_rd_avg, avg);
789 logger->set(l_c_rd_sqsum, n_sqsum);
790 logger->set(l_c_rd_ops, nr_read_request);
791 }
792
793 void Client::update_io_stat_write(utime_t latency) {
794 auto lat_nsec = latency.to_nsec();
795 // old values are used to compute new ones
796 auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
797 auto o_sqsum = logger->get(l_c_wr_sqsum);
798
799 auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
800 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
801 nr_write_request);
802
803 logger->tinc(l_c_wrlat, latency);
804
805 utime_t avg;
806 avg.set_from_double(n_avg / 1000000000);
807 logger->tset(l_c_wr_avg, avg);
808 logger->set(l_c_wr_sqsum, n_sqsum);
809 logger->set(l_c_wr_ops, nr_write_request);
810 }
811
812 // ===================
813 // metadata cache stuff
814
// Shrink the dentry LRU down to client_cache_size (all the way down
// when unmounting).  Optionally asks the kernel to drop its dcache if
// we could not get below the limit, and drops the root inode once
// nothing else references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a pass trims nothing (LRU size stops changing)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
842
// Before reconnecting to the mds of session 's', trim every dentry
// whose inode (or parent directory inode) holds caps from that mds;
// dentries that don't are re-inserted into the LRU at the midpoint.
void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  // put back the dentries we decided to keep
  for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  // the session still holds caps, so ask the kernel to drop its dcache
  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}
872
// Unlink a single dentry as part of cache trimming.  If the dentry has
// an inode, the parent directory first loses its complete/ordered
// state (readdir can no longer be served purely from cache).
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
885
886
// Apply an mds-reported file size / truncation state to the local
// inode.  The truncate_seq ordering decides whether the mds values win
// over ours: a newer seq (or the same seq with a larger size) is
// accepted, and a truncation (prior size > new size) invalidates the
// affected cached data.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // newer truncation epoch, or same epoch with a larger size -> accept
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  // track the mds's truncate_size for the current-or-newer epoch;
  // only regular files carry a truncate_size
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
928
// Apply mds-reported ctime/mtime/atime to the inode, honoring the
// time_warp_seq ordering: with write-ish/exclusive caps held we merge
// (taking maxima at equal warp seq); otherwise the mds values win
// outright whenever its warp seq is not older than ours.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // an older warp seq from the mds that we cannot reconcile locally
    // is unexpected; log loudly
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
980
981 void Client::_fragmap_remove_non_leaves(Inode *in)
982 {
983 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
984 if (!in->dirfragtree.is_leaf(p->first))
985 in->fragmap.erase(p++);
986 else
987 ++p;
988 }
989
990 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
991 {
992 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
993 if (p->second == mds)
994 in->fragmap.erase(p++);
995 else
996 ++p;
997 }
998
/*
 * add_update_inode - create or refresh the cached Inode for an MDS InodeStat.
 *
 * Looks up st->vino in inode_map; allocates a new Inode if absent (also
 * wiring up root/root_ancestor/cwd bookkeeping on first use).  Fields are
 * then merged in, but only when the MDS data is authoritative enough:
 * each group of attributes is updated only if the stat is strictly newer
 * ("new_version") or grants a cap we did not already hold, and never when
 * we hold the corresponding EXCL cap (our locally dirty state wins).
 * Finally the cap itself is registered via add_update_cap() for live
 * (CEPH_NOSNAP) inodes, or folded into snap_caps for snapshot inodes.
 *
 * @param st            decoded inode stat from the MDS reply
 * @param from          timestamp the request was sent (lease baseline)
 * @param session       MDS session the reply arrived on
 * @param request_perms credentials of the originating request
 * @return the cached Inode (never NULL)
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // very first inode we learn about becomes the root
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      // during mount we may walk up past the mount point; track ancestry
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold or have dirtied locally; used below to avoid
  // clobbering state the client is authoritative for
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  bool need_snapdir_attr_refresh = false;
  // ownership/mode attributes — guarded by AUTH caps
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
    need_snapdir_attr_refresh = true;
  }

  // link count — guarded by LINK caps
  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  // timestamps
  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    need_snapdir_attr_refresh = true;
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  // layout and size/truncate state
  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  // xattrs: take MDS copy only if we never had one or hold no XATTR_EXCL,
  // and only if the MDS version is newer
  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
    need_snapdir_attr_refresh = true;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  ldout(cct, 12) << __func__ << " client inode change_attr: " << in->change_attr << " , mds inodestat change_attr: " << st->change_attr << dendl;
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    // max_size/rstat are only meaningful from the auth MDS
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	// the dir is known empty, so any cached dentries are stale
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inode: caps are accumulated, never revoked per-session
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  // propagate attribute changes to the cached snapdir inode, if one exists
  if (need_snapdir_attr_refresh && in->is_dir() && in->snapid == CEPH_NOSNAP) {
    vinodeno_t vino(in->ino, CEPH_SNAPDIR);
    if (inode_map.count(vino)) {
      refresh_snapdir_attrs(inode_map[vino], in);
    }
  }

  return in;
}
1168
1169
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry with the same name already exists and points at the same
 * inode, it is simply touched (LRU refresh).  If it points at a different
 * inode, the stale link is dropped.  A missing or unlinked dentry is then
 * (re)linked to `in`; if this replaces `old_dentry` (rename), the source
 * directory's I_COMPLETE/I_DIR_ORDERED state is invalidated first.
 * The dentry lease from the reply is applied last.
 *
 * @param dir        target directory (already open)
 * @param dname      dentry name within dir
 * @param dlease     lease stat from the MDS reply
 * @param in         inode the dentry should reference
 * @param from       timestamp the request was sent (lease baseline)
 * @param session    MDS session the reply arrived on
 * @param old_dentry rename source dentry to unlink, or NULL
 * @return the (possibly new) linked dentry
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link below so `in` cannot be trimmed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories: source dir ordering is now stale
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1216
1217 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1218 {
1219 utime_t dttl = from;
1220 dttl += (float)dlease->duration_ms / 1000.0;
1221
1222 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
1223
1224 ceph_assert(dn);
1225
1226 if (dlease->mask & CEPH_LEASE_VALID) {
1227 if (dttl > dn->lease_ttl) {
1228 ldout(cct, 10) << "got dentry lease on " << dn->name
1229 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1230 dn->lease_ttl = dttl;
1231 dn->lease_mds = session->mds_num;
1232 dn->lease_seq = dlease->seq;
1233 dn->lease_gen = session->cap_gen;
1234 }
1235 }
1236 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1237 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1238 dn->mark_primary();
1239 dn->alternate_name = std::move(dlease->alternate_name);
1240 }
1241
1242
1243 /*
1244 * update MDS location cache for a single inode
1245 */
1246 void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
1247 {
1248 // auth
1249 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1250 if (dst->auth >= 0) {
1251 in->fragmap[dst->frag] = dst->auth;
1252 } else {
1253 in->fragmap.erase(dst->frag);
1254 }
1255 if (!in->dirfragtree.is_leaf(dst->frag)) {
1256 in->dirfragtree.force_to_leaf(cct, dst->frag);
1257 _fragmap_remove_non_leaves(in);
1258 }
1259
1260 // replicated, only update from auth mds reply
1261 if (from == dst->auth) {
1262 in->dir_replicated = !dst->dist.empty();
1263 if (!dst->dist.empty())
1264 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1265 else
1266 in->frag_repmap.erase(dst->frag);
1267 }
1268 }
1269
1270 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1271 {
1272 if (complete)
1273 diri->dir_release_count++;
1274 else
1275 diri->dir_ordered_count++;
1276 if (diri->flags & I_COMPLETE) {
1277 if (complete) {
1278 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1279 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1280 } else {
1281 if (diri->flags & I_DIR_ORDERED) {
1282 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1283 diri->flags &= ~I_DIR_ORDERED;
1284 }
1285 }
1286 if (diri->dir)
1287 diri->dir->readdir_cache.clear();
1288 }
1289 }
1290
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra buffer (DirStat + dentry/lease/inode triples),
 * updates/links each entry into the dcache, assigns readdir offsets
 * (frag-relative or hash-order), and opportunistically fills the
 * directory's readdir_cache when the dirp's generation counters still
 * match the directory's.  Also advances dirp's cursor state
 * (last_name/next_offset) for the next readdir call.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // newer MDSs use a feature-independent encoding; otherwise fall back to
  // the connection's negotiated feature bits
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry (0/1 are . and ..)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may reply for a different (e.g. merged/split) fragment than
    // the one we asked about
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // starting a full listing from the beginning: snapshot the dir's
    // generation counters so we can later tell if the cache stayed valid
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// hash-ordered offsets restart at 2 within each hash bucket
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // advance the cursor for the next readdir call
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1453
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A "trace" is the dirinode/dentry/target-inode chain attached to an MDS
 * reply.  This decodes it (honoring the session's reply-encoding feature),
 * merges the inodes via add_update_inode(), links/unlinks the dentry as
 * appropriate, and handles the special cases: traceless replies (forces
 * cache invalidation for mutations), snapshot lookups, and readdir/lssnap
 * payloads.  Returns the target inode (or NULL for traceless/unsafe
 * replies); also records it as request->target.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // already applied an earlier (unsafe) reply for this request
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply (e.g. after MDS restart/replay): we cannot trust our
    // cached view of the affected dentry/dir anymore
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // feature bits select the decode format of the trace structures
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: the MDS must include xattrs when we asked for them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  // existing dentry had an inode; drop the stale link
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1628
1629 // -------
1630
/*
 * choose_target_mds - pick which MDS rank a request should be sent to.
 *
 * Selection order: an explicit resend_mds override; random (if configured);
 * the dirfrag-hash mapping for the relevant inode/dentry (preferring a
 * replica for read-only ops, the fragmap/auth cap otherwise); any cap on
 * the inode; and finally a random up MDS.  Snapshot inodes are walked up
 * to their nearest non-snap ancestor first.
 *
 * @param req        the request to route
 * @param phash_diri [out, optional] set to the dir inode whose fragmap
 *                   produced the choice, so the caller can prune a stale
 *                   mapping if that MDS turns out to be stopped
 * @return chosen rank, or MDS_RANK_NONE resolved via the random fallback
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;
  int issued = 0;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit override (e.g. forwarded request) wins outright
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash within its parent directory instead
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (req->get_op() == CEPH_MDS_OP_GETATTR)
      issued = req->inode()->caps_issued();

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best(issued)) {
	// read request: any replica of the fragment will do; pick one at random
	auto repmapit = in->frag_repmap.find(fg);
	if (repmapit != in->frag_repmap.end()) {
	  auto& repmap = repmapit->second;
	  auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
	  mds = repmap.at(r);
	}
      } else if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	req->send_to_auth = true;
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to whichever MDS issued us caps on this inode
    if (in->auth_cap && req->auth_is_best(issued)) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1745
1746 void Client::connect_mds_targets(mds_rank_t mds)
1747 {
1748 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1749 ceph_assert(mds_sessions.count(mds));
1750 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1751 for (const auto &rank : info.export_targets) {
1752 if (mds_sessions.count(rank) == 0 &&
1753 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
1754 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1755 << " export target mds." << rank << dendl;
1756 _open_mds_session(rank);
1757 }
1758 }
1759 }
1760
1761 void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
1762 {
1763 f->dump_int("id", get_nodeid().v);
1764 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1765 f->dump_object("inst", inst);
1766 f->dump_stream("inst_str") << inst;
1767 f->dump_stream("addr_str") << inst.addr;
1768 f->open_array_section("sessions");
1769 for (const auto &p : mds_sessions) {
1770 f->open_object_section("session");
1771 p.second->dump(f, cap_dump);
1772 f->close_section();
1773 }
1774 f->close_section();
1775 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1776 }
1777
1778 void Client::dump_mds_requests(Formatter *f)
1779 {
1780 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1781 p != mds_requests.end();
1782 ++p) {
1783 f->open_object_section("request");
1784 p->second->dump(f);
1785 f->close_section();
1786 }
1787 }
1788
/*
 * verify_reply_trace - resolve the target inode of a completed request,
 * coping with traceless replies.
 *
 * Extracts the created-ino payload (if any) from the reply's extra
 * buffer, reports whether a create actually happened via *pcreated, and
 * fills *ptarget: directly from request->target when a trace was
 * present, otherwise by looking the result up again (lookup by name, or
 * a forced getattr) — see the FIXME below.  If a create's reported ino
 * does not match what the follow-up lookup found, returns -CEPHFS_EINTR.
 *
 * @param r        result code so far (from the reply)
 * @param session  session the reply arrived on
 * @param request  the completed request
 * @param reply    the MDS reply message
 * @param ptarget  [out] target inode
 * @param pcreated [out, optional] whether this request created the inode
 * @param perms    credentials for any follow-up lookup/getattr
 * @return r, possibly replaced by a follow-up lookup/getattr result
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: the trace already resolved the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // traceless create, but the inode is already in our cache
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry at all: refresh the inode the request operated on
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -CEPHFS_EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1873
1874
1875 /**
1876 * make a request
1877 *
1878 * Blocking helper to make an MDS request.
1879 *
1880 * If the ptarget flag is set, behavior changes slightly: the caller
1881 * expects to get a pointer to the inode we are creating or operating
1882 * on. As a result, we will follow up any traceless mutation reply
1883 * with a getattr or lookup to transparently handle a traceless reply
1884 * from the MDS (as when the MDS restarts and the client has to replay
1885 * a request).
1886 *
1887 * @param request the MetaRequest to execute
1888 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1889 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1890 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1891 * @param use_mds [optional] prefer a specific mds (-1 for default)
1892 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1893 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl,
			 size_t feature_needed)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();
  request->created = ceph::coarse_mono_clock::now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests are excluded from oldest-tid tracking
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an MDS, ensure an open session, send, and wait;
  // loop again on forward/kick until we have a reply or are aborted
  MetaSessionRef session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// chosen rank no longer exists: drop the stale fragmap entry that
	// produced it, or retry with a random mds
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-CEPHFS_EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // ULONG_MAX means "no specific feature required"
    if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) {
      request->abort(-CEPHFS_EOPNOTSUPP);
      break;
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // client_lock is already held; adopt it into a unique_lock so the
    // condvar can release it while waiting, then release ownership back
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	           // reply
	      request->resend_mds >= 0 ||  // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // loop exited without a reply: the request must have been aborted
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // resolve the target inode (handles traceless replies) if requested
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;

  ++nr_metadata_request;
  update_io_stat_metadata(lat);

  put_request(request);
  return r;
}
2047
2048 void Client::unregister_request(MetaRequest *req)
2049 {
2050 mds_requests.erase(req->tid);
2051 if (req->tid == oldest_tid) {
2052 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2053 while (true) {
2054 if (p == mds_requests.end()) {
2055 oldest_tid = 0;
2056 break;
2057 }
2058 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2059 oldest_tid = p->first;
2060 break;
2061 }
2062 ++p;
2063 }
2064 }
2065 put_request(req);
2066 }
2067
2068 void Client::put_request(MetaRequest *request)
2069 {
2070 if (request->_put()) {
2071 int op = -1;
2072 if (request->success)
2073 op = request->get_op();
2074 InodeRef other_in;
2075 request->take_other_inode(&other_in);
2076 delete request;
2077
2078 if (other_in &&
2079 (op == CEPH_MDS_OP_RMDIR ||
2080 op == CEPH_MDS_OP_RENAME ||
2081 op == CEPH_MDS_OP_RMSNAP)) {
2082 _try_to_trim_inode(other_in.get(), false);
2083 }
2084 }
2085 }
2086
/*
 * Encode a cap release for inode `in` into request `req`, destined for
 * `mds`.  Caps in `drop` are released unless any cap in `unless` is
 * currently issued; `force` encodes a release record even when nothing
 * is actually dropped.  Returns nonzero iff a release was encoded.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or actively in use; those must go
    // back to the MDS through the normal cap flush path.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      // Nothing droppable; still encode a record if the caller forces it.
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        // No write caps wanted any more, so stop asking the MDS to grow
        // max_size for us.
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // Build the wire-format release record; dname fields are filled in
      // later by encode_dentry_release() when a dentry lease rides along.
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
2132
2133 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2134 mds_rank_t mds, int drop, int unless)
2135 {
2136 ldout(cct, 20) << __func__ << " enter(dn:"
2137 << dn << ")" << dendl;
2138 int released = 0;
2139 if (dn->dir)
2140 released = encode_inode_release(dn->dir->parent_inode, req,
2141 mds, drop, unless, 1);
2142 if (released && dn->lease_mds == mds) {
2143 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2144 auto& rel = req->cap_releases.back();
2145 rel.item.dname_len = dn->name.length();
2146 rel.item.dname_seq = dn->lease_seq;
2147 rel.dname = dn->name;
2148 dn->lease_mds = -1;
2149 }
2150 ldout(cct, 25) << __func__ << " exit(dn:"
2151 << dn << ")" << dendl;
2152 }
2153
2154
2155 /*
2156 * This requires the MClientRequest *request member to be set.
2157 * It will error out horribly without one.
2158 * Additionally, if you set any *drop member, you'd better have
2159 * set the corresponding dentry!
2160 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
                 << req << ", mds: " << mds << ")" << dendl;
  // Encode releases for every inode/dentry the request flagged for
  // dropping.  NOTE: encode_dentry_release() encodes its own parent-dir
  // inode release and then annotates req->cap_releases.back(), so the
  // calls below must not be reordered or interleaved.
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
                         mds, req->inode_drop,
                         req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
                         mds, req->old_inode_drop,
                         req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
                         mds, req->other_inode_drop,
                         req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
                          mds, req->dentry_drop,
                          req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
                          mds, req->old_dentry_drop,
                          req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
                 << req << ", mds " << mds <<dendl;
}
2191
2192 bool Client::have_open_session(mds_rank_t mds)
2193 {
2194 const auto &it = mds_sessions.find(mds);
2195 return it != mds_sessions.end() &&
2196 (it->second->state == MetaSession::STATE_OPEN ||
2197 it->second->state == MetaSession::STATE_STALE);
2198 }
2199
2200 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2201 {
2202 const auto &it = mds_sessions.find(mds);
2203 if (it == mds_sessions.end() || it->second->con != con) {
2204 return NULL;
2205 } else {
2206 return it->second;
2207 }
2208 }
2209
2210 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2211 {
2212 auto it = mds_sessions.find(mds);
2213 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2214 }
2215
2216 /**
2217 * Populate a map of strings with client-identifying metadata,
2218 * such as the hostname. Call this once at initialization.
2219 */
2220 void Client::populate_metadata(const std::string &mount_root)
2221 {
2222 // Hostname
2223 #ifdef _WIN32
2224 // TODO: move this to compat.h
2225 char hostname[64];
2226 DWORD hostname_sz = 64;
2227 GetComputerNameA(hostname, &hostname_sz);
2228 metadata["hostname"] = hostname;
2229 #else
2230 struct utsname u;
2231 int r = uname(&u);
2232 if (r >= 0) {
2233 metadata["hostname"] = u.nodename;
2234 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2235 } else {
2236 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2237 }
2238 #endif
2239
2240 metadata["pid"] = stringify(getpid());
2241
2242 // Ceph entity id (the '0' in "client.0")
2243 metadata["entity_id"] = cct->_conf->name.get_id();
2244
2245 // Our mount position
2246 if (!mount_root.empty()) {
2247 metadata["root"] = mount_root;
2248 }
2249
2250 // Ceph version
2251 metadata["ceph_version"] = pretty_version_to_str();
2252 metadata["ceph_sha1"] = git_version_to_str();
2253
2254 // Apply any metadata from the user's configured overrides
2255 std::vector<std::string> tokens;
2256 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2257 for (const auto &i : tokens) {
2258 auto eqpos = i.find("=");
2259 // Throw out anything that isn't of the form "<str>=<str>"
2260 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2261 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2262 continue;
2263 }
2264 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2265 }
2266 }
2267
2268 /**
2269 * Optionally add or override client metadata fields.
2270 */
2271 void Client::update_metadata(std::string const &k, std::string const &v)
2272 {
2273 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2274 ceph_assert(iref_reader.is_state_satisfied());
2275
2276 std::scoped_lock l(client_lock);
2277
2278 auto it = metadata.find(k);
2279 if (it != metadata.end()) {
2280 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2281 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2282 }
2283
2284 metadata[k] = v;
2285 }
2286
2287 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2288 {
2289 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2290 auto addrs = mdsmap->get_addrs(mds);
2291 auto em = mds_sessions.emplace(std::piecewise_construct,
2292 std::forward_as_tuple(mds),
2293 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2294 ceph_assert(em.second); /* not already present */
2295 auto session = em.first->second;
2296
2297 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2298 m->metadata = metadata;
2299 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2300 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2301 session->con->send_message2(std::move(m));
2302 return session;
2303 }
2304
2305 void Client::_close_mds_session(MetaSession *s)
2306 {
2307 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2308 s->state = MetaSession::STATE_CLOSING;
2309 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2310 }
2311
/*
 * Locally tear down a session.  `err` propagates to the session's caps;
 * `rejected` marks the session REJECTED (kept around so we don't
 * transparently reopen it) instead of CLOSED.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // If the MDS rejected us (and we weren't already closing voluntarily),
  // remember that in the state so the session isn't reopened.
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // Wake anyone blocked on this session; openers will observe the new state.
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // Erasing from mds_sessions may drop the last reference to *s, so this
  // must remain the final use of the session here.
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2328
/*
 * Handle a session-control message from an MDS: open/close acks, cap
 * renewal, staleness, recall, flush, force-readonly and rejection.
 */
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Duplicate OPEN (e.g. a resent ack) — nothing to do.
      if (session->state == MetaSession::STATE_OPEN) {
        ldout(cct, 10) << "mds." << from << " already opened, ignore it"
                       << dendl;
        return;
      }
      /*
       * The connection maybe broken and the session in client side
       * has been reinitialized, need to update the seq anyway.
       */
      if (!session->seq && m->get_seq())
        session->seq = m->get_seq();

      // Refuse to use an MDS that lacks features we require.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
        lderr(cct) << "mds." << from << " lacks required features '"
                   << missing_features << "', closing session " << dendl;
        _close_mds_session(session.get());
        _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
        break;
      }
      session->mds_features = std::move(m->supported_features);
      session->mds_metric_flags = std::move(m->metric_spec.metric_flags);

      renew_caps(session.get());
      session->state = MetaSession::STATE_OPEN;
      if (is_unmounting())
        mount_cond.notify_all();
      else
        connect_mds_targets(from);
      // Unblock requests that were waiting for this session to open.
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session.get());
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the ack that matches our latest renewal request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
        wake_up_session_caps(session.get(), false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session.get());
    break;

  case CEPH_SESSION_RECALL_STATE:
    /*
     * Call the renew caps and flush cap releases just before
     * triming the caps in case the tick() won't get a chance
     * to run them, which could cause the client to be blocklisted
     * and MDS daemons trying to recall the caps again and
     * again.
     *
     * In most cases it will do nothing, and the new cap releases
     * added by trim_caps() followed will be deferred flushing
     * by tick().
     */
    renew_and_flush_cap_releases();
    trim_caps(session.get(), m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    // Ack so the MDS knows everything queued before this point was sent.
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session.get());
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2446
2447 bool Client::_any_stale_sessions() const
2448 {
2449 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2450
2451 for (const auto &p : mds_sessions) {
2452 if (p.second->state == MetaSession::STATE_STALE) {
2453 return true;
2454 }
2455 }
2456
2457 return false;
2458 }
2459
2460 void Client::_kick_stale_sessions()
2461 {
2462 ldout(cct, 1) << __func__ << dendl;
2463
2464 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2465 auto s = it->second;
2466 if (s->state == MetaSession::STATE_REJECTED) {
2467 mds_sessions.erase(it->first);
2468 continue;
2469 }
2470 if (s->state == MetaSession::STATE_STALE)
2471 _closed_mds_session(s.get());
2472 }
2473 }
2474
/*
 * (Re)build the wire message for `request` and send it over `session`.
 * `drop_cap_releases` discards encoded cap releases instead of attaching
 * them (used before cap reconnect has been sent).
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // We already received an unsafe (uncommitted) reply: this is a
    // replay, so flag it and pin the known target ino.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // Tag setxattr with our osdmap epoch — presumably so the MDS can
    // validate pool-related layout xattrs against a map at least as new
    // as ours (TODO confirm against MDS handling).
    objecter->with_osdmap([r](const OSDMap& o) {
      r->set_osdmap_epoch(o.get_epoch());
    });
  }

  // Only the first transmission stamps sent_stamp; resends keep the
  // original so latency is measured end-to-end.
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Record the cap migration seq this send is based on.
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2523
/*
 * Construct the MClientRequest wire message for a MetaRequest.
 * Side effect: bumps request->retry_attempt (the message carries the
 * pre-increment value).
 */
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  // Copy the raw request head wholesale; individual fields (num_fwd,
  // per-op args) are patched below or were set by the caller.
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        // Dentry without an inode (e.g. a create): path is the parent
        // directory's path plus the dentry name.
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or appropriately-endowed dentry given!"
                         << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or dentry given!"
                         << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_alternate_name(request->alternate_name);
  req->set_data(request->data);
  // Post-increment: send the current attempt number, then bump the local
  // counter for the next (re)send.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2562
2563
2564
2565 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2566 {
2567 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2568
2569 std::scoped_lock cl(client_lock);
2570 auto session = _get_mds_session(mds, fwd->get_connection().get());
2571 if (!session) {
2572 return;
2573 }
2574 ceph_tid_t tid = fwd->get_tid();
2575
2576 if (mds_requests.count(tid) == 0) {
2577 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2578 return;
2579 }
2580
2581 MetaRequest *request = mds_requests[tid];
2582 ceph_assert(request);
2583
2584 /*
2585 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2586 * is 'int32_t', while in 'ceph_mds_request_head' the
2587 * type is '__u8'. So in case the request bounces between
2588 * MDSes exceeding 256 times, the client will get stuck.
2589 *
2590 * In this case it's ususally a bug in MDS and continue
2591 * bouncing the request makes no sense.
2592 *
2593 * In future this could be fixed in ceph code, so avoid
2594 * using the hardcode here.
2595 */
2596 int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
2597 max_fwd = 1 << (max_fwd * CHAR_BIT) - 1;
2598 auto num_fwd = fwd->get_num_fwd();
2599 if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
2600 if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
2601 request->abort(-EMULTIHOP);
2602 request->caller_cond->notify_all();
2603 ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
2604 << ", abort it" << dendl;
2605 } else {
2606 ldout(cct, 10) << __func__ << " tid " << tid
2607 << " old fwd seq " << fwd->get_num_fwd()
2608 << " <= req fwd " << request->num_fwd
2609 << ", ignore it" << dendl;
2610 }
2611 return;
2612 }
2613
2614 // reset retry counter
2615 request->retry_attempt = 0;
2616
2617 // request not forwarded, or dest mds has no session.
2618 // resend.
2619 ldout(cct, 10) << __func__ << " tid " << tid
2620 << " fwd " << fwd->get_num_fwd()
2621 << " to mds." << fwd->get_dest_mds()
2622 << ", resending to " << fwd->get_dest_mds()
2623 << dendl;
2624
2625 request->mds = -1;
2626 request->item.remove_myself();
2627 request->num_fwd = num_fwd;
2628 request->resend_mds = fwd->get_dest_mds();
2629 request->caller_cond->notify_all();
2630 }
2631
2632 bool Client::is_dir_operation(MetaRequest *req)
2633 {
2634 int op = req->get_op();
2635 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2636 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2637 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2638 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2639 return true;
2640 return false;
2641 }
2642
/*
 * Handle an MDS reply (unsafe = applied but uncommitted; safe =
 * committed).  Performs a two-condvar handshake with the thread blocked
 * in make_request(): we signal caller_cond and then wait on dispatch_cond
 * until the caller has consumed the reply.
 */
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  // NOTE(review): upstream also retargets the request when the reply is
  // -CEPHFS_ESTALE (retry against the auth MDS); confirm that path is
  // intentionally absent in this version.

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  // Update our cache (inodes/dentries/caps) from the reply trace before
  // waking the caller.
  insert_trace(request, session.get());

  // Handle unsafe reply
  if (!is_safe) {
    // Track the uncommitted op on the session, the directory, and the
    // target inode so fsync/unmount can wait for it to become safe.
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    // Adopt client_lock into a unique_lock for the wait, then release it
    // un-unlocked so the scoped_lock above still owns it on exit.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    l.release();
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (is_unmounting())
    mount_cond.notify_all();
}
2729
2730 void Client::_handle_full_flag(int64_t pool)
2731 {
2732 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2733 << "on " << pool << dendl;
2734 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2735 // to do this rather than blocking, because otherwise when we fill up we
2736 // potentially lock caps forever on files with dirty pages, and we need
2737 // to be able to release those caps to the MDS so that it can delete files
2738 // and free up space.
2739 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2740
2741 // For all inodes with layouts in this pool and a pending flush write op
2742 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2743 // from ObjectCacher so that it doesn't re-issue the write in response to
2744 // the ENOSPC error.
2745 // Fortunately since we're cancelling everything in a given pool, we don't
2746 // need to know which ops belong to which ObjectSet, we can just blow all
2747 // the un-flushed cached data away and mark any dirty inodes' async_err
2748 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2749 // affecting this pool, and all the objectsets we're purging were also
2750 // in this pool.
2751 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2752 i != inode_map.end(); ++i)
2753 {
2754 Inode *inode = i->second;
2755 if (inode->oset.dirty_or_tx
2756 && (pool == -1 || inode->layout.pool_id == pool)) {
2757 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2758 << " has dirty objects, purging and setting ENOSPC" << dendl;
2759 objectcacher->purge_set(&inode->oset);
2760 inode->set_async_err(-CEPHFS_ENOSPC);
2761 }
2762 }
2763
2764 if (cancelled_epoch != (epoch_t)-1) {
2765 set_cap_epoch_barrier(cancelled_epoch);
2766 }
2767 }
2768
/*
 * React to a new OSDMap: detect (un)blocklisting of this client and
 * propagate FULL flags (global or per-pool) by cancelling writes.
 */
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::scoped_lock cl(client_lock);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blocklist = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.is_blocklisted(myaddrs);
    });

  // Newly blocklisted: abort MDS sessions and fail outstanding writes so
  // unmount can proceed even on an unhealthy cluster.
  if (new_blocklist && !blocklisted) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
      return o.get_epoch();
    });
    lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
    blocklisted = true;

    _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);

  }

  if (blocklisted) {
    // Handle case where we were blocklisted but no longer are
    blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
      return o.is_blocklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blocklisted client
  // until this client is not blocklisted.
  if (blocklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool.
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
      for (const auto& kv : o.get_pools()) {
        if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
          full_pools.push_back(kv.first);
        }
      }
    });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2835
2836
2837 // ------------------------
2838 // incoming messages
2839
2840
/*
 * Messenger entry point: route incoming messages to their handlers.
 * Returns false for message types we don't consume.  While unmounting,
 * every dispatched message also drives a cache-trim pass so unmount()
 * makes progress.
 */
bool Client::ms_dispatch2(const MessageRef &m)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied()) {
    // Not initialized (or shutting down): swallow the message.
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Only MDS command replies are ours; others go to the next dispatcher.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  std::scoped_lock cl(client_lock);
  if (is_unmounting()) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    uint64_t size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      // The cache shrank; wake unmount() so it can re-check for completion.
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2923
2924 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2925 {
2926 std::scoped_lock cl(client_lock);
2927 fsmap.reset(new FSMap(m->get_fsmap()));
2928
2929 signal_cond_list(waiting_for_fsmap);
2930
2931 monclient->sub_got("fsmap", fsmap->get_epoch());
2932 }
2933
2934 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2935 {
2936 std::scoped_lock cl(client_lock);
2937 fsmap_user.reset(new FSMapUser);
2938 *fsmap_user = m->get_fsmap();
2939
2940 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2941 signal_cond_list(waiting_for_fsmap);
2942 }
2943
2944 // Cancel all the commands for missing or laggy GIDs
2945 void Client::cancel_commands(const MDSMap& newmap)
2946 {
2947 std::vector<ceph_tid_t> cancel_ops;
2948
2949 std::scoped_lock cmd_lock(command_lock);
2950 auto &commands = command_table.get_commands();
2951 for (const auto &[tid, op] : commands) {
2952 const mds_gid_t op_mds_gid = op.mds_gid;
2953 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2954 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2955 cancel_ops.push_back(tid);
2956 if (op.outs) {
2957 std::ostringstream ss;
2958 ss << "MDS " << op_mds_gid << " went away";
2959 *(op.outs) = ss.str();
2960 }
2961 /*
2962 * No need to make the con->mark_down under
2963 * client_lock here, because the con will
2964 * has its own lock.
2965 */
2966 op.con->mark_down();
2967 if (op.on_finish)
2968 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2969 }
2970 }
2971
2972 for (const auto &tid : cancel_ops)
2973 command_table.erase(tid);
2974 }
2975
/*
 * Install a newer MDSMap and walk every session, reacting to per-rank
 * state transitions (down, address change, reconnect, active, removed).
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // Decode and cancel commands without holding client_lock (the command
  // completions take their own locks).
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After the swap, mdsmap is the new map and _mdsmap holds the old one.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    // Hold a ref and advance now: handlers below may erase this entry.
    MetaSessionRef session = p->second;
    ++p;

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        // A new daemon took over this rank; treat prior state as unknown.
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session.get());
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      // MDS is replaying; reconnect and replay our caps/requests to it.
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session.get());
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        // Jumped past RECONNECT without us participating: our session is
        // unusable now.
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session.get());
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session.get());
          kick_flushing_caps(session.get());
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session.get(), true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists (cluster shrank); drop the session.
      _closed_mds_session(session.get());
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
3053
void Client::send_reconnect(MetaSession *session)
{
  // Rebuild our state on a recovering MDS: reset per-session cap state,
  // resend unsafe requests, then describe every cap (and snaprealm) we hold
  // from this rank in one or more MClientReconnect messages.
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // Newer MDSes allow the reconnect payload to be split across messages.
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        // Current message is getting large; flush this chunk and continue
        // in a fresh one.
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        // Cap was invalidated while the session was stale; keep only PIN.
        cap.gen = session->cap_gen;
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3147
3148
3149 void Client::kick_requests(MetaSession *session)
3150 {
3151 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3152 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3153 p != mds_requests.end();
3154 ++p) {
3155 MetaRequest *req = p->second;
3156 if (req->got_unsafe)
3157 continue;
3158 if (req->aborted()) {
3159 if (req->caller_cond) {
3160 req->kick = true;
3161 req->caller_cond->notify_all();
3162 }
3163 continue;
3164 }
3165 if (req->retry_attempt > 0)
3166 continue; // new requests only
3167 if (req->mds == session->mds_num) {
3168 send_request(p->second, session);
3169 }
3170 }
3171 }
3172
3173 void Client::resend_unsafe_requests(MetaSession *session)
3174 {
3175 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3176 !iter.end();
3177 ++iter)
3178 send_request(*iter, session);
3179
3180 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3181 // process completed requests in clientreplay stage.
3182 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3183 p != mds_requests.end();
3184 ++p) {
3185 MetaRequest *req = p->second;
3186 if (req->got_unsafe)
3187 continue;
3188 if (req->aborted())
3189 continue;
3190 if (req->retry_attempt == 0)
3191 continue; // old requests only
3192 if (req->mds == session->mds_num)
3193 send_request(req, session, true);
3194 }
3195 }
3196
3197 void Client::wait_unsafe_requests()
3198 {
3199 list<MetaRequest*> last_unsafe_reqs;
3200 for (const auto &p : mds_sessions) {
3201 const auto s = p.second;
3202 if (!s->unsafe_requests.empty()) {
3203 MetaRequest *req = s->unsafe_requests.back();
3204 req->get();
3205 last_unsafe_reqs.push_back(req);
3206 }
3207 }
3208
3209 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3210 p != last_unsafe_reqs.end();
3211 ++p) {
3212 MetaRequest *req = *p;
3213 if (req->unsafe_item.is_on_list())
3214 wait_on_list(req->waitfor_safe);
3215 put_request(req);
3216 }
3217 }
3218
void Client::kick_requests_closed(MetaSession *session)
{
  // The session is going away for good: wake every caller waiting on a
  // request to this mds, and drop unsafe (uncommitted) requests, marking
  // the affected inodes with EIO since their updates may be lost.
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance before unregister_request() below can erase this entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;  // tell the blocked caller to re-evaluate
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();  // off the session's request list
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        if (is_dir_operation(req)) {
          Inode *dir = req->inode();
          ceph_assert(dir);
          dir->set_async_err(-CEPHFS_EIO);  // surface the lost update on the dir
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     << dir->ino << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        if (req->target) {
          InodeRef &in = req->target;
          in->set_async_err(-CEPHFS_EIO);  // surface the lost update on the target
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     << in->ino << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  // All of this session's request lists must now be empty.
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
3258
3259
3260
3261
3262 /************
3263 * leases
3264 */
3265
3266 void Client::got_mds_push(MetaSession *s)
3267 {
3268 s->seq++;
3269 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3270 if (s->state == MetaSession::STATE_CLOSING) {
3271 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3272 }
3273 }
3274
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  // The MDS is revoking a dentry lease.  Invalidate our cached lease if we
  // still have the dentry, and always acknowledge with a LEASE_RELEASE.
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // We no longer cache the inode; nothing to invalidate, just ack.
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // lease no longer valid
  }

 revoke:
  // Always acknowledge the revoke, even when we had nothing cached.
  {
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3318
void Client::_put_inode(Inode *in, int n)
{
  // Drop @n references from @in.  When only the inode_map's own reference
  // would remain, tear the inode down completely.
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  int left = in->get_nref();
  // Caller must hold at least n refs plus the inode_map's ref.
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);  // all cached data must already be clean
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (root == nullptr) {
      // The tree is gone; clear the cached root ancestry too.
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    in->iput();  // drop the inode_map reference; frees the inode
  }
}
3347
3348 void Client::delay_put_inodes(bool wakeup)
3349 {
3350 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3351
3352 std::map<Inode*,int> release;
3353 {
3354 std::scoped_lock dl(delay_i_lock);
3355 release.swap(delay_i_release);
3356 }
3357
3358 if (release.empty())
3359 return;
3360
3361 for (auto &[in, cnt] : release)
3362 _put_inode(in, cnt);
3363
3364 if (wakeup)
3365 mount_cond.notify_all();
3366 }
3367
3368 void Client::put_inode(Inode *in, int n)
3369 {
3370 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3371
3372 std::scoped_lock dl(delay_i_lock);
3373 delay_i_release[in] += n;
3374 }
3375
3376 void Client::close_dir(Dir *dir)
3377 {
3378 Inode *in = dir->parent_inode;
3379 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3380 ceph_assert(dir->is_empty());
3381 ceph_assert(in->dir == dir);
3382 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3383 if (!in->dentries.empty())
3384 in->get_first_parent()->put(); // unpin dentry
3385
3386 delete in->dir;
3387 in->dir = 0;
3388 put_inode(in); // unpin inode
3389 }
3390
3391 /**
3392 * Don't call this with in==NULL, use get_or_create for that
3393 * leave dn set to default NULL unless you're trying to add
3394 * a new inode to a pre-created Dentry
3395 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry: it must not already point at an inode
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      // must be a genuinely different (dir, name) pair
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent's listing is no longer complete
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3431
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Detach @dn from its inode and, unless @keepdentry, destroy the dentry.
  // @keepdir prevents close_dir() even if the Dir becomes empty.
  InodeRef in(dn->inode);  // hold a ref so the inode survives this function
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;  // keep the dentry but invalidate its lease
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3462
3463 /**
3464 * For asynchronous flushes, check for errors from the IO and
3465 * update the inode if necessary
3466 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pinned so the inode outlives the async flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Runs when the flush completes; records any error on the inode so a
  // later operation (e.g. fsync/close) can report it.
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3484
3485
3486 /****
3487 * caps
3488 */
3489
3490 void Client::get_cap_ref(Inode *in, int cap)
3491 {
3492 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3493 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3494 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3495 in->iget();
3496 }
3497 if ((cap & CEPH_CAP_FILE_CACHE) &&
3498 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3499 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3500 in->iget();
3501 }
3502 in->get_cap_ref(cap);
3503 }
3504
void Client::put_cap_ref(Inode *in, int cap)
{
  // Drop cap references.  When the last ref of a cap bit goes away, finish
  // any pending cap_snap, release the inode pins taken in get_cap_ref(),
  // and run check_caps() so no-longer-needed caps are returned to the MDS.
  int last = in->put_cap_ref(cap);  // bits whose refcount just hit zero
  if (last) {
    int put_nref = 0;  // number of inode pins (from get_cap_ref) to drop
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        // The write that the newest cap_snap was waiting on is done.
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // All buffered data is written back; clear per-snap dirty flags.
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3538
3539 // get caps for a given file handle -- the inode should have @need caps
3540 // issued by the mds and @want caps not revoked (or not under revocation).
3541 // this routine blocks till the cap requirement is satisfied. also account
3542 // (track) for capability hit when required (when cap requirement succeedes).
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  // Verify we may do the requested I/O against the data pool first.
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  // Loop until the cap requirement is satisfied or a hard error occurs;
  // each failed pass blocks on the relevant wait list.
  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // Our open file modes no longer cover @need -- not transient, fail.
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // A stale fh generation invalidates write handles.
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // Request a larger max_size from the MDS ahead of the write when
        // the target offset is at/past the grant (or past twice the size).
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);  // sends the max_size request to the MDS
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        // Can't write past max_size until the MDS grants more.
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        // Dirty snapshotted data must be flushed before new writes proceed.
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // Success: take refs on @need, report @need plus any non-revoking
        // wanted caps through *phave, and account a cap hit.
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    // Writes against a read-only session can never succeed.
    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // Our caps were dropped (e.g. after a session reset); ask the MDS
      // for them again before continuing to wait.
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3647
3648 int Client::get_caps_used(Inode *in)
3649 {
3650 unsigned used = in->caps_used();
3651 if (!(used & CEPH_CAP_FILE_CACHE) &&
3652 !objectcacher->set_is_empty(&in->oset))
3653 used |= CEPH_CAP_FILE_CACHE;
3654 return used;
3655 }
3656
3657 void Client::cap_delay_requeue(Inode *in)
3658 {
3659 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3660
3661 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
3662 delayed_list.push_back(&in->delay_cap_item);
3663 }
3664
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  // Build and send one CEPH_CAP_OP_UPDATE for @cap: drop bits not in
  // @retain, report the inode's current metadata, and carry @flush (dirty)
  // caps tagged with @flush_tid.
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never ask to retain bits already being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    // Test-only fault injection path (config option).
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // xattrs travel with the message only when they are being flushed.
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // Flag a cap_snap that has not been flushed yet (flush_tid == 0).
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth cap negotiates max_size with the MDS.
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3782
3783 static bool is_max_size_approaching(Inode *in)
3784 {
3785 /* mds will adjust max size according to the reported size */
3786 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3787 return false;
3788 if (in->size >= in->max_size)
3789 return true;
3790 /* half of previous max_size increment has been used */
3791 if (in->max_size > in->reported_size &&
3792 (in->size << 1) >= in->max_size + in->reported_size)
3793 return true;
3794 return false;
3795 }
3796
3797 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3798 {
3799 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3800 return used;
3801 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3802 return used;
3803
3804 if (issued & CEPH_CAP_FILE_LAZYIO) {
3805 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3806 used &= ~CEPH_CAP_FILE_CACHE;
3807 used |= CEPH_CAP_FILE_LAZYIO;
3808 }
3809 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3810 used &= ~CEPH_CAP_FILE_BUFFER;
3811 used |= CEPH_CAP_FILE_LAZYIO;
3812 }
3813 } else {
3814 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3815 used &= ~CEPH_CAP_FILE_CACHE;
3816 used |= CEPH_CAP_FILE_LAZYIO;
3817 }
3818 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3819 used &= ~CEPH_CAP_FILE_BUFFER;
3820 used |= CEPH_CAP_FILE_LAZYIO;
3821 }
3822 }
3823 return used;
3824 }
3825
3826 /**
3827 * check_caps
3828 *
3829 * Examine currently used and wanted versus held caps. Release, flush or ack
3830 * revoked caps to the MDS as appropriate.
3831 *
3832 * @param in the inode to check
3833 * @param flags flags to apply to cap check
3834 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Decide which caps to keep; beyond wanted/used we retain extra bits
  // while mounted and the inode is live.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // Try dropping clean cached data if CACHE/LAZYIO is being revoked and
  // there is no buffered (dirty) data in the way.
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  // Per-cap (per-mds) pass: decide whether each cap needs an update sent.
  for (auto &[mds, cap] : in->caps) {
    auto session = mds_sessions.at(mds);

    cap_used = used;
    // Usage covered by the auth cap doesn't pin non-auth caps.
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap.issued)
             << " implemented " << ccap_string(cap.implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // Need a bigger max_size grant from the auth mds?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    // Auth-cap housekeeping before sending: re-issue dropped cap flushes
    // and flush any cap_snap that has not been sent yet.
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session.get());
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    in->delay_cap_item.remove_myself();
    send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3970
3971
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  // If the inode has dirty state at a snapshot boundary, freeze that state
  // into a CapSnap keyed by old_snapc.seq so it can be flushed to the MDS.
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // The newest cap_snap is still waiting for its writes; don't stack
    // another one on top.
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
            (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    // Snapshot the current metadata and dirty state.
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // A write is in flight; finish_cap_snap() runs when the last WR
      // cap ref is put (see put_cap_ref).
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
4013
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  // Capture the inode's final size/time attributes into @capsnap and either
  // flush it now or defer until buffered data has been written back.
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // Dirty buffers still pending; the flush happens once they are
    // written back (see put_cap_ref's FILE_BUFFER handling).
    capsnap.writing = 1;
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
             << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
4045
4046 void Client::send_flush_snap(Inode *in, MetaSession *session,
4047 snapid_t follows, CapSnap& capsnap)
4048 {
4049 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4050 in->ino, in->snaprealm->ino, 0,
4051 in->auth_cap->mseq, cap_epoch_barrier);
4052 m->caller_uid = capsnap.cap_dirtier_uid;
4053 m->caller_gid = capsnap.cap_dirtier_gid;
4054
4055 m->set_client_tid(capsnap.flush_tid);
4056 m->head.snap_follows = follows;
4057
4058 m->head.caps = capsnap.issued;
4059 m->head.dirty = capsnap.dirty;
4060
4061 m->head.uid = capsnap.uid;
4062 m->head.gid = capsnap.gid;
4063 m->head.mode = capsnap.mode;
4064 m->btime = capsnap.btime;
4065
4066 m->size = capsnap.size;
4067
4068 m->head.xattr_version = capsnap.xattr_version;
4069 encode(capsnap.xattrs, m->xattrbl);
4070
4071 m->ctime = capsnap.ctime;
4072 m->btime = capsnap.btime;
4073 m->mtime = capsnap.mtime;
4074 m->atime = capsnap.atime;
4075 m->time_warp_seq = capsnap.time_warp_seq;
4076 m->change_attr = capsnap.change_attr;
4077
4078 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4079 m->inline_version = in->inline_version;
4080 m->inline_data = in->inline_data;
4081 }
4082
4083 ceph_assert(!session->flushing_caps_tids.empty());
4084 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4085
4086 session->con->send_message2(std::move(m));
4087 }
4088
/*
 * Send any not-yet-flushed CapSnaps for this inode to the auth MDS.
 *
 * Snaps are flushed oldest-first; on hitting one that still has dirty
 * buffered data or an in-progress write we stop, since later snaps must
 * not be flushed ahead of it.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // first not-ready snap stops the whole scan (ordering requirement)
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // allocate a flush tid and register it with both session and inode
    // so the FLUSHSNAP ack can be matched up later; the value 0 in
    // flushing_cap_tids marks this tid as a snap flush (cf. the
    // !p.second test in kick_flushing_caps())
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
4123
/*
 * Block the calling thread on 'ls' until signal_cond_list() wakes it.
 *
 * client_lock must be held on entry.  We adopt it into a unique_lock so
 * cond.wait() can atomically drop and reacquire it, then release() so
 * the destructor does not unlock it — the caller still holds the mutex
 * on return.  No predicate is used, so spurious wakeups are possible;
 * callers are expected to re-check their condition in a loop.
 */
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
4133
4134 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4135 {
4136 for (auto cond : ls) {
4137 cond->notify_all();
4138 }
4139 }
4140
/*
 * Block until a C_Cond queued on 'ls' is completed (via
 * signal_context_list()).
 *
 * NOTE(review): C_Cond is assumed to set *done, store its completion
 * code in *r, and notify 'cond' when completed — confirm against
 * C_Cond's definition.  The queued context is deleted by whoever
 * completes it; 'r' is ignored here.  client_lock must be held on
 * entry; it is adopted, dropped while waiting, and still held (not
 * unlocked by the destructor, thanks to release()) on return.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
4151
4152 void Client::signal_context_list(list<Context*>& ls)
4153 {
4154 while (!ls.empty()) {
4155 ls.front()->complete(0);
4156 ls.pop_front();
4157 }
4158 }
4159
/*
 * Wake every thread waiting for caps on any inode holding a cap from
 * session 's'.
 *
 * reconnect=true: we are reconnecting to the MDS, so forget outstanding
 * max_size requests (they must be re-issued).
 * reconnect=false: the session was merely stale; any cap the MDS did
 * not re-issue (gen behind the session's cap_gen) is downgraded to PIN,
 * and I_CAP_DROPPED is flagged if we still want bits the MDS no longer
 * knows about.
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
4179
4180
4181 // flush dirty data (from objectcache)
4182
4183 class C_Client_CacheInvalidate : public Context {
4184 private:
4185 Client *client;
4186 vinodeno_t ino;
4187 int64_t offset, length;
4188 public:
4189 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4190 client(c), offset(off), length(len) {
4191 if (client->use_faked_inos())
4192 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4193 else
4194 ino = in->vino();
4195 }
4196 void finish(int r) override {
4197 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4198 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4199 client->_async_invalidate(ino, offset, length);
4200 }
4201 };
4202
4203 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4204 {
4205 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4206 if (!mref_reader.is_state_satisfied())
4207 return;
4208
4209 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4210 ino_invalidate_cb(callback_handle, ino, off, len);
4211 }
4212
4213 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4214
4215 if (ino_invalidate_cb)
4216 // we queue the invalidate, which calls the callback and decrements the ref
4217 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4218 }
4219
/*
 * Drop this inode's entire cached data set from the userspace object
 * cacher (clean data only; an error is logged if anything remains) and
 * schedule a kernel page-cache invalidation for it as well.
 */
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // off=0, len=0 — NOTE(review): assumed to mean "the whole file" to the
  // registered invalidate callback; confirm against the callback's contract
  _schedule_invalidate_callback(in, 0, 0);
}
4233
/*
 * Invalidate a byte range of this inode's cached data: discard the
 * matching extents from the object cacher (including writeback data)
 * and schedule a kernel page-cache invalidation for the same range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    // map the file byte range onto object extents before discarding
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
4247
4248 bool Client::_release(Inode *in)
4249 {
4250 ldout(cct, 20) << "_release " << *in << dendl;
4251 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4252 _invalidate_inode_cache(in);
4253 return true;
4254 }
4255 return false;
4256 }
4257
4258 bool Client::_flush(Inode *in, Context *onfinish)
4259 {
4260 ldout(cct, 10) << "_flush " << *in << dendl;
4261
4262 if (!in->oset.dirty_or_tx) {
4263 ldout(cct, 10) << " nothing to flush" << dendl;
4264 onfinish->complete(0);
4265 return true;
4266 }
4267
4268 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4269 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4270 objectcacher->purge_set(&in->oset);
4271 if (onfinish) {
4272 onfinish->complete(-CEPHFS_ENOSPC);
4273 }
4274 return true;
4275 }
4276
4277 return objectcacher->flush_set(&in->oset, onfinish);
4278 }
4279
/*
 * Synchronously flush dirty buffered data in [offset, offset+size) for
 * this inode.  client_lock (which must be held on entry) is dropped
 * while waiting for the object cacher to finish writeback, then
 * reacquired before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // file_flush returning false means the flush is asynchronous and
  // onflush fires when it completes
  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
4298
4299 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4300 {
4301 // std::scoped_lock l(client_lock);
4302 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4303 Inode *in = static_cast<Inode *>(oset->parent);
4304 ceph_assert(in);
4305 _flushed(in);
4306 }
4307
/*
 * Buffered writeback for this inode has finished: drop the Fc/Fb cap
 * references that were pinned while data sat dirty in the object
 * cacher.
 */
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4314
4315
4316
// checks common to add_update_cap, handle_cap_grant
// Bumps generation counters when cap grants change in ways that make
// previously cached state stale.
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  // FILE_CACHE went from not-issued to issued: bump the cache generation
  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  // FILE_SHARED toggled either way
  if ((issued & CEPH_CAP_FILE_SHARED) !=
      (had & CEPH_CAP_FILE_SHARED)) {
    if (issued & CEPH_CAP_FILE_SHARED)
      in->shared_gen++;
    // directory contents may have changed while we lacked Fs
    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
4334
/*
 * Record a cap grant from mds_session for 'in', creating a new Cap
 * entry or merging into an existing one.
 *
 * Side effects: attaches the inode to its snap realm on the first cap
 * (and migrates realms when the auth cap reports a different one), may
 * switch the inode's auth cap when the auth MDS changes (migrating
 * in-flight flushes to the new session), and wakes cap waiters when
 * genuinely new bits were issued.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap: link the inode into its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode now belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // a cap from this MDS already existed; merge into it
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // brand-new cap: account for it
    inc_pinned_icaps();
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as auth if it is newer by migrate seq
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migrate seq replaces 'wanted'; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4433
/*
 * Remove 'cap' from its inode and session.  When queue_release, a cap
 * release is queued so the MDS learns we dropped it; otherwise the
 * pinned-caps counter is decremented directly.  Clears auth_cap (and
 * flushing-list membership) if this was the auth cap, and detaches the
 * inode from its snap realm when the last cap goes away.  'cap' is
 * destroyed by the map erase and must not be used after this call.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  } else {
    dec_pinned_icaps();
  }


  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys the Cap object itself
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4472
4473 void Client::remove_all_caps(Inode *in)
4474 {
4475 while (!in->caps.empty())
4476 remove_cap(&in->caps.begin()->second, true);
4477 }
4478
/*
 * Drop every cap held through session 's' (session reset, eviction/
 * blocklist, or forced teardown).  Dirty and in-flight cap state is
 * discarded with an error logged, held file locks are flagged broken
 * (I_ERROR_FILELOCK), and cached data is purged (recording 'err' on
 * inodes that lose dirty data to a blocklist) or released.  Cap waiters
 * are woken so they can observe the failure.
 */
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // bitwise-or of the two cap masks collapsed into a bool: true
      // whenever either dirty or flushing caps exist
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // NOTE(review): presumably drops the inode ref held while caps
      // were dirty/flushing — confirm against mark_caps_dirty()
      put_inode(in.get());
    }
    // if cached/buffered data can no longer be backed by caps, drop it
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -CEPHFS_EBLOCKLISTED) {
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4528
4529 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4530 {
4531 uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure");
4532 bool abort_on_failure = false;
4533
4534 errno = 0;
4535 int r = remount_cb(callback_handle);
4536 if (r == 0) {
4537 retries_on_invalidate = 0;
4538 } else {
4539 int e = errno;
4540 client_t whoami = get_nodeid();
4541 if (r == -1) {
4542 lderr(cct) <<
4543 "failed to remount (to trim kernel dentries): "
4544 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4545 } else {
4546 lderr(cct) <<
4547 "failed to remount (to trim kernel dentries): "
4548 "return code = " << r << dendl;
4549 }
4550 bool should_abort =
4551 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4552 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4553 !(retry_on_error && (++retries_on_invalidate < max_retries));
4554 if (should_abort && !is_unmounting()) {
4555 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4556 abort_on_failure = true;
4557 }
4558 }
4559 return std::make_pair(r, abort_on_failure);
4560 }
4561
4562 class C_Client_Remount : public Context {
4563 private:
4564 Client *client;
4565 public:
4566 explicit C_Client_Remount(Client *c) : client(c) {}
4567 void finish(int r) override {
4568 ceph_assert(r == 0);
4569 auto result = client->_do_remount(true);
4570 if (result.second) {
4571 ceph_abort();
4572 }
4573 }
4574 };
4575
4576 void Client::_invalidate_kernel_dcache()
4577 {
4578 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4579 if (!mref_reader.is_state_satisfied())
4580 return;
4581
4582 if (can_invalidate_dentries) {
4583 if (dentry_invalidate_cb && root->dir) {
4584 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4585 p != root->dir->dentries.end();
4586 ++p) {
4587 if (p->second->inode)
4588 _schedule_invalidate_dentry_callback(p->second, false);
4589 }
4590 }
4591 } else if (remount_cb) {
4592 // Hacky:
4593 // when remounting a file system, linux kernel trims all unused dentries in the fs
4594 remount_finisher.queue(new C_Client_Remount(this));
4595 }
4596 }
4597
/*
 * If every dentry in this directory is a null (negative) dentry, unlink
 * the expireable ones and close the Dir once it empties.  Recurses into
 * the snapdir if one is open on this inode.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink() potentially erases the current entry
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false); // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4622
4623 class C_Client_CacheRelease : public Context {
4624 private:
4625 Client *client;
4626 vinodeno_t ino;
4627 public:
4628 C_Client_CacheRelease(Client *c, Inode *in) :
4629 client(c) {
4630 if (client->use_faked_inos())
4631 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4632 else
4633 ino = in->vino();
4634 }
4635 void finish(int r) override {
4636 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4637 client->_async_inode_release(ino);
4638 }
4639 };
4640
4641 void Client::_async_inode_release(vinodeno_t ino)
4642 {
4643 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4644 if (!mref_reader.is_state_satisfied())
4645 return;
4646
4647 ldout(cct, 10) << __func__ << " " << ino << dendl;
4648 ino_release_cb(callback_handle, ino);
4649 }
4650
4651 void Client::_schedule_ino_release_callback(Inode *in) {
4652
4653 if (ino_release_cb)
4654 // we queue the invalidate, which calls the callback and decrements the ref
4655 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4656 }
4657
/*
 * Try to shrink the number of caps held via session 's' down to 'max'
 * (as requested by the MDS).  Disposable non-auth caps are released
 * outright; for other caps we try to make the inode freeable by
 * trimming its expireable dentries.  If too many caps remain at the
 * end, fall back to invalidating the kernel dcache wholesale.
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
                 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      // auth (or only) cap: try to free the inode by expiring dentries
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        // advance before trimming can unlink the current dentry
        ++q;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      // only count the inode as trimmed if every dentry was expireable
      if (all && in->ino != CEPH_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4728
/*
 * Mark the session read-only and wake any cap waiters that want write
 * caps on its inodes, so they can re-evaluate and observe the
 * read-only state.
 */
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    auto &in = (*p)->inode;
    if (in.caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in.waitfor_caps);
  }
}
4738
/*
 * Transition this inode's dirty caps into the flushing state: allocate
 * a flush tid, record which caps it covers, and register the tid with
 * the auth session so the flush ack can be matched later.  Returns the
 * cap mask being flushed and stores the tid in *ptid.
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4766
4767 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4768 {
4769 for (auto &p : in->cap_snaps) {
4770 CapSnap &capsnap = p.second;
4771 if (capsnap.flush_tid > 0) {
4772 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4773 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4774 }
4775 }
4776 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4777 it != in->flushing_cap_tids.end();
4778 ++it) {
4779 old_s->flushing_caps_tids.erase(it->first);
4780 new_s->flushing_caps_tids.insert(it->first);
4781 }
4782 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4783 }
4784
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // walk every session's dirty list; only the final check_caps is made
  // synchronous, so there is a single wait rather than one per inode
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance first: check_caps() may unlink 'in' from dirty_list
      // (NOTE(review): inferred — confirm against check_caps())
      ++p;
      if (p.end())
        flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4807
/*
 * Wait until all of this inode's in-flight cap flushes with tid <=
 * 'want' have been acked (i.e. removed from flushing_cap_tids).
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    // oldest outstanding tid is already newer than what we wait for
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4821
/*
 * Wait until every session has acked all cap flushes with tid <=
 * 'want'.  After each wakeup the scan restarts from the top, since
 * sessions' flushing sets change while we sleep on sync_cond.
 */
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    auto s = p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      // adopt the already-held client_lock for the wait; release() keeps
      // it held when wait() returns
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4842
/*
 * Re-send all of this inode's outstanding cap flushes and snap flushes
 * to the auth MDS (e.g. after session reconnect).  Entries in
 * flushing_cap_tids with a zero cap mask are snap flushes; the rest are
 * regular cap flushes.  Regular flushes older than the newest snap
 * flush are tagged FLAG_PENDING_CAPSNAP so the MDS orders them
 * correctly relative to the snaps.
 */
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest (largest-tid) snap flush, if any
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // snap flushes appear in tid order, mirroring cap_snaps iteration
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4876
/*
 * Re-send flushes for every inode on this session that was flagged
 * I_KICK_FLUSH (see early_kick_flushing_caps()).
 */
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4890
/*
 * During reconnect: immediately re-send cap flushes whose caps were
 * revoked while the session was down (so the MDS applies them before
 * re-issuing those caps to another client).  Flushes still fully
 * covered by the issued caps are only flagged I_KICK_FLUSH, to be
 * re-sent by kick_flushing_caps() once the reconnect completes.
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4918
4919 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4920 {
4921 list<SnapRealm*> q;
4922 q.push_back(realm);
4923
4924 while (!q.empty()) {
4925 realm = q.front();
4926 q.pop_front();
4927
4928 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4929 realm->invalidate_cache();
4930
4931 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4932 p != realm->pchildren.end();
4933 ++p)
4934 q.push_back(*p);
4935 }
4936 }
4937
/*
 * Look up (creating if absent) the SnapRealm for ino 'r' and take a
 * reference on it.  Callers must pair this with put_snap_realm().
 */
SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  // operator[] default-inserts a null pointer on first lookup
  SnapRealm *realm = snap_realms[r];

  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
                 << (realm ? realm->nref : 0) << dendl;
  if (!realm) {
    snap_realms[r] = realm = new SnapRealm(r);

    // Do not release the global snaprealm until unmounting.
    if (r == CEPH_INO_GLOBAL_SNAPREALM)
      realm->nref++;
  }

  realm->nref++;
  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
                 << realm->nref << dendl;
  return realm;
}
4957
4958 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4959 {
4960 if (snap_realms.count(r) == 0) {
4961 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4962 return NULL;
4963 }
4964 SnapRealm *realm = snap_realms[r];
4965 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4966 realm->nref++;
4967 return realm;
4968 }
4969
/*
 * Drop one reference to the realm.  On the last ref, unlink it from the
 * realm table and from its parent (recursively releasing the parent's
 * ref) and delete it.
 */
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4983
/*
 * Re-parent 'realm' under snap realm 'parent' if it moved: drop the ref
 * on the old parent and take one on the new (via get_snap_realm).
 * Returns true iff the parent actually changed.
 */
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
                   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
5000
5001 static bool has_new_snaps(const SnapContext& old_snapc,
5002 const SnapContext& new_snapc)
5003 {
5004 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
5005 }
5006
5007
/*
 * Apply a snap trace (a sequence of encoded SnapRealmInfo records from
 * the MDS) to our realm hierarchy.
 *
 * When 'flush' is set, any realm whose snap list advances — and all of
 * its descendants — has its pre-update snap context saved, and inodes
 * with caps in those realms get cap snaps queued (with the OLD context)
 * once the update is done.  On return *realm_ret (if non-null) receives
 * a referenced pointer to the first realm in the trace; otherwise that
 * reference is dropped here.  NOTE(review): assumes a non-empty trace
 * when realm_ret is null (put_snap_realm(first_realm) would dereference
 * null otherwise) — confirm callers.
 */
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  // realms (with pre-update snap contexts) needing cap-snap queueing
  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
                     << dendl;

      if (flush) {
        // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
        // flush me + children
        list<SnapRealm*> q;
        q.push_back(realm);
        while (!q.empty()) {
          SnapRealm *realm = q.front();
          q.pop_front();

          for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
               p != realm->pchildren.end();
               ++p)
            q.push_back(*p);

          if (dirty_realms.count(realm) == 0) {
            // hold an extra ref until the flush loop below releases it
            realm->nref++;
            dirty_realms[realm] = realm->get_snap_context();
          }
        }
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
                     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;
    else
      put_snap_realm(realm);
  }

  for (auto &[realm, snapc] : dirty_realms) {
    // if there are new snaps ?
    if (has_new_snaps(snapc, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      for (auto&& in : realm->inodes_with_caps) {
        queue_cap_snap(in, snapc);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
5094
/*
 * Handle a snap notification pushed by an MDS.
 *
 * For CEPH_SNAP_OP_SPLIT the message names a new (split-off) realm plus the
 * inodes and child realms that must move into it: affected inodes are
 * detached first (remembering their old snap contexts), the snap trace is
 * applied, and then the inodes are re-attached to the new realm, queueing
 * cap snapshots where new snapshots appeared.
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  // inodes detached from their old realm, keyed to the snap context they had
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't pull an inode out of a realm that is already newer than the
	// split target
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // for DESTROY, don't flush caps against snapshots that are going away
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
5167
5168 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5169 {
5170 mds_rank_t mds = mds_rank_t(m->get_source().num());
5171
5172 std::scoped_lock cl(client_lock);
5173 auto session = _get_mds_session(mds, m->get_connection().get());
5174 if (!session) {
5175 return;
5176 }
5177
5178 got_mds_push(session.get());
5179
5180 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5181
5182 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5183 if (inode_map.count(vino)) {
5184 Inode *in = NULL;
5185 in = inode_map[vino];
5186
5187 if (in) {
5188 in->quota = m->quota;
5189 in->rstat = m->rstat;
5190 }
5191 }
5192 }
5193
/*
 * Top-level dispatcher for cap messages from an MDS.
 *
 * Records any OSD epoch barrier carried by the message, then routes to the
 * per-op handler.  If the inode is unknown, an IMPORT is answered with an
 * immediate cap release; anything else is dropped.
 */
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session.get());

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // EXPORT and FLUSHSNAP_ACK don't require us to already hold a cap from
  // this MDS; IMPORT deliberately falls through (no return) so the grant
  // handling below runs against the freshly installed cap.
  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
5259
/*
 * Handle CEPH_CAP_OP_IMPORT: this MDS has taken over the inode's cap.
 * Install/refresh the cap from the importing session, drop the old peer cap
 * if the message identifies one, and kick any flushing caps now that the
 * auth MDS may have changed.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // remember the cap we held from the exporting (peer) MDS, if any
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // the old peer cap is gone; release it back to the MDS if asked to
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5302
/*
 * Handle CEPH_CAP_OP_EXPORT: the sending MDS is giving up its cap on this
 * inode.  If the message names a peer MDS, merge/transfer our cap state into
 * the peer's cap (creating it if needed); otherwise record that the cap was
 * dropped.  The exported cap is removed in all cases.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// cap is migrating to peer_mds
	const auto peer_mds = mds_rank_t(m->peer.mds);
	auto tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // only fold into the peer cap if this message is newer than what
	  // we already have from that MDS
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession.get());
	  }
	} else {
	  // no existing cap from the peer: install one carrying our issued bits
	  add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no destination MDS: remember we lost wanted/issued caps here
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5347
5348 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5349 {
5350 mds_rank_t mds = session->mds_num;
5351 ceph_assert(in->caps.count(mds));
5352
5353 ldout(cct, 10) << __func__ << " on ino " << *in
5354 << " size " << in->size << " -> " << m->get_size()
5355 << dendl;
5356
5357 int issued;
5358 in->caps_issued(&issued);
5359 issued |= in->caps_dirty();
5360 update_inode_file_size(in, issued, m->get_size(),
5361 m->get_truncate_seq(), m->get_truncate_size());
5362 }
5363
/*
 * Handle FLUSH_ACK from the MDS: retire flush tids up to and including the
 * acked tid, clear the corresponding flushing cap bits on the inode, and
 * wake anyone waiting for the flush (or a full sync) to complete.
 */
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;      // cap bits fully cleaned by this ack
  int flushed = 0;      // number of flush tids retired

  // NOTE(review): begin() is dereferenced without an empty() check; this
  // appears to rely on an ack only arriving while a flush is in flight --
  // confirm that invariant holds.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
		  << " got unexpected flush ack tid " << flush_ack_tid
		  << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (!it->second) {
      // cap snap
      ++it;
      continue;
    }
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // everything up to and including the acked tid is complete
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // bits re-dirtied by a later, still-pending flush must not be cleared
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.notify_all();
  }

  if (!dirty) {
    // nothing dirty remains; forget who dirtied the caps
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->flushing_cap_tids.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the flushing reference once nothing is dirty either
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5431
5432
/*
 * Handle FLUSHSNAP_ACK: the MDS has persisted a snapped cap flush.  Drop the
 * matching CapSnap (if the flush tid matches) and wake waiters; a missing
 * entry is treated as a probable duplicate ack.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we tear the capsnap down
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5465
/*
 * Deferred dentry-invalidation upcall.  Captures everything it needs
 * (parent dir vino, optionally the target vino, and the dentry name) at
 * construction time so it can safely run after the Dentry itself is gone.
 */
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;   // parent directory of the dentry
  vinodeno_t ino;      // dentry's inode; zeroed unless 'del'
  string name;         // dentry name
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      // NOTE(review): the faked-ino branch only assigns .ino; this assumes
      // vinodeno_t's snapid member is sanely default-initialized -- confirm.
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5493
/*
 * Runs on the async invalidator thread: forward a dentry invalidation to the
 * registered callback.  No-op unless the client is at least mounting.
 */
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5504
5505 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5506 {
5507 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5508 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5509 }
5510
/*
 * Try to shed cached state hanging off 'in' so its reference count can drop:
 * expire child dentries (recursing into snapshotted subtrees), close an
 * empty dir, trim a lingering snapdir, and finally unlink the inode's own
 * dentries, optionally scheduling kernel dcache invalidation for them.
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may invalidate this entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;
    }
  }

  // a cached snapdir may be one of the remaining references; trim it too
  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() removes dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5557
/*
 * Handle GRANT/REVOKE (and the tail of IMPORT) for an existing cap: refresh
 * cached metadata covered by the newly issued caps, apply max_size changes,
 * and react to revocations by flushing/releasing cached file data before
 * acknowledging via check_caps().
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();
  int flags = 0;

  const unsigned new_caps = m->get_caps();
  // a newer session cap_gen means our cap state predates a session reset
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // accept each metadata group only if we don't hold its EXCL cap
  // (holding EXCL means our local copy is newer than the MDS's)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0)
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // revoking the page cache: drop it, then ack via check_caps
      if (_release(in)) {
	check = true;
	flags = CHECK_CAPS_NODELAY;
      }
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
      flags = CHECK_CAPS_NODELAY;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, flags);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5716
5717 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5718 {
5719 if (perms.uid() == 0) {
5720 // For directories, DACs are overridable.
5721 // For files, Read/write DACs are always overridable but executable DACs are
5722 // overridable when there is at least one exec bit set
5723 if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
5724 return -CEPHFS_EACCES;
5725 return 0;
5726 }
5727
5728 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5729 int ret = _posix_acl_permission(in, perms, want);
5730 if (ret != -CEPHFS_EAGAIN)
5731 return ret;
5732 }
5733
5734 // check permissions before doing anything else
5735 if (!in->check_mode(perms, want))
5736 return -CEPHFS_EACCES;
5737 return 0;
5738 }
5739
5740 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5741 const UserPerm& perms)
5742 {
5743 int r = _getattr_for_perm(in, perms);
5744 if (r < 0)
5745 goto out;
5746
5747 r = 0;
5748 if (strncmp(name, "system.", 7) == 0) {
5749 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5750 r = -CEPHFS_EPERM;
5751 } else {
5752 r = inode_permission(in, perms, want);
5753 }
5754 out:
5755 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5756 return r;
5757 }
5758
5759 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5760 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5761 return out;
5762 }
5763
/*
 * Permission check for setattr, mirroring kernel chown/chgrp/chmod/utimes
 * rules.  Returns 0 on success or a negative CEPHFS_* error.  Side effect:
 * may clear S_ISGID from stx->stx_mode when a non-root caller chmods a file
 * whose group they do not belong to.
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // default outcome for the ownership/mode/time checks below is EPERM
  r = -CEPHFS_EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; the owner may "set" uid to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may change group only to one of their groups (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 	     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root chmod outside the file's (possibly new) group drops setgid
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only set times to "now", and even then need write
      // permission on the inode
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5820
5821 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5822 {
5823 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5824 unsigned want = 0;
5825
5826 if ((flags & O_ACCMODE) == O_WRONLY)
5827 want = MAY_WRITE;
5828 else if ((flags & O_ACCMODE) == O_RDWR)
5829 want = MAY_READ | MAY_WRITE;
5830 else if ((flags & O_ACCMODE) == O_RDONLY)
5831 want = MAY_READ;
5832 if (flags & O_TRUNC)
5833 want |= MAY_WRITE;
5834
5835 int r = 0;
5836 switch (in->mode & S_IFMT) {
5837 case S_IFLNK:
5838 r = -CEPHFS_ELOOP;
5839 goto out;
5840 case S_IFDIR:
5841 if (want & MAY_WRITE) {
5842 r = -CEPHFS_EISDIR;
5843 goto out;
5844 }
5845 break;
5846 }
5847
5848 r = _getattr_for_perm(in, perms);
5849 if (r < 0)
5850 goto out;
5851
5852 r = inode_permission(in, perms, want);
5853 out:
5854 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5855 return r;
5856 }
5857
5858 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5859 {
5860 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5861 int r = _getattr_for_perm(dir, perms);
5862 if (r < 0)
5863 goto out;
5864
5865 r = inode_permission(dir, perms, MAY_EXEC);
5866 out:
5867 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5868 return r;
5869 }
5870
5871 int Client::may_create(Inode *dir, const UserPerm& perms)
5872 {
5873 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5874 int r = _getattr_for_perm(dir, perms);
5875 if (r < 0)
5876 goto out;
5877
5878 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5879 out:
5880 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5881 return r;
5882 }
5883
5884 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5885 {
5886 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5887 int r = _getattr_for_perm(dir, perms);
5888 if (r < 0)
5889 goto out;
5890
5891 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5892 if (r < 0)
5893 goto out;
5894
5895 /* 'name == NULL' means rmsnap w/o permission checks */
5896 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5897 InodeRef otherin;
5898 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5899 if (r < 0)
5900 goto out;
5901 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5902 r = -CEPHFS_EPERM;
5903 }
5904 out:
5905 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5906 return r;
5907 }
5908
5909 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5910 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5911
5912 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5913 if (!mref_reader.is_state_satisfied())
5914 return -ENOTCONN;
5915
5916 filepath path(relpath);
5917 string name = path.last_dentry();
5918 path.pop_dentry();
5919 InodeRef dir;
5920
5921 std::scoped_lock lock(client_lock);
5922 int r = path_walk(path, &dir, perms);
5923 if (r < 0)
5924 return r;
5925 if (cct->_conf->client_permissions) {
5926 int r = may_delete(dir.get(), name.c_str(), perms);
5927 if (r < 0)
5928 return r;
5929 }
5930
5931 return 0;
5932 }
5933
/*
 * Permission check for hard-linking to 'in': the owner (or root) may always
 * link; otherwise the target must be a regular file, not setuid, and not
 * setgid+group-executable, and the caller needs read+write on it.
 * NOTE(review): this appears to mirror Linux's fs.protected_hardlinks=1
 * behavior -- confirm against the kernel client before relying on that.
 */
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -CEPHFS_EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5961
5962 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5963 {
5964 int mask = CEPH_STAT_CAP_MODE;
5965 bool force = false;
5966 if (acl_type != NO_ACL) {
5967 mask |= CEPH_STAT_CAP_XATTR;
5968 force = in->xattr_version == 0;
5969 }
5970 return _getattr(in, mask, perms, force);
5971 }
5972
5973 vinodeno_t Client::_get_vino(Inode *in)
5974 {
5975 /* The caller must hold the client lock */
5976 return vinodeno_t(in->ino, in->snapid);
5977 }
5978
5979 /**
5980 * Resolve an MDS spec to a list of MDS daemon GIDs.
5981 *
5982 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5983 * It may be '*' in which case it matches all GIDs.
5984 *
5985 * If no error is returned, the `targets` vector will be populated with at least
5986 * one MDS.
5987 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // first, try to parse the spec as a role (rank or filesystem:rank)
  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  // next, try to interpret the spec as a raw numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
		     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
		 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
		     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
6056
6057
6058 /**
6059 * Authenticate with mon and establish global ID
6060 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // drop the client lock across the blocking monitor round trip
  client_lock.unlock();
  int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  // adopt the monitor-assigned global id as our entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
6081
/*
 * Fetch the latest FSMap (or the trimmed FSMapUser when 'user' is true)
 * from the monitors, blocking until our cached copy is at least as new as
 * the monitors' latest version.  Returns 0 or a negative error.
 */
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // drop the client lock across the blocking get_version call
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe once and wait until our fsmap.user copy catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
6126
6127 /**
6128 *
6129 * @mds_spec one of ID, rank, GID, "*"
6130 *
6131 */
6132 int Client::mds_command(
6133 const std::string &mds_spec,
6134 const vector<string>& cmd,
6135 const bufferlist& inbl,
6136 bufferlist *outbl,
6137 string *outs,
6138 Context *onfinish)
6139 {
6140 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6141 if (!iref_reader.is_state_satisfied())
6142 return -CEPHFS_ENOTCONN;
6143
6144 std::unique_lock cl(client_lock);
6145
6146 int r;
6147 r = authenticate();
6148 if (r < 0) {
6149 return r;
6150 }
6151
6152 r = fetch_fsmap(false);
6153 if (r < 0) {
6154 return r;
6155 }
6156
6157 // Look up MDS target(s) of the command
6158 std::vector<mds_gid_t> targets;
6159 r = resolve_mds(mds_spec, &targets);
6160 if (r < 0) {
6161 return r;
6162 }
6163
6164 // If daemons are laggy, we won't send them commands. If all
6165 // are laggy then we fail.
6166 std::vector<mds_gid_t> non_laggy;
6167 for (const auto& gid : targets) {
6168 const auto info = fsmap->get_info_gid(gid);
6169 if (!info.laggy()) {
6170 non_laggy.push_back(gid);
6171 }
6172 }
6173 if (non_laggy.size() == 0) {
6174 *outs = "All targeted MDS daemons are laggy";
6175 return -CEPHFS_ENOENT;
6176 }
6177
6178 if (metadata.empty()) {
6179 // We are called on an unmounted client, so metadata
6180 // won't be initialized yet.
6181 populate_metadata("");
6182 }
6183
6184 // Send commands to targets
6185 C_GatherBuilder gather(cct, onfinish);
6186 for (const auto& target_gid : non_laggy) {
6187 const auto info = fsmap->get_info_gid(target_gid);
6188
6189 // Open a connection to the target MDS
6190 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6191
6192 cl.unlock();
6193 {
6194 std::scoped_lock cmd_lock(command_lock);
6195 // Generate MDSCommandOp state
6196 auto &op = command_table.start_command();
6197
6198 op.on_finish = gather.new_sub();
6199 op.cmd = cmd;
6200 op.outbl = outbl;
6201 op.outs = outs;
6202 op.inbl = inbl;
6203 op.mds_gid = target_gid;
6204 op.con = conn;
6205
6206 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6207 << " tid=" << op.tid << cmd << dendl;
6208
6209 // Construct and send MCommand
6210 MessageRef m = op.get_message(monclient->get_fsid());
6211 conn->send_message2(std::move(m));
6212 }
6213 cl.lock();
6214 }
6215 gather.activate();
6216
6217 return 0;
6218 }
6219
6220 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6221 {
6222 ceph_tid_t const tid = m->get_tid();
6223
6224 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6225
6226 std::scoped_lock cmd_lock(command_lock);
6227 if (!command_table.exists(tid)) {
6228 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6229 return;
6230 }
6231
6232 auto &op = command_table.get_command(tid);
6233 if (op.outbl) {
6234 *op.outbl = m->get_data();
6235 }
6236 if (op.outs) {
6237 *op.outs = m->rs;
6238 }
6239
6240 if (op.on_finish) {
6241 op.on_finish->complete(m->r);
6242 }
6243
6244 command_table.erase(tid);
6245 }
6246
6247 // -------------------
6248 // MOUNT
6249
6250 int Client::subscribe_mdsmap(const std::string &fs_name)
6251 {
6252 int r = authenticate();
6253 if (r < 0) {
6254 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6255 return r;
6256 }
6257
6258 std::string resolved_fs_name;
6259 if (fs_name.empty()) {
6260 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6261 if (resolved_fs_name.empty())
6262 // Try the backwards compatibility fs name option
6263 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6264 } else {
6265 resolved_fs_name = fs_name;
6266 }
6267
6268 std::string want = "mdsmap";
6269 if (!resolved_fs_name.empty()) {
6270 r = fetch_fsmap(true);
6271 if (r < 0)
6272 return r;
6273 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6274 if (fscid == FS_CLUSTER_ID_NONE) {
6275 return -CEPHFS_ENOENT;
6276 }
6277
6278 std::ostringstream oss;
6279 oss << want << "." << fscid;
6280 want = oss.str();
6281 }
6282 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6283
6284 monclient->sub_want(want, 0, 0);
6285 monclient->renew_subs();
6286
6287 return 0;
6288 }
6289
/**
 * Mount the filesystem.
 *
 * Subscribes to the (filesystem-specific) MDSMap, optionally waits for an
 * available MDS cluster, then issues getattrs from the mount point up to
 * the root so that the mount point and all its ancestors are pinned in
 * cache (their absence would break quota handling).
 *
 * @param mount_root path within the fs to use as the root ("" means "/")
 * @param perms credentials for the getattr walk
 * @param require_mds if true, fail with CEPH_FUSE_NO_MDS_UP when the MDS
 *                    cluster is stuck unavailable
 * @param fs_name filesystem to mount ("" selects the configured default)
 * @return 0 on success (including when already mounting/mounted),
 *         negative error code otherwise
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
                  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Loop until the MDS cluster is usable (or declared stuck).
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point, then each ancestor in turn (popping one
  // dentry per iteration) until we reach the root.  This pins the whole
  // ancestor chain in our cache.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
        // We got the mount point itself but cannot read an ancestor:
        // usable, but quota lookups above the mount point may fail.
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6386
6387 // UNMOUNT
6388
/*
 * Close all MDS sessions, blocking until they have closed or the
 * client_shutdown_timeout expires (a timeout of 0 waits indefinitely).
 * Sessions that do not respond in time are force-closed with
 * -CEPHFS_ETIMEDOUT.  Called with client_lock held.
 */
void Client::_close_sessions()
{
  // Rejected sessions will never close cleanly; drop them up front.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
        _close_mds_session(p.second.get());
        mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Adopt the already-held client_lock so the condvar can drop and
    // re-take it; l.release() below hands ownership back to us without
    // actually unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait forever
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      // Force-close whatever is left.
      while (!mds_ranks_closing.empty()) {
        auto session = mds_sessions.at(*mds_ranks_closing.begin());
        // this prunes entry from mds_sessions and mds_ranks_closing
        _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6427
6428 void Client::flush_mdlog_sync(Inode *in)
6429 {
6430 if (in->unsafe_ops.empty()) {
6431 return;
6432 }
6433
6434 std::set<mds_rank_t> anchor;
6435 for (auto &&p : in->unsafe_ops) {
6436 anchor.emplace(p->mds);
6437 }
6438 if (in->auth_cap) {
6439 anchor.emplace(in->auth_cap->session->mds_num);
6440 }
6441
6442 for (auto &rank : anchor) {
6443 auto session = &mds_sessions.at(rank);
6444 flush_mdlog(session->get());
6445 }
6446 }
6447
6448 void Client::flush_mdlog_sync()
6449 {
6450 if (mds_requests.empty())
6451 return;
6452 for (auto &p : mds_sessions) {
6453 flush_mdlog(p.second.get());
6454 }
6455 }
6456
6457 void Client::flush_mdlog(MetaSession *session)
6458 {
6459 // Only send this to Luminous or newer MDS daemons, older daemons
6460 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6461 const uint64_t features = session->con->get_features();
6462 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6463 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6464 session->con->send_message2(std::move(m));
6465 }
6466 }
6467
6468
6469 void Client::_abort_mds_sessions(int err)
6470 {
6471 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6472 auto req = p->second;
6473 ++p;
6474 // unsafe requests will be removed during close session below.
6475 if (req->got_unsafe)
6476 continue;
6477
6478 req->abort(err);
6479 if (req->caller_cond) {
6480 req->kick = true;
6481 req->caller_cond->notify_all();
6482 }
6483 }
6484
6485 // Process aborts on any requests that were on this waitlist.
6486 // Any requests that were on a waiting_for_open session waitlist
6487 // will get kicked during close session below.
6488 signal_cond_list(waiting_for_mdsmap);
6489
6490 // Force-close all sessions
6491 while(!mds_sessions.empty()) {
6492 auto session = mds_sessions.begin()->second;
6493 _closed_mds_session(session.get(), err);
6494 }
6495 }
6496
/*
 * Tear down the mount.
 *
 * @param abort when true, skip flushing dirty data/caps back to the
 *              cluster: abort in-flight MDS requests, cancel OSD writes
 *              and force-close sessions (used after fatal connection
 *              errors; see abort_conn()).  When false, perform a clean
 *              unmount that flushes everything first (see unmount()).
 */
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    // In-flight OSD writes can never complete now; cancel them too.
    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for all outstanding MDS requests to drain (aborted or completed).
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
                     << dendl;
    }
    return mds_requests.empty();
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and low-level (ll_*) handles the consumer never closed
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and unclosed directory handles
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
        // no point writing back; just drop the cached data
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // Dirty caps can no longer be written back; drop them (and the inode
    // references they pinned).
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
        Inode *in = *p;
        ++p;
        if (in->dirty_caps) {
          ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
          in->mark_caps_clean();
          put_inode(in);
        }
      }
    }
  } else {
    // Clean unmount: flush all dirty caps and wait for acknowledgement.
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait for the cache to fully drain; dump it every 5s while stuck so a
  // hang is diagnosable from the logs.
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
        r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  // release the global snapshot realm
  SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
  if (global_realm) {
    ceph_assert(global_realm->nref == 1);
    put_snap_realm(global_realm);
  }

  mref_writer.update_state(CLIENT_UNMOUNTED);

  /*
   * Stop the remount_queue before clearing the mountpoint memory
   * to avoid possible use-after-free bug.
   */
  if (remount_cb) {
    ldout(cct, 10) << "unmount stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
    remount_cb = nullptr;
  }

  ldout(cct, 2) << "unmounted." << dendl;
}
6661
// Clean unmount: flush dirty data/caps and close sessions gracefully.
void Client::unmount()
{
  _unmount(false);
}
6666
// Abortive unmount: tear down sessions and drop dirty state without
// flushing (used on fatal connection errors).
void Client::abort_conn()
{
  _unmount(true);
}
6671
6672 void Client::flush_cap_releases()
6673 {
6674 uint64_t nr_caps = 0;
6675
6676 // send any cap releases
6677 for (auto &p : mds_sessions) {
6678 auto session = p.second;
6679 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6680 p.first)) {
6681 nr_caps += session->release->caps.size();
6682 if (cct->_conf->client_inject_release_failure) {
6683 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6684 } else {
6685 session->con->send_message2(std::move(session->release));
6686 }
6687 session->release.reset();
6688 }
6689 }
6690
6691 if (nr_caps > 0) {
6692 dec_pinned_icaps(nr_caps);
6693 }
6694 }
6695
6696 void Client::renew_and_flush_cap_releases()
6697 {
6698 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6699
6700 if (!mount_aborted && mdsmap->get_epoch()) {
6701 // renew caps?
6702 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6703 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
6704 renew_caps();
6705
6706 flush_cap_releases();
6707 }
6708 }
6709
/*
 * Periodic upkeep, invoked by the tick thread (see start_tick_thread())
 * with client_lock held: times out a stuck mount, renews caps / flushes
 * cap releases, re-checks delayed caps, sends metrics, trims caches, and
 * auto-reconnects after blocklisting when configured.
 */
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  auto now = ceph::coarse_mono_clock::now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // Time out the oldest pending request if mounting has exceeded
    // mount_timeout, waking anything waiting on it.
    MetaRequest *req = mds_requests.begin()->second;

    if (req->created + mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps: re-run check_caps() on inodes whose hold time has
  // expired.  NOTE(review): the early break presumes the list is ordered
  // by hold_caps_until -- confirm against delayed_list maintenance.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;  // advance before check_caps() may unlink `in` from the list
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // If blocklisted and client_reconnect_stale is enabled, attempt a
  // reconnect at most every 30 minutes: reset the messenger, invalidate
  // open files (fd_gen), and kick stale sessions.
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6765
/*
 * Spawn the upkeep thread, which calls tick() roughly every
 * client_tick_interval seconds (client_debug_inject_tick_delay can
 * stretch the interval for testing).  The thread runs holding
 * client_lock except while waiting on upkeep_cond, and exits once
 * tick_thread_stopped is set and the condvar is signalled (see
 * _unmount()).
 */
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    // time::min() guarantees the very first loop iteration ticks.
    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Tick once ~90% of the interval has elapsed; otherwise sleep only
      // for the remaining time.
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6796
/*
 * Gather client-side metrics and ship them to the MDS.  Called from
 * tick() with client_lock held.
 */
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6806
6807 void Client::collect_and_send_global_metrics() {
6808 ldout(cct, 20) << __func__ << dendl;
6809 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6810
6811 if (!have_open_session((mds_rank_t)0)) {
6812 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6813 << dendl;
6814 return;
6815 }
6816 auto session = _get_or_open_mds_session((mds_rank_t)0);
6817 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6818 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6819 return;
6820 }
6821
6822 ClientMetricMessage metric;
6823 std::vector<ClientMetricMessage> message;
6824
6825 // read latency
6826 if (_collect_and_send_global_metrics ||
6827 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
6828 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
6829 logger->tget(l_c_rd_avg),
6830 logger->get(l_c_rd_sqsum),
6831 nr_read_request));
6832 message.push_back(metric);
6833 }
6834
6835 // write latency
6836 if (_collect_and_send_global_metrics ||
6837 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
6838 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
6839 logger->tget(l_c_wr_avg),
6840 logger->get(l_c_wr_sqsum),
6841 nr_write_request));
6842 message.push_back(metric);
6843 }
6844
6845 // metadata latency
6846 if (_collect_and_send_global_metrics ||
6847 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
6848 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
6849 logger->tget(l_c_md_avg),
6850 logger->get(l_c_md_sqsum),
6851 nr_metadata_request));
6852 message.push_back(metric);
6853 }
6854
6855 // cap hit ratio -- nr_caps is unused right now
6856 if (_collect_and_send_global_metrics ||
6857 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
6858 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6859 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6860 message.push_back(metric);
6861 }
6862
6863 // dentry lease hit ratio
6864 if (_collect_and_send_global_metrics ||
6865 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
6866 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6867 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6868 message.push_back(metric);
6869 }
6870
6871 // opened files
6872 if (_collect_and_send_global_metrics ||
6873 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
6874 auto [opened_files, total_inodes] = get_opened_files_rates();
6875 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6876 message.push_back(metric);
6877 }
6878
6879 // pinned i_caps
6880 if (_collect_and_send_global_metrics ||
6881 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
6882 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6883 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6884 message.push_back(metric);
6885 }
6886
6887 // opened inodes
6888 if (_collect_and_send_global_metrics ||
6889 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
6890 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6891 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6892 message.push_back(metric);
6893 }
6894
6895 // read io sizes
6896 if (_collect_and_send_global_metrics ||
6897 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
6898 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6899 total_read_size));
6900 message.push_back(metric);
6901 }
6902
6903 // write io sizes
6904 if (_collect_and_send_global_metrics ||
6905 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
6906 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6907 total_write_size));
6908 message.push_back(metric);
6909 }
6910
6911 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6912 }
6913
6914 void Client::renew_caps()
6915 {
6916 ldout(cct, 10) << "renew_caps()" << dendl;
6917 last_cap_renew = ceph::coarse_mono_clock::now();
6918
6919 for (auto &p : mds_sessions) {
6920 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6921 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6922 renew_caps(p.second.get());
6923 }
6924 }
6925
6926 void Client::renew_caps(MetaSession *session)
6927 {
6928 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6929 session->last_cap_renew_request = ceph_clock_now();
6930 uint64_t seq = ++session->cap_renew_seq;
6931 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6932 }
6933
6934
6935 // ===============================================================
6936 // high level (POSIXy) interface
6937
6938 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6939 InodeRef *target, const UserPerm& perms)
6940 {
6941 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6942 MetaRequest *req = new MetaRequest(op);
6943 filepath path;
6944 dir->make_nosnap_relative_path(path);
6945 path.push_dentry(name);
6946 req->set_filepath(path);
6947 req->set_inode(dir);
6948 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6949 mask |= DEBUG_GETATTR_CAPS;
6950 req->head.args.getattr.mask = mask;
6951
6952 ldout(cct, 10) << __func__ << " on " << path << dendl;
6953
6954 int r = make_request(req, perms, target);
6955 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6956 return r;
6957 }
6958
6959 bool Client::_dentry_valid(const Dentry *dn)
6960 {
6961 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6962
6963 // is dn lease valid?
6964 utime_t now = ceph_clock_now();
6965 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6966 mds_sessions.count(dn->lease_mds)) {
6967 auto s = mds_sessions.at(dn->lease_mds);
6968 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
6969 dlease_hit();
6970 return true;
6971 }
6972
6973 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6974 << " vs lease_gen " << dn->lease_gen << dendl;
6975 }
6976
6977 dlease_miss();
6978 return false;
6979 }
6980
/*
 * Look up `dname` in `dir`, preferring cached dentries whose leases or
 * the directory's shared caps make them trustworthy, and falling back to
 * an MDS lookup (at most once) otherwise.
 *
 * @param dir parent directory inode
 * @param dname entry name ("." and ".." handled specially)
 * @param mask caps to require on the target (restricted to shared caps)
 * @param target filled with the resolved inode on success
 * @param perms caller credentials
 * @param alternate_name if non-null, filled with the dentry's
 *        alternate_name on a cache hit
 * @return 0 on success, -CEPHFS_ENOENT / -CEPHFS_ENOTDIR /
 *         -CEPHFS_ENAMETOOLONG or a make_request() error otherwise
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // We have no parent link cached; ask an arbitrary MDS for it.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
        *target = std::move(tmptarget);
        ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
        *target = dir;  // parent lookup failed; fall back to dir itself
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // The configured snapdir name is a virtual directory; synthesize it.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
                   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    // The cached dentry is only usable if its inode (when present)
    // carries the caps the caller asked for.
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
        // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
        // make trim_caps() behave.
        dir->try_touch_cap(dn->lease_mds);
        goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        // Null dentry + complete dir contents = authoritative ENOENT.
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -CEPHFS_ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // Only ever ask the MDS once; if we already did, accept whatever state
  // the cache is now in.
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
7101
7102 int Client::get_or_create(Inode *dir, const char* name,
7103 Dentry **pdn, bool expect_null)
7104 {
7105 // lookup
7106 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7107 dir->open_dir();
7108 if (dir->dir->dentries.count(name)) {
7109 Dentry *dn = dir->dir->dentries[name];
7110 if (_dentry_valid(dn)) {
7111 if (expect_null)
7112 return -CEPHFS_EEXIST;
7113 }
7114 *pdn = dn;
7115 } else {
7116 // otherwise link up a new one
7117 *pdn = link(dir->dir, name, NULL, NULL);
7118 }
7119
7120 // success
7121 return 0;
7122 }
7123
7124 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7125 {
7126 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7127 if (!mref_reader.is_state_satisfied())
7128 return -CEPHFS_ENOTCONN;
7129
7130 ldout(cct, 10) << __func__ << ": " << path << dendl;
7131
7132 std::scoped_lock lock(client_lock);
7133
7134 return path_walk(path, wdr, perms, followsym);
7135 }
7136
7137 int Client::path_walk(const filepath& origpath, InodeRef *end,
7138 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
7139 {
7140 walk_dentry_result wdr;
7141 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
7142 *end = std::move(wdr.in);
7143 return rc;
7144 }
7145
/**
 * Resolve a path to its final inode, one component at a time.
 *
 * Starting point: the root for absolute paths, otherwise `dirinode`
 * when supplied, otherwise the cwd. Intermediate ("directory") symlinks
 * are always followed; a trailing symlink is followed only when
 * `followsym` is set. Resolution is capped at MAXSYMLINKS expansions.
 *
 * @param origpath  path to resolve (copied; may be rewritten during
 *                  symlink expansion)
 * @param result    on success receives the final inode and the
 *                  alternate_name reported by the last _lookup
 * @param perms     credentials for per-component may_lookup checks
 *                  (only when client_permissions is enabled)
 * @param followsym follow a trailing symlink
 * @param mask      extra caps requested on the final component only
 * @param dirinode  optional base directory for relative paths
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
		      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;  // symlink expansions so far, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// absolute target: restart resolution from the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    // descend into the resolved component
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7234
7235
7236 // namespace ops
7237
7238 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7239 {
7240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7241 if (!mref_reader.is_state_satisfied())
7242 return -CEPHFS_ENOTCONN;
7243
7244 tout(cct) << "link" << std::endl;
7245 tout(cct) << relexisting << std::endl;
7246 tout(cct) << relpath << std::endl;
7247
7248 filepath existing(relexisting);
7249
7250 InodeRef in, dir;
7251
7252 std::scoped_lock lock(client_lock);
7253 int r = path_walk(existing, &in, perm, true);
7254 if (r < 0)
7255 return r;
7256 if (std::string(relpath) == "/") {
7257 r = -CEPHFS_EEXIST;
7258 return r;
7259 }
7260 filepath path(relpath);
7261 string name = path.last_dentry();
7262 path.pop_dentry();
7263
7264 r = path_walk(path, &dir, perm, true);
7265 if (r < 0)
7266 return r;
7267 if (cct->_conf->client_permissions) {
7268 if (S_ISDIR(in->mode)) {
7269 r = -CEPHFS_EPERM;
7270 return r;
7271 }
7272 r = may_hardlink(in.get(), perm);
7273 if (r < 0)
7274 return r;
7275 r = may_create(dir.get(), perm);
7276 if (r < 0)
7277 return r;
7278 }
7279 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7280 return r;
7281 }
7282
7283 int Client::unlink(const char *relpath, const UserPerm& perm)
7284 {
7285 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7286 }
7287
7288 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7289 {
7290 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7291 if (!mref_reader.is_state_satisfied()) {
7292 return -CEPHFS_ENOTCONN;
7293 }
7294
7295 tout(cct) << __func__ << std::endl;
7296 tout(cct) << dirfd << std::endl;
7297 tout(cct) << relpath << std::endl;
7298 tout(cct) << flags << std::endl;
7299
7300 if (std::string(relpath) == "/") {
7301 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7302 }
7303
7304 filepath path(relpath);
7305 string name = path.last_dentry();
7306 path.pop_dentry();
7307 InodeRef dir;
7308
7309 std::scoped_lock lock(client_lock);
7310
7311 InodeRef dirinode;
7312 int r = get_fd_inode(dirfd, &dirinode);
7313 if (r < 0) {
7314 return r;
7315 }
7316
7317 r = path_walk(path, &dir, perm, true, 0, dirinode);
7318 if (r < 0) {
7319 return r;
7320 }
7321 if (cct->_conf->client_permissions) {
7322 r = may_delete(dir.get(), name.c_str(), perm);
7323 if (r < 0) {
7324 return r;
7325 }
7326 }
7327 if (flags & AT_REMOVEDIR) {
7328 r = _rmdir(dir.get(), name.c_str(), perm);
7329 } else {
7330 r = _unlink(dir.get(), name.c_str(), perm);
7331 }
7332 return r;
7333 }
7334
7335 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7336 {
7337 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7338 if (!mref_reader.is_state_satisfied())
7339 return -CEPHFS_ENOTCONN;
7340
7341 tout(cct) << __func__ << std::endl;
7342 tout(cct) << relfrom << std::endl;
7343 tout(cct) << relto << std::endl;
7344
7345 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7346 return -CEPHFS_EBUSY;
7347
7348 filepath from(relfrom);
7349 filepath to(relto);
7350 string fromname = from.last_dentry();
7351 from.pop_dentry();
7352 string toname = to.last_dentry();
7353 to.pop_dentry();
7354
7355 InodeRef fromdir, todir;
7356
7357 std::scoped_lock lock(client_lock);
7358 int r = path_walk(from, &fromdir, perm);
7359 if (r < 0)
7360 goto out;
7361 r = path_walk(to, &todir, perm);
7362 if (r < 0)
7363 goto out;
7364
7365 if (cct->_conf->client_permissions) {
7366 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7367 if (r < 0)
7368 return r;
7369 r = may_delete(todir.get(), toname.c_str(), perm);
7370 if (r < 0 && r != -CEPHFS_ENOENT)
7371 return r;
7372 }
7373 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7374 out:
7375 return r;
7376 }
7377
7378 // dirs
7379
7380 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7381 {
7382 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7383 }
7384
7385 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7386 std::string alternate_name)
7387 {
7388 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7389 if (!mref_reader.is_state_satisfied())
7390 return -CEPHFS_ENOTCONN;
7391
7392 tout(cct) << __func__ << std::endl;
7393 tout(cct) << dirfd << std::endl;
7394 tout(cct) << relpath << std::endl;
7395 tout(cct) << mode << std::endl;
7396 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7397
7398 if (std::string(relpath) == "/") {
7399 return -CEPHFS_EEXIST;
7400 }
7401
7402 filepath path(relpath);
7403 string name = path.last_dentry();
7404 path.pop_dentry();
7405 InodeRef dir;
7406
7407 std::scoped_lock lock(client_lock);
7408
7409 InodeRef dirinode;
7410 int r = get_fd_inode(dirfd, &dirinode);
7411 if (r < 0) {
7412 return r;
7413 }
7414
7415 r = path_walk(path, &dir, perm, true, 0, dirinode);
7416 if (r < 0) {
7417 return r;
7418 }
7419 if (cct->_conf->client_permissions) {
7420 r = may_create(dir.get(), perm);
7421 if (r < 0) {
7422 return r;
7423 }
7424 }
7425 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7426 }
7427
/**
 * Create `relpath` and every missing ancestor, like `mkdir -p`.
 *
 * Phase 1 walks (from the cwd) as far as the existing prefix reaches;
 * phase 2 creates the remaining components one by one. If the whole
 * path already exists phase 1 ends with r == 0 and we return success.
 * If the final component already exists, _mkdir's -CEPHFS_EEXIST is
 * returned to the caller.
 *
 * @param relpath path (relative to the cwd) to create
 * @param mode    mode bits for each directory created
 * @param perms   credentials for lookup/create permission checks
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // anything other than "component missing" (including full success)
  // ends the operation here
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a concurrent creator may have won the race for an intermediate
    // component; fall back to looking it up. EEXIST on the *last*
    // component is reported to the caller.
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7484
7485 int Client::rmdir(const char *relpath, const UserPerm& perms)
7486 {
7487 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7488 }
7489
7490 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7491 {
7492 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7493 if (!mref_reader.is_state_satisfied())
7494 return -CEPHFS_ENOTCONN;
7495
7496 tout(cct) << __func__ << std::endl;
7497 tout(cct) << relpath << std::endl;
7498 tout(cct) << mode << std::endl;
7499 tout(cct) << rdev << std::endl;
7500
7501 if (std::string(relpath) == "/")
7502 return -CEPHFS_EEXIST;
7503
7504 filepath path(relpath);
7505 string name = path.last_dentry();
7506 path.pop_dentry();
7507 InodeRef dir;
7508
7509 std::scoped_lock lock(client_lock);
7510 int r = path_walk(path, &dir, perms);
7511 if (r < 0)
7512 return r;
7513 if (cct->_conf->client_permissions) {
7514 int r = may_create(dir.get(), perms);
7515 if (r < 0)
7516 return r;
7517 }
7518 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7519 }
7520
7521 // symlinks
7522
7523 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7524 {
7525 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7526 }
7527
7528 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7529 std::string alternate_name)
7530 {
7531 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7532 if (!mref_reader.is_state_satisfied()) {
7533 return -CEPHFS_ENOTCONN;
7534 }
7535
7536 tout(cct) << __func__ << std::endl;
7537 tout(cct) << target << std::endl;
7538 tout(cct) << dirfd << std::endl;
7539 tout(cct) << relpath << std::endl;
7540
7541 if (std::string(relpath) == "/") {
7542 return -CEPHFS_EEXIST;
7543 }
7544
7545 filepath path(relpath);
7546 string name = path.last_dentry();
7547 path.pop_dentry();
7548 InodeRef dir;
7549
7550 std::scoped_lock lock(client_lock);
7551
7552 InodeRef dirinode;
7553 int r = get_fd_inode(dirfd, &dirinode);
7554 if (r < 0) {
7555 return r;
7556 }
7557 r = path_walk(path, &dir, perms, true, 0, dirinode);
7558 if (r < 0) {
7559 return r;
7560 }
7561 if (cct->_conf->client_permissions) {
7562 int r = may_create(dir.get(), perms);
7563 if (r < 0) {
7564 return r;
7565 }
7566 }
7567 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7568 }
7569
7570 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7571 {
7572 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7573 }
7574
7575 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7576 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7577 if (!mref_reader.is_state_satisfied()) {
7578 return -CEPHFS_ENOTCONN;
7579 }
7580
7581 tout(cct) << __func__ << std::endl;
7582 tout(cct) << dirfd << std::endl;
7583 tout(cct) << relpath << std::endl;
7584
7585 InodeRef dirinode;
7586 std::scoped_lock lock(client_lock);
7587 int r = get_fd_inode(dirfd, &dirinode);
7588 if (r < 0) {
7589 return r;
7590 }
7591
7592 InodeRef in;
7593 filepath path(relpath);
7594 r = path_walk(path, &in, perms, false, 0, dirinode);
7595 if (r < 0) {
7596 return r;
7597 }
7598
7599 return _readlink(in.get(), buf, size);
7600 }
7601
7602 int Client::_readlink(Inode *in, char *buf, size_t size)
7603 {
7604 if (!in->is_symlink())
7605 return -CEPHFS_EINVAL;
7606
7607 // copy into buf (at most size bytes)
7608 int r = in->symlink.length();
7609 if (r > (int)size)
7610 r = size;
7611 memcpy(buf, in->symlink.c_str(), r);
7612 return r;
7613 }
7614
7615
7616 // inode stuff
7617
7618 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7619 {
7620 bool yes = in->caps_issued_mask(mask, true);
7621
7622 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7623 if (yes && !force)
7624 return 0;
7625
7626 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7627 filepath path;
7628 in->make_nosnap_relative_path(path);
7629 req->set_filepath(path);
7630 req->set_inode(in);
7631 req->head.args.getattr.mask = mask;
7632
7633 int res = make_request(req, perms);
7634 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7635 return res;
7636 }
7637
/**
 * Fetch a ceph virtual xattr for `in` from the MDS (GETVXATTR op).
 *
 * @param in         inode to query
 * @param perms      credentials for the MDS request
 * @param xattr_name vxattr name; must be 1..255 chars, else -CEPHFS_ENODATA
 * @param size       output buffer capacity; 0 means "report length only"
 *                   (see getxattr(2))
 * @param value      output buffer (used only when size > 0)
 * @param rank       MDS rank to direct the request to
 * @return value length on success, -CEPHFS_ERANGE if the buffer is too
 *         small, -CEPHFS_ENODATA if the name is invalid or the MDS does
 *         not support the op, or another negative error from the request
 */
int Client::_getvxattr(
  Inode *in,
  const UserPerm& perms,
  const char *xattr_name,
  ssize_t size,
  void *value,
  mds_rank_t rank)
{
  if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
    return -CEPHFS_ENODATA;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->set_string2(xattr_name);

  bufferlist bl;
  int res = make_request(req, perms, nullptr, nullptr, rank, &bl,
			 CEPHFS_FEATURE_OP_GETVXATTR);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;

  if (res < 0) {
    // an MDS without GETVXATTR support looks like "no such attribute"
    if (res == -CEPHFS_EOPNOTSUPP) {
      return -CEPHFS_ENODATA;
    }
    return res;
  }

  // the reply payload is a single encoded string
  std::string buf;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  decode(buf, p);
  DECODE_FINISH(p);

  ssize_t len = buf.length();

  res = len; // refer to man getxattr(2) for output buffer size == 0

  if (size > 0) {
    if (len > size) {
      res = -CEPHFS_ERANGE; // insufficient output buffer space
    } else {
      memcpy(value, buf.c_str(), len);
    }
  }
  return res;
}
7689
/**
 * Apply the attribute changes in `stx` selected by `mask` to `in`.
 *
 * For each requested attribute: if this client holds the relevant
 * exclusive cap, the change is applied locally and the cap marked
 * dirty (the bit is then cleared from `mask`); otherwise the change is
 * accumulated into a CEPH_MDS_OP_SETATTR request. If every bit of
 * `mask` is satisfied locally, no request is sent at all.
 *
 * @param in    inode to modify (must not be a snapshot: -CEPHFS_EROFS)
 * @param stx   new attribute values (CEPH_SETATTR_* selected by mask)
 * @param mask  CEPH_SETATTR_* bits to apply
 * @param perms credentials; also recorded as the "cap dirtier"
 * @param inp   optional out-param for the post-request inode
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;
  bool kill_sguid = false;   // whether to clear setuid/setgid locally
  int inode_drop = 0;        // caps to drop as part of the MDS request

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // growing the file must not push the quota over its byte limit
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;  // no exclusive cap: go to the MDS
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with Ax we can clear setuid/setgid ourselves (done further down)
    kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
    inode_drop |= CEPH_CAP_AUTH_SHARED;
  }

  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      // apply locally under the exclusive auth cap
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->uid != stx->stx_uid) {
      // cached value unknown or different: send to the MDS
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      // no-op change
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // replace only the permission bits, keep the file type bits
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
	     kill_sguid && S_ISREG(in->mode) &&
	     (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
    /* Must squash the any setuid/setgid bits with an ownership change */
    in->mode &= ~(S_ISUID|S_ISGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // a local size change is only safe when growing (or no change):
    // shrinking requires the MDS to truncate object data
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
	!(mask & CEPH_SETATTR_KILL_SGUID) &&
	stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
	in->size = in->reported_size = stx->stx_size;
	in->cap_dirtier_uid = perms.uid();
	in->cap_dirtier_gid = perms.gid();
	in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
	mask &= ~(CEPH_SETATTR_SIZE);
	mask |= CEPH_SETATTR_MTIME;
      } else {
	// ignore it when size doesn't change
	mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
		    CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_mtime) > in->mtime) {
      // with Fw we may move mtime forward (but not backward) locally
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
		    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_atime) > in->atime) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
		    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  if (!mask) {
    // everything was satisfied from local caps; no MDS request needed
    in->change_attr++;
    if (in->is_dir() && in->snapid == CEPH_NOSNAP) {
      // keep the cached .snap dir's attributes in sync with its parent
      vinodeno_t vino(in->ino, CEPH_SNAPDIR);
      if (inode_map.count(vino)) {
	refresh_snapdir_attrs(inode_map[vino], in);
      }
    }
    return 0;
  }

  // send the remaining attribute changes to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7954
/* Note that we only care about attrs that setattr cares about */
// Translate the setattr-relevant fields of a POSIX struct stat into a
// ceph_statx. Timestamp member names differ per platform, hence the
// conditional compilation below.
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS spells the timespec members st_mtimespec/st_atimespec
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#elif __WIN32
  // Windows exposes only whole-second timestamps
  stx->stx_mtime.tv_sec = st->st_mtime;
  stx->stx_atime.tv_sec = st->st_atime;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7973
7974 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7975 const UserPerm& perms, InodeRef *inp)
7976 {
7977 int ret = _do_setattr(in, stx, mask, perms, inp);
7978 if (ret < 0)
7979 return ret;
7980 if (mask & CEPH_SETATTR_MODE)
7981 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7982 return ret;
7983 }
7984
7985 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7986 const UserPerm& perms)
7987 {
7988 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7989 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7990 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7991 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7992 if (cct->_conf->client_permissions) {
7993 int r = may_setattr(in.get(), stx, mask, perms);
7994 if (r < 0)
7995 return r;
7996 }
7997 return __setattrx(in.get(), stx, mask, perms);
7998 }
7999
8000 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
8001 const UserPerm& perms)
8002 {
8003 struct ceph_statx stx;
8004
8005 stat_to_statx(attr, &stx);
8006 mask &= ~CEPH_SETATTR_BTIME;
8007
8008 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
8009 mask &= ~CEPH_SETATTR_UID;
8010 }
8011 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
8012 mask &= ~CEPH_SETATTR_GID;
8013 }
8014
8015 return _setattrx(in, &stx, mask, perms);
8016 }
8017
8018 int Client::setattr(const char *relpath, struct stat *attr, int mask,
8019 const UserPerm& perms)
8020 {
8021 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8022 if (!mref_reader.is_state_satisfied())
8023 return -CEPHFS_ENOTCONN;
8024
8025 tout(cct) << __func__ << std::endl;
8026 tout(cct) << relpath << std::endl;
8027 tout(cct) << mask << std::endl;
8028
8029 filepath path(relpath);
8030 InodeRef in;
8031
8032 std::scoped_lock lock(client_lock);
8033 int r = path_walk(path, &in, perms);
8034 if (r < 0)
8035 return r;
8036 return _setattr(in, attr, mask, perms);
8037 }
8038
8039 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
8040 const UserPerm& perms, int flags)
8041 {
8042 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8043 if (!mref_reader.is_state_satisfied())
8044 return -CEPHFS_ENOTCONN;
8045
8046 tout(cct) << __func__ << std::endl;
8047 tout(cct) << relpath << std::endl;
8048 tout(cct) << mask << std::endl;
8049
8050 filepath path(relpath);
8051 InodeRef in;
8052
8053 std::scoped_lock lock(client_lock);
8054 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
8055 if (r < 0)
8056 return r;
8057 return _setattrx(in, stx, mask, perms);
8058 }
8059
8060 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
8061 {
8062 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8063 if (!mref_reader.is_state_satisfied())
8064 return -CEPHFS_ENOTCONN;
8065
8066 tout(cct) << __func__ << std::endl;
8067 tout(cct) << fd << std::endl;
8068 tout(cct) << mask << std::endl;
8069
8070 std::scoped_lock lock(client_lock);
8071 Fh *f = get_filehandle(fd);
8072 if (!f)
8073 return -CEPHFS_EBADF;
8074 #if defined(__linux__) && defined(O_PATH)
8075 if (f->flags & O_PATH)
8076 return -CEPHFS_EBADF;
8077 #endif
8078 return _setattr(f->inode, attr, mask, perms);
8079 }
8080
8081 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
8082 {
8083 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8084 if (!mref_reader.is_state_satisfied())
8085 return -CEPHFS_ENOTCONN;
8086
8087 tout(cct) << __func__ << std::endl;
8088 tout(cct) << fd << std::endl;
8089 tout(cct) << mask << std::endl;
8090
8091 std::scoped_lock lock(client_lock);
8092 Fh *f = get_filehandle(fd);
8093 if (!f)
8094 return -CEPHFS_EBADF;
8095 #if defined(__linux__) && defined(O_PATH)
8096 if (f->flags & O_PATH)
8097 return -CEPHFS_EBADF;
8098 #endif
8099 return _setattrx(f->inode, stx, mask, perms);
8100 }
8101
8102 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
8103 frag_info_t *dirstat, int mask)
8104 {
8105 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8106 if (!mref_reader.is_state_satisfied())
8107 return -CEPHFS_ENOTCONN;
8108
8109 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8110 tout(cct) << "stat" << std::endl;
8111 tout(cct) << relpath << std::endl;
8112
8113 filepath path(relpath);
8114 InodeRef in;
8115
8116 std::scoped_lock lock(client_lock);
8117 int r = path_walk(path, &in, perms, true, mask);
8118 if (r < 0)
8119 return r;
8120 r = _getattr(in, mask, perms);
8121 if (r < 0) {
8122 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8123 return r;
8124 }
8125 fill_stat(in, stbuf, dirstat);
8126 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8127 return r;
8128 }
8129
8130 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8131 {
8132 unsigned mask = 0;
8133
8134 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8135 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
8136 goto out;
8137
8138 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8139 mask |= CEPH_CAP_PIN;
8140 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8141 mask |= CEPH_CAP_AUTH_SHARED;
8142 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8143 mask |= CEPH_CAP_LINK_SHARED;
8144 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
8145 mask |= CEPH_CAP_FILE_SHARED;
8146 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8147 mask |= CEPH_CAP_XATTR_SHARED;
8148 out:
8149 return mask;
8150 }
8151
8152 int Client::statx(const char *relpath, struct ceph_statx *stx,
8153 const UserPerm& perms,
8154 unsigned int want, unsigned int flags)
8155 {
8156 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
8157 }
8158
8159 int Client::lstat(const char *relpath, struct stat *stbuf,
8160 const UserPerm& perms, frag_info_t *dirstat, int mask)
8161 {
8162 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8163 if (!mref_reader.is_state_satisfied())
8164 return -CEPHFS_ENOTCONN;
8165
8166 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8167 tout(cct) << __func__ << std::endl;
8168 tout(cct) << relpath << std::endl;
8169
8170 filepath path(relpath);
8171 InodeRef in;
8172
8173 std::scoped_lock lock(client_lock);
8174 // don't follow symlinks
8175 int r = path_walk(path, &in, perms, false, mask);
8176 if (r < 0)
8177 return r;
8178 r = _getattr(in, mask, perms);
8179 if (r < 0) {
8180 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8181 return r;
8182 }
8183 fill_stat(in, stbuf, dirstat);
8184 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8185 return r;
8186 }
8187
/**
 * Fill a POSIX struct stat from the cached state of `in`.
 *
 * @param in      inode to report
 * @param st      output stat buffer (zeroed first)
 * @param dirstat optional out-param receiving the inode's frag stats
 * @param rstat   optional out-param receiving the inode's recursive stats
 * @return the caps currently issued for the inode (not an error code)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snap id doubles as the device number
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // directory link count is synthesized from the subdir count
    switch (in->nlink) {
      case 0:
	st->st_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	st->st_nlink = 1 /* parent dentry */
		       + 1 /* <dir>/. */
		       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is most recent as st_ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory size: recursive bytes, snapshot count for .snap dirs,
    // or the plain entry count, depending on configuration
    if (cct->_conf->client_dirsize_rbytes) {
      st->st_size = in->rstat.rbytes;
    } else if (in->snapid == CEPH_SNAPDIR) {
      SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
      if (realm) {
	st->st_size = realm->my_snaps.size();
	put_snap_realm(realm);
      }
    } else {
      st->st_size = in->dirstat.size();
    }
    // The Windows "stat" structure provides just a subset of the fields that are
    // available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    // block count in 512-byte units, rounded up
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8264
// Populate a ceph_statx from the cached Inode state.  'mask' is the set
// of caps we hold (see statx_to_mask()); each statx field group is only
// filled — and flagged in stx_mask — when the corresponding cap is held.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << " change_attr " << in->change_attr << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership, full mode and birth time need the AUTH shared cap
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count needs the LINK shared cap
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
	case 0:
	  stx->stx_nlink = 0; /* dir is unlinked */
	  break;
	case 1:
	  // synthesize POSIX link count: parent link + "." + one ".." per subdir
	  stx->stx_nlink = 1 /* parent dentry */
			   + 1 /* <dir>/. */
			   + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	  break;
	default:
	  ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // times and sizes need the FILE shared cap
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes) {
	// directory "size" = recursive byte count, if so configured
	stx->stx_size = in->rstat.rbytes;
      } else if (in->snapid == CEPH_SNAPDIR) {
	// the .snap dir reports one "entry" per snapshot in the realm
	SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
	if (realm) {
	  stx->stx_size = realm->my_snaps.size();
	  put_snap_realm(realm);
	}
      } else {
	stx->stx_size = in->dirstat.size();
      }
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // ctime is reported as the newer of ctime/mtime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8354
8355 void Client::touch_dn(Dentry *dn)
8356 {
8357 lru.lru_touch(dn);
8358 }
8359
8360 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8361 {
8362 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
8363 }
8364
8365 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8366 {
8367 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8368 if (!mref_reader.is_state_satisfied())
8369 return -CEPHFS_ENOTCONN;
8370
8371 tout(cct) << __func__ << std::endl;
8372 tout(cct) << fd << std::endl;
8373 tout(cct) << mode << std::endl;
8374
8375 std::scoped_lock lock(client_lock);
8376 Fh *f = get_filehandle(fd);
8377 if (!f)
8378 return -CEPHFS_EBADF;
8379 #if defined(__linux__) && defined(O_PATH)
8380 if (f->flags & O_PATH)
8381 return -CEPHFS_EBADF;
8382 #endif
8383 struct stat attr;
8384 attr.st_mode = mode;
8385 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8386 }
8387
8388 int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8389 const UserPerm& perms) {
8390 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8391 if (!mref_reader.is_state_satisfied()) {
8392 return -CEPHFS_ENOTCONN;
8393 }
8394
8395 tout(cct) << __func__ << std::endl;
8396 tout(cct) << dirfd << std::endl;
8397 tout(cct) << relpath << std::endl;
8398 tout(cct) << mode << std::endl;
8399 tout(cct) << flags << std::endl;
8400
8401 filepath path(relpath);
8402 InodeRef in;
8403 InodeRef dirinode;
8404
8405 std::scoped_lock lock(client_lock);
8406 int r = get_fd_inode(dirfd, &dirinode);
8407 if (r < 0) {
8408 return r;
8409 }
8410
8411 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8412 if (r < 0) {
8413 return r;
8414 }
8415 struct stat attr;
8416 attr.st_mode = mode;
8417 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8418 }
8419
8420 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8421 {
8422 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8423 }
8424
8425 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8426 const UserPerm& perms)
8427 {
8428 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
8429 }
8430
8431 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8432 {
8433 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8434 if (!mref_reader.is_state_satisfied())
8435 return -CEPHFS_ENOTCONN;
8436
8437 tout(cct) << __func__ << std::endl;
8438 tout(cct) << fd << std::endl;
8439 tout(cct) << new_uid << std::endl;
8440 tout(cct) << new_gid << std::endl;
8441
8442 std::scoped_lock lock(client_lock);
8443 Fh *f = get_filehandle(fd);
8444 if (!f)
8445 return -CEPHFS_EBADF;
8446 #if defined(__linux__) && defined(O_PATH)
8447 if (f->flags & O_PATH)
8448 return -CEPHFS_EBADF;
8449 #endif
8450 struct stat attr;
8451 attr.st_uid = new_uid;
8452 attr.st_gid = new_gid;
8453 int mask = 0;
8454 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8455 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8456 return _setattr(f->inode, &attr, mask, perms);
8457 }
8458
8459 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8460 const UserPerm& perms)
8461 {
8462 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8463 }
8464
8465 int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8466 int flags, const UserPerm& perms) {
8467 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8468 if (!mref_reader.is_state_satisfied()) {
8469 return -CEPHFS_ENOTCONN;
8470 }
8471
8472 tout(cct) << __func__ << std::endl;
8473 tout(cct) << dirfd << std::endl;
8474 tout(cct) << relpath << std::endl;
8475 tout(cct) << new_uid << std::endl;
8476 tout(cct) << new_gid << std::endl;
8477 tout(cct) << flags << std::endl;
8478
8479 filepath path(relpath);
8480 InodeRef in;
8481 InodeRef dirinode;
8482
8483 std::scoped_lock lock(client_lock);
8484 int r = get_fd_inode(dirfd, &dirinode);
8485 if (r < 0) {
8486 return r;
8487 }
8488
8489 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8490 if (r < 0) {
8491 return r;
8492 }
8493 struct stat attr;
8494 attr.st_uid = new_uid;
8495 attr.st_gid = new_gid;
8496 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8497 }
8498
8499 static void attr_set_atime_and_mtime(struct stat *attr,
8500 const utime_t &atime,
8501 const utime_t &mtime)
8502 {
8503 stat_set_atime_sec(attr, atime.tv.tv_sec);
8504 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8505 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8506 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8507 }
8508
8509 // for [l]utime() invoke the timeval variant as the timespec
8510 // variant are not yet implemented. for futime[s](), invoke
8511 // the timespec variant.
8512 int Client::utime(const char *relpath, struct utimbuf *buf,
8513 const UserPerm& perms)
8514 {
8515 struct timeval tv[2];
8516 tv[0].tv_sec = buf->actime;
8517 tv[0].tv_usec = 0;
8518 tv[1].tv_sec = buf->modtime;
8519 tv[1].tv_usec = 0;
8520
8521 return utimes(relpath, tv, perms);
8522 }
8523
8524 int Client::lutime(const char *relpath, struct utimbuf *buf,
8525 const UserPerm& perms)
8526 {
8527 struct timeval tv[2];
8528 tv[0].tv_sec = buf->actime;
8529 tv[0].tv_usec = 0;
8530 tv[1].tv_sec = buf->modtime;
8531 tv[1].tv_usec = 0;
8532
8533 return lutimes(relpath, tv, perms);
8534 }
8535
8536 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8537 {
8538 struct timespec ts[2];
8539 ts[0].tv_sec = buf->actime;
8540 ts[0].tv_nsec = 0;
8541 ts[1].tv_sec = buf->modtime;
8542 ts[1].tv_nsec = 0;
8543
8544 return futimens(fd, ts, perms);
8545 }
8546
8547 int Client::utimes(const char *relpath, struct timeval times[2],
8548 const UserPerm& perms)
8549 {
8550 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8551 if (!mref_reader.is_state_satisfied())
8552 return -CEPHFS_ENOTCONN;
8553
8554 tout(cct) << __func__ << std::endl;
8555 tout(cct) << relpath << std::endl;
8556 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8557 << std::endl;
8558 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8559 << std::endl;
8560
8561 filepath path(relpath);
8562 InodeRef in;
8563
8564 std::scoped_lock lock(client_lock);
8565 int r = path_walk(path, &in, perms);
8566 if (r < 0)
8567 return r;
8568 struct stat attr;
8569 utime_t atime(times[0]);
8570 utime_t mtime(times[1]);
8571
8572 attr_set_atime_and_mtime(&attr, atime, mtime);
8573 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8574 }
8575
8576 int Client::lutimes(const char *relpath, struct timeval times[2],
8577 const UserPerm& perms)
8578 {
8579 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8580 if (!mref_reader.is_state_satisfied())
8581 return -CEPHFS_ENOTCONN;
8582
8583 tout(cct) << __func__ << std::endl;
8584 tout(cct) << relpath << std::endl;
8585 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8586 << std::endl;
8587 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8588 << std::endl;
8589
8590 filepath path(relpath);
8591 InodeRef in;
8592
8593 std::scoped_lock lock(client_lock);
8594 int r = path_walk(path, &in, perms, false);
8595 if (r < 0)
8596 return r;
8597 struct stat attr;
8598 utime_t atime(times[0]);
8599 utime_t mtime(times[1]);
8600
8601 attr_set_atime_and_mtime(&attr, atime, mtime);
8602 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8603 }
8604
8605 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8606 {
8607 struct timespec ts[2];
8608 ts[0].tv_sec = times[0].tv_sec;
8609 ts[0].tv_nsec = times[0].tv_usec * 1000;
8610 ts[1].tv_sec = times[1].tv_sec;
8611 ts[1].tv_nsec = times[1].tv_usec * 1000;
8612
8613 return futimens(fd, ts, perms);
8614 }
8615
8616 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8617 {
8618 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8619 if (!mref_reader.is_state_satisfied())
8620 return -CEPHFS_ENOTCONN;
8621
8622 tout(cct) << __func__ << std::endl;
8623 tout(cct) << fd << std::endl;
8624 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8625 << std::endl;
8626 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8627 << std::endl;
8628
8629 std::scoped_lock lock(client_lock);
8630 Fh *f = get_filehandle(fd);
8631 if (!f)
8632 return -CEPHFS_EBADF;
8633 #if defined(__linux__) && defined(O_PATH)
8634 if (f->flags & O_PATH)
8635 return -CEPHFS_EBADF;
8636 #endif
8637 struct stat attr;
8638 utime_t atime(times[0]);
8639 utime_t mtime(times[1]);
8640
8641 attr_set_atime_and_mtime(&attr, atime, mtime);
8642 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8643 }
8644
// utimensat(2) analogue: set atime/mtime (nanosecond resolution) on a
// path resolved relative to dirfd.  AT_SYMLINK_NOFOLLOW in 'flags'
// prevents following a trailing symlink.
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
                      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  // NOTE(review): 'flags' carries AT_* flags here, yet this tests the
  // open(2) O_PATH bit — confirm whether an AT_* constant was intended.
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8688
8689 int Client::flock(int fd, int operation, uint64_t owner)
8690 {
8691 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8692 if (!mref_reader.is_state_satisfied())
8693 return -CEPHFS_ENOTCONN;
8694
8695 tout(cct) << __func__ << std::endl;
8696 tout(cct) << fd << std::endl;
8697 tout(cct) << operation << std::endl;
8698 tout(cct) << owner << std::endl;
8699
8700 std::scoped_lock lock(client_lock);
8701 Fh *f = get_filehandle(fd);
8702 if (!f)
8703 return -CEPHFS_EBADF;
8704
8705 return _flock(f, operation, owner);
8706 }
8707
8708 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8709 {
8710 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8711 if (!mref_reader.is_state_satisfied())
8712 return -CEPHFS_ENOTCONN;
8713
8714 tout(cct) << __func__ << std::endl;
8715 tout(cct) << relpath << std::endl;
8716
8717 filepath path(relpath);
8718 InodeRef in;
8719
8720 std::scoped_lock lock(client_lock);
8721 int r = path_walk(path, &in, perms, true);
8722 if (r < 0)
8723 return r;
8724 if (cct->_conf->client_permissions) {
8725 int r = may_open(in.get(), O_RDONLY, perms);
8726 if (r < 0)
8727 return r;
8728 }
8729 r = _opendir(in.get(), dirpp, perms);
8730 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8731 if (r != -CEPHFS_ENOTDIR)
8732 tout(cct) << (uintptr_t)*dirpp << std::endl;
8733 return r;
8734 }
8735
8736 int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8737 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8738 if (!mref_reader.is_state_satisfied()) {
8739 return -CEPHFS_ENOTCONN;
8740 }
8741
8742 tout(cct) << __func__ << std::endl;
8743 tout(cct) << dirfd << std::endl;
8744
8745 InodeRef dirinode;
8746 std::scoped_lock locker(client_lock);
8747 int r = get_fd_inode(dirfd, &dirinode);
8748 if (r < 0) {
8749 return r;
8750 }
8751
8752 if (cct->_conf->client_permissions) {
8753 r = may_open(dirinode.get(), O_RDONLY, perms);
8754 if (r < 0) {
8755 return r;
8756 }
8757 }
8758 r = _opendir(dirinode.get(), dirpp, perms);
8759 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8760 if (r != -CEPHFS_ENOTDIR) {
8761 tout(cct) << (uintptr_t)*dirpp << std::endl;
8762 }
8763 return r;
8764 }
8765
8766 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8767 {
8768 if (!in->is_dir())
8769 return -CEPHFS_ENOTDIR;
8770 *dirpp = new dir_result_t(in, perms);
8771 opened_dirs.insert(*dirpp);
8772 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8773 return 0;
8774 }
8775
8776
8777 int Client::closedir(dir_result_t *dir)
8778 {
8779 tout(cct) << __func__ << std::endl;
8780 tout(cct) << (uintptr_t)dir << std::endl;
8781
8782 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8783 std::scoped_lock lock(client_lock);
8784 _closedir(dir);
8785 return 0;
8786 }
8787
// Internal closedir: release the inode reference, drop any buffered
// dirents, unregister the handle and free it.  Caller holds client_lock.
// Order matters: 'dirp' must be deleted last.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
8800
8801 void Client::rewinddir(dir_result_t *dirp)
8802 {
8803 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8804
8805 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8806 if (!mref_reader.is_state_satisfied())
8807 return;
8808
8809 std::scoped_lock lock(client_lock);
8810 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8811 _readdir_drop_dirp_buffer(d);
8812 d->reset();
8813 }
8814
8815 loff_t Client::telldir(dir_result_t *dirp)
8816 {
8817 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8818 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8819 return d->offset;
8820 }
8821
// seekdir(3) analogue: reposition a readdir handle to 'offset'.
// Decides whether the buffered dirents (and the readdir-cache fill
// bookkeeping) can survive the seek, or must be dropped.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered listing: only a backward seek invalidates the buffer
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered listing: drop the buffer on rewind-to-start, when the
    // target falls in a different frag, or when seeking backwards within
    // the buffered frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
8856
8857
//struct dirent {
// ino_t d_ino; /* inode number */
// off_t d_off; /* offset to the next dirent */
// unsigned short d_reclen; /* length of this record */
// unsigned char d_type; /* type of file */
// char d_name[256]; /* filename */
//};
// Fill a struct dirent for one directory entry.  'type' is a mode_t-style
// value (converted to DT_* via IFTODT); 'next_off' is the offset of the
// entry that follows this one.  Names longer than 255 bytes are silently
// truncated.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';  // strncpy does not NUL-terminate at full length
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
8880
// Advance a readdir handle past the currently buffered frag.  If that
// frag was the rightmost one the listing is complete; otherwise move to
// the next frag and update the handle's offset accordingly.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // our dirfragtree may have changed; re-map to a live frag
    _readdir_rechoose_frag(dirp);
  }
}
8906
// Re-map the handle's current frag through the (possibly updated)
// dirfragtree; if the frag was split/merged, restart at the beginning
// of the frag that now covers it.  No-op in hash order, where offsets
// are frag-independent.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;  // 0 and 1 are reserved for "." and ".."
  }
}
8923
8924 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8925 {
8926 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8927 dirp->buffer.clear();
8928 }
8929
// Fetch one frag's worth of dirents from the MDS into dirp->buffer.
// Issues a READDIR (or LSSNAP for the .snap pseudo-dir) request; on
// EAGAIN (frag migrated) it re-chooses the frag and retries.  On other
// errors the handle is marked at-end.  Returns 0 or a negative error.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  // resume after the last name we returned, or (in hash order) at the
  // hash position encoded in the offset
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    // NOTE(review): recursion here is unbounded if the MDS keeps
    // returning EAGAIN — presumably rare in practice; confirm.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8984
// Comparator for std::lower_bound over the readdir cache: orders cached
// dentries by their fragment-aware readdir offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8990
// Serve a readdir from the locally cached dentries (requires the dir to
// be complete and ordered).  Invokes 'cb' once per entry with the lock
// dropped; returns 0 at end-of-dir, >0 if the callback asked to stop,
// <0 on error, or -CEPHFS_EAGAIN when the cache can no longer be
// trusted and the caller must fall back to fetching from the MDS.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // start at the first cached dentry at or beyond our current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // cache may have been invalidated while we were yielding the lock below
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // hand the callback a ref it is responsible for releasing
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock across the user callback; all dentry state captured above
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
9084
// Core readdir loop: iterate a directory, invoking 'cb' once per entry
// (with the client lock dropped).  Synthesizes "." and "..", then serves
// from the dentry cache when the dir is complete+ordered, otherwise
// fetches frags from the MDS.  Returns 0 at end-of-dir, the callback's
// positive value if it asked to stop, or a negative error.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 is the synthetic "." entry
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock across the user callback
    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 is the synthetic ".." entry (the dir itself when unparented)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // EAGAIN means the cache became unusable mid-listing; fall through
    // to the MDS fetch path below
    if (err != -CEPHFS_EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // emit every buffered entry at or beyond our current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      // drop the lock across the user callback
      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // full listing finished: if nothing changed under us, we can mark
    // the directory cache complete (and ordered) for future listings
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9282
9283
/*
 * POSIX readdir_r()-style wrapper: fill *de with the next entry of d.
 * Return value is whatever readdirplus_r() reports (1 got an entry,
 * 0 end of directory, <0 error).
 */
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  // plain readdir is readdirplus with no statx wanted and no inode out-arg
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
9288
9289 /*
9290 * readdirplus_r
9291 *
9292 * returns
9293 * 1 if we got a dirent
9294 * 0 for end of directory
9295 * <0 on error
9296 */
9297
// Carrier handed to _readdir_single_dirent_cb(): output slots for exactly
// one directory entry plus a flag recording that the slot was consumed.
struct single_readdir {
  struct dirent *de;       // destination dirent (filled on first callback)
  struct ceph_statx *stx;  // optional statx destination; may be NULL
  Inode *inode;            // inode pointer passed through by the callback
  bool full;               // true once the single entry has been delivered
};
9304
9305 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9306 struct ceph_statx *stx, off_t off,
9307 Inode *in)
9308 {
9309 single_readdir *c = static_cast<single_readdir *>(p);
9310
9311 if (c->full)
9312 return -1; // already filled this dirent
9313
9314 *c->de = *de;
9315 if (c->stx)
9316 *c->stx = *stx;
9317 c->inode = in;
9318 c->full = true;
9319 return 1;
9320 }
9321
9322 struct dirent *Client::readdir(dir_result_t *d)
9323 {
9324 int ret;
9325 auto& de = d->de;
9326 single_readdir sr;
9327 sr.de = &de;
9328 sr.stx = NULL;
9329 sr.inode = NULL;
9330 sr.full = false;
9331
9332 // our callback fills the dirent and sets sr.full=true on first
9333 // call, and returns -1 the second time around.
9334 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9335 if (ret < -1) {
9336 errno = -ret; // this sucks.
9337 return (dirent *) NULL;
9338 }
9339 if (sr.full) {
9340 return &de;
9341 }
9342 return (dirent *) NULL;
9343 }
9344
9345 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9346 struct ceph_statx *stx, unsigned want,
9347 unsigned flags, Inode **out)
9348 {
9349 single_readdir sr;
9350 sr.de = de;
9351 sr.stx = stx;
9352 sr.inode = NULL;
9353 sr.full = false;
9354
9355 // our callback fills the dirent and sets sr.full=true on first
9356 // call, and returns -1 the second time around.
9357 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9358 if (r < -1)
9359 return r;
9360 if (out)
9361 *out = sr.inode;
9362 if (sr.full)
9363 return 1;
9364 return 0;
9365 }
9366
9367
/* getdents */
// Accumulator for _getdents(): a caller-supplied flat buffer that entries
// are packed into, either as whole struct dirent records or as bare
// NUL-terminated names.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf, in bytes
  int pos;       // bytes written so far
  bool fullent;  // true: pack struct dirent; false: pack names only
};
9375
9376 static int _readdir_getdent_cb(void *p, struct dirent *de,
9377 struct ceph_statx *stx, off_t off, Inode *in)
9378 {
9379 struct getdents_result *c = static_cast<getdents_result *>(p);
9380
9381 int dlen;
9382 if (c->fullent)
9383 dlen = sizeof(*de);
9384 else
9385 dlen = strlen(de->d_name) + 1;
9386
9387 if (c->pos + dlen > c->buflen)
9388 return -1; // doesn't fit
9389
9390 if (c->fullent) {
9391 memcpy(c->buf + c->pos, de, sizeof(*de));
9392 } else {
9393 memcpy(c->buf + c->pos, de->d_name, dlen);
9394 }
9395 c->pos += dlen;
9396 return 0;
9397 }
9398
9399 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9400 {
9401 getdents_result gr;
9402 gr.buf = buf;
9403 gr.buflen = buflen;
9404 gr.fullent = fullent;
9405 gr.pos = 0;
9406
9407 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9408
9409 if (r < 0) { // some error
9410 if (r == -1) { // buffer ran out of space
9411 if (gr.pos) { // but we got some entries already!
9412 return gr.pos;
9413 } // or we need a larger buffer
9414 return -CEPHFS_ERANGE;
9415 } else { // actual error, return it
9416 return r;
9417 }
9418 }
9419 return gr.pos;
9420 }
9421
9422
9423 /* getdir */
9424 struct getdir_result {
9425 list<string> *contents;
9426 int num;
9427 };
9428
9429 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9430 {
9431 getdir_result *r = static_cast<getdir_result *>(p);
9432
9433 r->contents->push_back(de->d_name);
9434 r->num++;
9435 return 0;
9436 }
9437
/*
 * Read a whole directory in one call: open relpath, collect every entry
 * name into `contents`, and close it again.
 *
 * Returns the number of entries on success, <0 on error (from opendir or
 * from the readdir iteration).
 */
int Client::getdir(const char *relpath, list<string>& contents,
		   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  tout(cct) << "getdir" << std::endl;
  tout(cct) << relpath << std::endl;

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  // close before checking r so the dirp is released on every path
  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
9461
9462
9463 /****** file i/o **********/
9464
9465 // common parts for open and openat. call with client_lock locked.
// common parts for open and openat. call with client_lock locked.
//
// Walks `relpath` relative to `dirfd`, creating the final component when
// O_CREAT applies and it does not exist, then opens it and installs the
// resulting Fh into fd_map.  Returns the new integer fd on success, <0 on
// error.  The layout/striping arguments and alternate_name only matter on
// the create path.
int Client::create_and_open(int dirfd, const char *relpath, int flags,
			    const UserPerm& perms, mode_t mode, int stripe_unit,
			    int stripe_count, int object_size, const char *data_pool,
			    std::string alternate_name) {
  ceph_assert(ceph_mutex_is_locked(client_lock));
  int cflags = ceph_flags_sys2wire(flags);
  tout(cct) << cflags << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  InodeRef dirinode = nullptr;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // resolve the target; an existing file with O_CREAT|O_EXCL is an error
  r = path_walk(path, &in, perms, followsym, mask, dirinode);
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -CEPHFS_ELOOP;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: walk to the parent dir and
    // create the last path component there
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
    if (r < 0) {
      goto out;
    }
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms,
		std::move(alternate_name));
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  return r;
}
9552
/*
 * open(2)-style entry point: identical to openat() anchored at the
 * current working directory (CEPHFS_AT_FDCWD).
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool, std::string alternate_name)
{
  return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
		stripe_count, object_size, data_pool, alternate_name);
}
9560
/*
 * openat(2)-style entry point: resolve relpath relative to dirfd and
 * open (possibly creating) it.  Refuses when the client is not mounted,
 * then delegates the real work to create_and_open() under client_lock.
 * Returns the new fd on success, <0 on error.
 */
int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
		   mode_t mode, int stripe_unit, int stripe_count, int object_size,
		   const char *data_pool, std::string alternate_name) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << flags << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock locker(client_lock);
  int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
                          object_size, data_pool, alternate_name);

  tout(cct) << r << std::endl;
  ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
  return r;
}
9583
9584 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9585 const UserPerm& perms)
9586 {
9587 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9588
9589 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9590 if (!mref_reader.is_state_satisfied())
9591 return -CEPHFS_ENOTCONN;
9592
9593 std::scoped_lock lock(client_lock);
9594 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9595 filepath path(ino);
9596 req->set_filepath(path);
9597
9598 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9599 char f[30];
9600 sprintf(f, "%u", h);
9601 filepath path2(dirino);
9602 path2.push_dentry(string(f));
9603 req->set_filepath2(path2);
9604
9605 int r = make_request(req, perms, NULL, NULL,
9606 rand() % mdsmap->get_num_in_mds());
9607 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9608 return r;
9609 }
9610
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on
 * the resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
/*
 * Look up an inode by vinodeno via CEPH_MDS_OP_LOOKUPINO, pulling it into
 * the local inode_map.  When `inode` is non-NULL, *inode is set to the
 * cached Inode with an ll reference taken.  Reserved vino values are
 * rejected with -CEPHFS_ESTALE.  Caller must hold client_lock.
 */
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // a successful lookup must have populated inode_map
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
9652
9653 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9654 {
9655 vinodeno_t vino(ino, CEPH_NOSNAP);
9656 std::scoped_lock lock(client_lock);
9657 return _lookup_vino(vino, perms, inode);
9658 }
9659
9660 /**
9661 * Find the parent inode of `ino` and insert it into
9662 * our cache. Conditionally also set `parent` to a referenced
9663 * Inode* if caller provides non-NULL value.
9664 */
/*
 * Issue CEPH_MDS_OP_LOOKUPPARENT to find the parent of `ino` and cache
 * it.  When `parent` is non-NULL it receives the parent Inode with an ll
 * reference on success, or NULL on failure.  Caller must hold
 * client_lock.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9688
9689 /**
9690 * Populate the parent dentry for `ino`, provided it is
9691 * a child of `parent`.
9692 */
/*
 * Issue CEPH_MDS_OP_LOOKUPNAME to populate the dentry linking `ino`
 * under directory `parent`.  `parent` must be a directory (asserted).
 * Caller must hold client_lock.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9711
/*
 * Public locked wrapper around _lookup_name().
 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::scoped_lock lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
9717
/*
 * Allocate and initialize a new Fh for an already-opened inode: takes a
 * snap cap reference for snapshot inodes and configures the per-handle
 * readahead window from client_readahead_* settings and the file layout.
 * Returns the new handle (caller owns the reference).
 */
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes are immutable; track the open via snap_cap_refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // the max window is the smaller of the byte cap and the period cap
  // (when configured); NO_LIMIT if neither is set
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the layout period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
9749
/*
 * Tear down a file handle: drop its delegation, release open/snap cap
 * references (flushing dirty data and re-evaluating caps when the last
 * open ref of this mode goes away), release file locks, and surface any
 * asynchronous write-back error that accumulated on the handle.
 * Returns that async error (0 when none).
 */
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref of this mode: flush dirty data and let the MDS know
      // our wanted caps may have shrunk
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: just drop the snap cap reference taken in _create_fh
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
9784
9785 void Client::_put_fh(Fh *f)
9786 {
9787 int left = f->put();
9788 if (!left) {
9789 delete f;
9790 }
9791 }
9792
/*
 * Open an inode: register the open mode, satisfy the open from already-
 * held caps when possible, otherwise send CEPH_MDS_OP_OPEN to the MDS.
 * On success *fhp (if provided) receives a new Fh.  Write-style opens of
 * snapshot inodes are rejected with -CEPHFS_EROFS.  Caller must hold
 * client_lock.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs: no MDS round trip,
    // just let the MDS know our wanted set may have changed
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary handle just for the get_caps() wait
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
9873
/*
 * Re-acquire file caps for an inode after they lapsed: if we still hold
 * some caps (and either want no write caps or still have an auth cap)
 * a cap check suffices; otherwise replay an MDS OPEN with flags derived
 * from the currently wanted caps, using the best cached perms we have.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // derive open flags from the wanted RD/WR caps
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
9911
/*
 * Close a file descriptor: release its Fh, remove the fd_map entry and
 * recycle the fd number.  Returns the async error surfaced by
 * _release_fh() (0 on clean close), or -CEPHFS_EBADF for an unknown fd.
 * Caller must hold client_lock.
 */
int Client::_close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
9927
/*
 * Public close(2)-style entry point: check the mount state, then close
 * the fd under client_lock.
 */
int Client::close(int fd) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);
  return _close(fd);
}
9936
9937 // ------------
9938 // read, write
9939
/*
 * lseek(2)-style entry point: validate the mount state and the fd (an
 * O_PATH handle cannot seek), then delegate to _lseek() under
 * client_lock.  Returns the new file position or <0 on error.
 */
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _lseek(f, offset, whence);
}
9961
/*
 * Compute and set the new file position for a handle.
 *
 * SEEK_END, SEEK_DATA and SEEK_HOLE first refresh the size attribute from
 * the MDS.  Note SEEK_DATA/SEEK_HOLE here do not consult actual object
 * extents: any in-range offset counts as data (SEEK_DATA returns the
 * offset itself) and the only hole is at EOF (SEEK_HOLE returns the file
 * size); out-of-range offsets yield -CEPHFS_ENXIO.  A resulting negative
 * position is -CEPHFS_EINVAL.  Caller must hold client_lock.
 */
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // these modes depend on the current file size, so we must refresh it
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -CEPHFS_EINVAL;
  }

  if (pos < 0) {
    return -CEPHFS_EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
10035
10036
/*
 * Acquire the per-handle file-position lock, queueing FIFO behind any
 * existing holder/waiters.  Blocks on a condition variable tied to
 * client_lock (adopted, then released back) until the lock is free AND
 * this waiter is at the front of the queue, so wakeups happen in order.
 * Caller must hold client_lock; pair with unlock_fh_pos().
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // wrap the already-held client_lock so the condvar can sleep on it
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // hand ownership of client_lock back without unlocking it
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
10057
/*
 * Release the per-handle file-position lock and wake the oldest waiter
 * (FIFO order, matching the queue discipline in lock_fh_pos()).
 * Caller must hold client_lock.
 */
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
10070
/*
 * Migrate an inode's inline data out to its first RADOS object.
 *
 * Issues two OSD mutations on object <ino>.00000000: one that ensures the
 * object exists, and a second guarded write — a cmpxattr asserting the
 * stored "inline_version" is older than ours — that writes the inline
 * bytes at offset 0 and records the new version.  `onfinish` completes
 * when the guarded write does (immediately, with 0, if there is no inline
 * data at all).  Always returns 0.
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // step 1: make sure the backing object exists (no-op if it already does)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // step 2: guarded write — only applies if our inline_version is newer
  // than whatever is already recorded on the object
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
10115
10116 //
10117
10118 // blocking osd interface
10119
/*
 * pread(2)-style entry point: read up to `size` bytes at `offset` into
 * buf.  Sizes are clamped to INT_MAX so the int return cannot overflow.
 * The bufferlist-to-buf copy happens after dropping client_lock.
 * Returns bytes read or <0 on error.
 */
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::unique_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out without holding client_lock
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
10151
10152 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10153 {
10154 if (iovcnt < 0)
10155 return -CEPHFS_EINVAL;
10156 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10157 }
10158
/*
 * Core read path.  A negative offset means "read at the handle's current
 * position and advance it" (the fh position is locked for the duration).
 * After taking CEPH_CAP_FILE_RD (plus CACHE/LAZYIO as wanted), the read
 * is served from inline data, from the object cacher (async), or
 * synchronously from the OSDs; a short sync read triggers a size
 * re-check and retry until true EOF.  Returns bytes read or <0 on error.
 * Caller must hold client_lock.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // stateful read: use (and later advance) the fh position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown: fetch it before deciding how to read
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    // serve directly from the inline blob, zero-filling past its end
    uint32_t len = in->inline_data.length();
    uint64_t endoff = offset + size;
    if (endoff > in->size)
      endoff = in->size;

    if (offset < len) {
      if (endoff <= len) {
	bl->substr_of(in->inline_data, offset, endoff - offset);
      } else {
	bl->substr_of(in->inline_data, offset, len - offset);
	bl->append_zero(endoff - len);
      }
      rc = endoff - offset;
    } else if ((uint64_t)offset < endoff) {
      bl->append_zero(endoff - offset);
      rc = endoff - offset;
    } else {
      rc = 0;
    }
    goto success;
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // cached path: honor O_RSYNC by flushing the range first
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps, re-verify size with the MDS, and retry if
      // the file turned out to be longer than we thought
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
	auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
	if (r < 0) {
	  rc = r;
	  goto done;
	}
      }

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;

  ++nr_read_request;
  update_io_stat_read(lat);

done:
  // done!
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
10292
// Completion for background readahead: pins the Fh and counts itself as a
// pending readahead for the duration of the I/O.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
10298
// Undo the constructor: drop the pending-readahead count and the Fh ref.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10303
// Readahead I/O completed: release the RD|CACHE cap refs taken when the
// readahead was issued and account the bytes read (if any).
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10311
/*
 * Read through the object cacher.  The request is trimmed to the known
 * file size, client_lock is dropped while waiting for the cacher, and —
 * when readahead is enabled — a follow-on background read is kicked off
 * via C_Readahead for the window the Readahead engine suggests.
 * Returns bytes read (into *bl) or <0 on error.  Caller must hold
 * client_lock.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: hold a CACHE cap ref and wait without client_lock
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight: take cap refs that C_Readahead::finish drops
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
10369
/*
 * Synchronous (cache-bypassing) read straight from the OSDs via the
 * Filer, looping until `len` bytes are gathered.  A short OSD read
 * inside the known file size is zero-filled up to EOF; a short read at
 * the current size boundary sets *checkeof so the caller can re-verify
 * the size and retry.  Returns bytes placed in *bl or <0 on error.
 * Caller must hold client_lock (it is dropped around each OSD wait).
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return 0;
      }

      // hit the size boundary: ask the caller to re-verify the file size
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // wait for the OSD without holding client_lock
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10442
10443 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10444 {
10445 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10446 if (!mref_reader.is_state_satisfied())
10447 return -CEPHFS_ENOTCONN;
10448
10449 tout(cct) << "write" << std::endl;
10450 tout(cct) << fd << std::endl;
10451 tout(cct) << size << std::endl;
10452 tout(cct) << offset << std::endl;
10453
10454 std::scoped_lock lock(client_lock);
10455 Fh *fh = get_filehandle(fd);
10456 if (!fh)
10457 return -CEPHFS_EBADF;
10458 #if defined(__linux__) && defined(O_PATH)
10459 if (fh->flags & O_PATH)
10460 return -CEPHFS_EBADF;
10461 #endif
10462 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10463 size = std::min(size, (loff_t)INT_MAX);
10464 int r = _write(fh, offset, size, buf, NULL, false);
10465 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10466 return r;
10467 }
10468
10469 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10470 {
10471 if (iovcnt < 0)
10472 return -CEPHFS_EINVAL;
10473 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10474 }
10475
10476 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10477 unsigned iovcnt, int64_t offset,
10478 bool write, bool clamp_to_int)
10479 {
10480 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10481
10482 #if defined(__linux__) && defined(O_PATH)
10483 if (fh->flags & O_PATH)
10484 return -CEPHFS_EBADF;
10485 #endif
10486 loff_t totallen = 0;
10487 for (unsigned i = 0; i < iovcnt; i++) {
10488 totallen += iov[i].iov_len;
10489 }
10490
10491 /*
10492 * Some of the API functions take 64-bit size values, but only return
10493 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10494 * we don't do I/Os larger than the values we can return.
10495 */
10496 if (clamp_to_int) {
10497 totallen = std::min(totallen, (loff_t)INT_MAX);
10498 }
10499 if (write) {
10500 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10501 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10502 return w;
10503 } else {
10504 bufferlist bl;
10505 int64_t r = _read(fh, offset, totallen, &bl);
10506 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10507 if (r <= 0)
10508 return r;
10509
10510 client_lock.unlock();
10511 auto iter = bl.cbegin();
10512 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10513 /*
10514 * This piece of code aims to handle the case that bufferlist
10515 * does not have enough data to fill in the iov
10516 */
10517 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10518 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10519 resid -= round_size;
10520 /* iter is self-updating */
10521 }
10522 client_lock.lock();
10523 return r;
10524 }
10525 }
10526
10527 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10528 {
10529 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10530 if (!mref_reader.is_state_satisfied())
10531 return -CEPHFS_ENOTCONN;
10532
10533 tout(cct) << fd << std::endl;
10534 tout(cct) << offset << std::endl;
10535
10536 std::scoped_lock cl(client_lock);
10537 Fh *fh = get_filehandle(fd);
10538 if (!fh)
10539 return -CEPHFS_EBADF;
10540 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10541 }
10542
10543 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10544 const struct iovec *iov, int iovcnt)
10545 {
10546 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10547
10548 uint64_t fpos = 0;
10549 Inode *in = f->inode.get();
10550
10551 if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
10552 (uint64_t)(offset+size) > in->size ) { //exceeds filesize
10553 return -CEPHFS_EFBIG;
10554 }
10555 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10556
10557 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10558 return -CEPHFS_ENOSPC;
10559 }
10560
10561 ceph_assert(in->snapid == CEPH_NOSNAP);
10562
10563 // was Fh opened as writeable?
10564 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10565 return -CEPHFS_EBADF;
10566
10567 // use/adjust fd pos?
10568 if (offset < 0) {
10569 lock_fh_pos(f);
10570 /*
10571 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10572 * change out from under us.
10573 */
10574 if (f->flags & O_APPEND) {
10575 auto r = _lseek(f, 0, SEEK_END);
10576 if (r < 0) {
10577 unlock_fh_pos(f);
10578 return r;
10579 }
10580 }
10581 offset = f->pos;
10582 fpos = offset+size;
10583 unlock_fh_pos(f);
10584 }
10585
10586 // check quota
10587 uint64_t endoff = offset + size;
10588 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10589 f->actor_perms)) {
10590 return -CEPHFS_EDQUOT;
10591 }
10592
10593 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10594
10595 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10596
10597 // time it.
10598 utime_t start = ceph_clock_now();
10599
10600 if (in->inline_version == 0) {
10601 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10602 if (r < 0)
10603 return r;
10604 ceph_assert(in->inline_version > 0);
10605 }
10606
10607 // copy into fresh buffer (since our write may be resub, async)
10608 bufferlist bl;
10609 if (buf) {
10610 if (size > 0)
10611 bl.append(buf, size);
10612 } else if (iov){
10613 for (int i = 0; i < iovcnt; i++) {
10614 if (iov[i].iov_len > 0) {
10615 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10616 }
10617 }
10618 }
10619
10620 utime_t lat;
10621 uint64_t totalwritten;
10622 int want, have;
10623 if (f->mode & CEPH_FILE_MODE_LAZY)
10624 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10625 else
10626 want = CEPH_CAP_FILE_BUFFER;
10627 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10628 if (r < 0)
10629 return r;
10630
10631 /* clear the setuid/setgid bits, if any */
10632 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10633 struct ceph_statx stx = { 0 };
10634
10635 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10636 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10637 if (r < 0)
10638 return r;
10639 } else {
10640 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10641 }
10642
10643 if (f->flags & O_DIRECT)
10644 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10645
10646 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10647
10648 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10649
10650 if (in->inline_version < CEPH_INLINE_NONE) {
10651 if (endoff > cct->_conf->client_max_inline_size ||
10652 endoff > CEPH_INLINE_MAX_SIZE ||
10653 !(have & CEPH_CAP_FILE_BUFFER)) {
10654 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10655 uninline_data(in, onuninline.get());
10656 } else {
10657 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10658
10659 uint32_t len = in->inline_data.length();
10660
10661 if (endoff < len)
10662 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10663
10664 if (offset < len)
10665 in->inline_data.splice(offset, len - offset);
10666 else if (offset > len)
10667 in->inline_data.append_zero(offset - len);
10668
10669 in->inline_data.append(bl);
10670 in->inline_version++;
10671
10672 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10673
10674 goto success;
10675 }
10676 }
10677
10678 if (cct->_conf->client_oc &&
10679 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10680 // do buffered write
10681 if (!in->oset.dirty_or_tx)
10682 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10683
10684 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10685
10686 // async, caching, non-blocking.
10687 r = objectcacher->file_write(&in->oset, &in->layout,
10688 in->snaprealm->get_snap_context(),
10689 offset, size, bl, ceph::real_clock::now(),
10690 0);
10691 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10692
10693 if (r < 0)
10694 goto done;
10695
10696 // flush cached write if O_SYNC is set on file fh
10697 // O_DSYNC == O_SYNC on linux < 2.6.33
10698 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10699 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10700 _flush_range(in, offset, size);
10701 }
10702 } else {
10703 if (f->flags & O_DIRECT)
10704 _flush_range(in, offset, size);
10705
10706 // simple, non-atomic sync write
10707 C_SaferCond onfinish("Client::_write flock");
10708 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10709
10710 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10711 offset, size, bl, ceph::real_clock::now(), 0,
10712 in->truncate_size, in->truncate_seq,
10713 &onfinish);
10714 client_lock.unlock();
10715 r = onfinish.wait();
10716 client_lock.lock();
10717 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10718 if (r < 0)
10719 goto done;
10720 }
10721
10722 // if we get here, write was successful, update client metadata
10723 success:
10724 update_write_io_size(size);
10725 // time
10726 lat = ceph_clock_now();
10727 lat -= start;
10728
10729 ++nr_write_request;
10730 update_io_stat_write(lat);
10731
10732 if (fpos) {
10733 lock_fh_pos(f);
10734 f->pos = fpos;
10735 unlock_fh_pos(f);
10736 }
10737 totalwritten = size;
10738 r = (int64_t)totalwritten;
10739
10740 // extend file?
10741 if (totalwritten + offset > in->size) {
10742 in->size = totalwritten + offset;
10743 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10744
10745 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10746 check_caps(in, CHECK_CAPS_NODELAY);
10747 } else if (is_max_size_approaching(in)) {
10748 check_caps(in, 0);
10749 }
10750
10751 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10752 } else {
10753 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10754 }
10755
10756 // mtime
10757 in->mtime = in->ctime = ceph_clock_now();
10758 in->change_attr++;
10759 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10760
10761 done:
10762
10763 if (nullptr != onuninline) {
10764 client_lock.unlock();
10765 int uninline_ret = onuninline->wait();
10766 client_lock.lock();
10767
10768 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10769 in->inline_data.clear();
10770 in->inline_version = CEPH_INLINE_NONE;
10771 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10772 check_caps(in, 0);
10773 } else
10774 r = uninline_ret;
10775 }
10776
10777 put_cap_ref(in, CEPH_CAP_FILE_WR);
10778 return r;
10779 }
10780
10781 int Client::_flush(Fh *f)
10782 {
10783 Inode *in = f->inode.get();
10784 int err = f->take_async_err();
10785 if (err != 0) {
10786 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10787 << cpp_strerror(err) << dendl;
10788 } else {
10789 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10790 }
10791
10792 return err;
10793 }
10794
10795 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10796 {
10797 struct ceph_statx stx;
10798 stx.stx_size = length;
10799 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10800 }
10801
10802 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10803 {
10804 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10805 if (!mref_reader.is_state_satisfied())
10806 return -CEPHFS_ENOTCONN;
10807
10808 tout(cct) << __func__ << std::endl;
10809 tout(cct) << fd << std::endl;
10810 tout(cct) << length << std::endl;
10811
10812 std::scoped_lock lock(client_lock);
10813 Fh *f = get_filehandle(fd);
10814 if (!f)
10815 return -CEPHFS_EBADF;
10816 #if defined(__linux__) && defined(O_PATH)
10817 if (f->flags & O_PATH)
10818 return -CEPHFS_EBADF;
10819 #endif
10820 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10821 return -CEPHFS_EBADF;
10822 struct stat attr;
10823 attr.st_size = length;
10824 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10825 }
10826
10827 int Client::fsync(int fd, bool syncdataonly)
10828 {
10829 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10830 if (!mref_reader.is_state_satisfied())
10831 return -CEPHFS_ENOTCONN;
10832
10833 tout(cct) << "fsync" << std::endl;
10834 tout(cct) << fd << std::endl;
10835 tout(cct) << syncdataonly << std::endl;
10836
10837 std::scoped_lock lock(client_lock);
10838 Fh *f = get_filehandle(fd);
10839 if (!f)
10840 return -CEPHFS_EBADF;
10841 #if defined(__linux__) && defined(O_PATH)
10842 if (f->flags & O_PATH)
10843 return -CEPHFS_EBADF;
10844 #endif
10845 int r = _fsync(f, syncdataonly);
10846 if (r == 0) {
10847 // The IOs in this fsync were okay, but maybe something happened
10848 // in the background that we shoudl be reporting?
10849 r = f->take_async_err();
10850 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10851 << ") = 0, async_err = " << r << dendl;
10852 } else {
10853 // Assume that an error we encountered during fsync, even reported
10854 // synchronously, would also have applied the error to the Fh, and we
10855 // should clear it here to avoid returning the same error again on next
10856 // call.
10857 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10858 << r << dendl;
10859 f->take_async_err();
10860 }
10861 return r;
10862 }
10863
10864 int Client::_fsync(Inode *in, bool syncdataonly)
10865 {
10866 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10867
10868 int r = 0;
10869 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10870 ceph_tid_t flush_tid = 0;
10871 InodeRef tmp_ref;
10872 utime_t lat;
10873 utime_t start = ceph_clock_now();
10874
10875 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10876
10877 if (cct->_conf->client_oc) {
10878 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10879 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10880 _flush(in, object_cacher_completion.get());
10881 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10882 }
10883
10884 if (!syncdataonly && in->dirty_caps) {
10885 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10886 if (in->flushing_caps)
10887 flush_tid = last_flush_tid;
10888 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10889
10890 if (!syncdataonly && !in->unsafe_ops.empty()) {
10891 flush_mdlog_sync(in);
10892
10893 MetaRequest *req = in->unsafe_ops.back();
10894 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10895
10896 req->get();
10897 wait_on_list(req->waitfor_safe);
10898 put_request(req);
10899 }
10900
10901 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10902 client_lock.unlock();
10903 ldout(cct, 15) << "waiting on data to flush" << dendl;
10904 r = object_cacher_completion->wait();
10905 client_lock.lock();
10906 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10907 } else {
10908 // FIXME: this can starve
10909 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10910 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10911 << " uncommitted, waiting" << dendl;
10912 wait_on_list(in->waitfor_commit);
10913 }
10914 }
10915
10916 if (!r) {
10917 if (flush_tid > 0)
10918 wait_sync_caps(in, flush_tid);
10919
10920 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10921 } else {
10922 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10923 << cpp_strerror(-r) << dendl;
10924 }
10925
10926 lat = ceph_clock_now();
10927 lat -= start;
10928 logger->tinc(l_c_fsync, lat);
10929
10930 return r;
10931 }
10932
10933 int Client::_fsync(Fh *f, bool syncdataonly)
10934 {
10935 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10936 return _fsync(f->inode.get(), syncdataonly);
10937 }
10938
10939 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10940 {
10941 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10942 if (!mref_reader.is_state_satisfied())
10943 return -CEPHFS_ENOTCONN;
10944
10945 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10946 tout(cct) << fd << std::endl;
10947
10948 std::scoped_lock lock(client_lock);
10949 Fh *f = get_filehandle(fd);
10950 if (!f)
10951 return -CEPHFS_EBADF;
10952 int r = _getattr(f->inode, mask, perms);
10953 if (r < 0)
10954 return r;
10955 fill_stat(f->inode, stbuf, NULL);
10956 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10957 return r;
10958 }
10959
10960 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10961 unsigned int want, unsigned int flags)
10962 {
10963 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10964 if (!mref_reader.is_state_satisfied())
10965 return -CEPHFS_ENOTCONN;
10966
10967 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10968 tout(cct) << fd << std::endl;
10969
10970 std::scoped_lock lock(client_lock);
10971 Fh *f = get_filehandle(fd);
10972 if (!f)
10973 return -CEPHFS_EBADF;
10974
10975 unsigned mask = statx_to_mask(flags, want);
10976
10977 int r = 0;
10978 if (mask) {
10979 r = _getattr(f->inode, mask, perms);
10980 if (r < 0) {
10981 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10982 return r;
10983 }
10984 }
10985
10986 fill_statx(f->inode, mask, stx);
10987 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10988 return r;
10989 }
10990
10991 int Client::statxat(int dirfd, const char *relpath,
10992 struct ceph_statx *stx, const UserPerm& perms,
10993 unsigned int want, unsigned int flags) {
10994 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10995 if (!mref_reader.is_state_satisfied()) {
10996 return -CEPHFS_ENOTCONN;
10997 }
10998
10999 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
11000 tout(cct) << dirfd << std::endl;
11001 tout(cct) << relpath << std::endl;
11002
11003 unsigned mask = statx_to_mask(flags, want);
11004
11005 InodeRef dirinode;
11006 std::scoped_lock lock(client_lock);
11007 int r = get_fd_inode(dirfd, &dirinode);
11008 if (r < 0) {
11009 return r;
11010 }
11011
11012 InodeRef in;
11013 filepath path(relpath);
11014 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
11015 if (r < 0) {
11016 return r;
11017 }
11018 r = _getattr(in, mask, perms);
11019 if (r < 0) {
11020 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
11021 return r;
11022 }
11023
11024 fill_statx(in, mask, stx);
11025 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
11026 return r;
11027 }
11028
11029 // not written yet, but i want to link!
11030
11031 int Client::chdir(const char *relpath, std::string &new_cwd,
11032 const UserPerm& perms)
11033 {
11034 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11035 if (!mref_reader.is_state_satisfied())
11036 return -CEPHFS_ENOTCONN;
11037
11038 tout(cct) << "chdir" << std::endl;
11039 tout(cct) << relpath << std::endl;
11040
11041 filepath path(relpath);
11042 InodeRef in;
11043
11044 std::scoped_lock lock(client_lock);
11045 int r = path_walk(path, &in, perms);
11046 if (r < 0)
11047 return r;
11048
11049 if (!(in.get()->is_dir()))
11050 return -CEPHFS_ENOTDIR;
11051
11052 if (cwd != in)
11053 cwd.swap(in);
11054 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11055
11056 _getcwd(new_cwd, perms);
11057 return 0;
11058 }
11059
11060 void Client::_getcwd(string& dir, const UserPerm& perms)
11061 {
11062 filepath path;
11063 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
11064
11065 Inode *in = cwd.get();
11066 while (in != root.get()) {
11067 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
11068
11069 // A cwd or ancester is unlinked
11070 if (in->dentries.empty()) {
11071 return;
11072 }
11073
11074 Dentry *dn = in->get_first_parent();
11075
11076
11077 if (!dn) {
11078 // look it up
11079 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
11080 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
11081 filepath path(in->ino);
11082 req->set_filepath(path);
11083 req->set_inode(in);
11084 int res = make_request(req, perms);
11085 if (res < 0)
11086 break;
11087
11088 // start over
11089 path = filepath();
11090 in = cwd.get();
11091 continue;
11092 }
11093 path.push_front_dentry(dn->name);
11094 in = dn->dir->parent_inode;
11095 }
11096 dir = "/";
11097 dir += path.get_path();
11098 }
11099
11100 void Client::getcwd(string& dir, const UserPerm& perms)
11101 {
11102 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11103 if (!mref_reader.is_state_satisfied())
11104 return;
11105
11106 std::scoped_lock l(client_lock);
11107
11108 _getcwd(dir, perms);
11109 }
11110
11111 int Client::statfs(const char *path, struct statvfs *stbuf,
11112 const UserPerm& perms)
11113 {
11114 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11115 if (!mref_reader.is_state_satisfied())
11116 return -CEPHFS_ENOTCONN;
11117
11118 tout(cct) << __func__ << std::endl;
11119 unsigned long int total_files_on_fs;
11120
11121 ceph_statfs stats;
11122 C_SaferCond cond;
11123
11124 std::unique_lock lock(client_lock);
11125 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
11126 if (data_pools.size() == 1) {
11127 objecter->get_fs_stats(stats, data_pools[0], &cond);
11128 } else {
11129 objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
11130 }
11131
11132 lock.unlock();
11133 int rval = cond.wait();
11134 lock.lock();
11135
11136 ceph_assert(root);
11137 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
11138
11139 if (rval < 0) {
11140 ldout(cct, 1) << "underlying call to statfs returned error: "
11141 << cpp_strerror(rval)
11142 << dendl;
11143 return rval;
11144 }
11145
11146 memset(stbuf, 0, sizeof(*stbuf));
11147
11148 /*
11149 * we're going to set a block size of 4MB so we can represent larger
11150 * FSes without overflowing. Additionally convert the space
11151 * measurements from KB to bytes while making them in terms of
11152 * blocks. We use 4MB only because it is big enough, and because it
11153 * actually *is* the (ceph) default block size.
11154 */
11155 const int CEPH_BLOCK_SHIFT = 22;
11156 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
11157 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
11158 stbuf->f_files = total_files_on_fs;
11159 stbuf->f_ffree = -1;
11160 stbuf->f_favail = -1;
11161 stbuf->f_fsid = -1; // ??
11162 stbuf->f_flag = 0; // ??
11163 stbuf->f_namemax = NAME_MAX;
11164
11165 // Usually quota_root will == root_ancestor, but if the mount root has no
11166 // quota but we can see a parent of it that does have a quota, we'll
11167 // respect that one instead.
11168 ceph_assert(root != nullptr);
11169 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
11170
11171 // get_quota_root should always give us something if client quotas are
11172 // enabled
11173 ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);
11174
11175 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
11176
11177 // Skip the getattr if any sessions are stale, as we don't want to
11178 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11179 // is unhealthy.
11180 if (!_any_stale_sessions()) {
11181 int r = _getattr(quota_root, 0, perms, true);
11182 if (r != 0) {
11183 // Ignore return value: error getting latest inode metadata is not a good
11184 // reason to break "df".
11185 lderr(cct) << "Error in getattr on quota root 0x"
11186 << std::hex << quota_root->ino << std::dec
11187 << " statfs result may be outdated" << dendl;
11188 }
11189 }
11190
11191 // Special case: if there is a size quota set on the Inode acting
11192 // as the root for this client mount, then report the quota status
11193 // as the filesystem statistics.
11194 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
11195 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
11196 // It is possible for a quota to be exceeded: arithmetic here must
11197 // handle case where used > total.
11198 const fsblkcnt_t free = total > used ? total - used : 0;
11199
11200 stbuf->f_blocks = total;
11201 stbuf->f_bfree = free;
11202 stbuf->f_bavail = free;
11203 } else {
11204 // General case: report the cluster statistics returned from RADOS. Because
11205 // multiple pools may be used without one filesystem namespace via
11206 // layouts, this is the most correct thing we can do.
11207 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
11208 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11209 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11210 }
11211
11212 return rval;
11213 }
11214
11215 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
11216 struct flock *fl, uint64_t owner, bool removing)
11217 {
11218 ldout(cct, 10) << __func__ << " ino " << in->ino
11219 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
11220 << " type " << fl->l_type << " owner " << owner
11221 << " " << fl->l_start << "~" << fl->l_len << dendl;
11222
11223 if (in->flags & I_ERROR_FILELOCK)
11224 return -CEPHFS_EIO;
11225
11226 int lock_cmd;
11227 if (F_RDLCK == fl->l_type)
11228 lock_cmd = CEPH_LOCK_SHARED;
11229 else if (F_WRLCK == fl->l_type)
11230 lock_cmd = CEPH_LOCK_EXCL;
11231 else if (F_UNLCK == fl->l_type)
11232 lock_cmd = CEPH_LOCK_UNLOCK;
11233 else
11234 return -CEPHFS_EIO;
11235
11236 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
11237 sleep = 0;
11238
11239 /*
11240 * Set the most significant bit, so that MDS knows the 'owner'
11241 * is sufficient to identify the owner of lock. (old code uses
11242 * both 'owner' and 'pid')
11243 */
11244 owner |= (1ULL << 63);
11245
11246 MetaRequest *req = new MetaRequest(op);
11247 filepath path;
11248 in->make_nosnap_relative_path(path);
11249 req->set_filepath(path);
11250 req->set_inode(in);
11251
11252 req->head.args.filelock_change.rule = lock_type;
11253 req->head.args.filelock_change.type = lock_cmd;
11254 req->head.args.filelock_change.owner = owner;
11255 req->head.args.filelock_change.pid = fl->l_pid;
11256 req->head.args.filelock_change.start = fl->l_start;
11257 req->head.args.filelock_change.length = fl->l_len;
11258 req->head.args.filelock_change.wait = sleep;
11259
11260 int ret;
11261 bufferlist bl;
11262
11263 if (sleep && switch_interrupt_cb) {
11264 // enable interrupt
11265 switch_interrupt_cb(callback_handle, req->get());
11266 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11267 // disable interrupt
11268 switch_interrupt_cb(callback_handle, NULL);
11269 if (ret == 0 && req->aborted()) {
11270 // effect of this lock request has been revoked by the 'lock intr' request
11271 ret = req->get_abort_code();
11272 }
11273 put_request(req);
11274 } else {
11275 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11276 }
11277
11278 if (ret == 0) {
11279 if (op == CEPH_MDS_OP_GETFILELOCK) {
11280 ceph_filelock filelock;
11281 auto p = bl.cbegin();
11282 decode(filelock, p);
11283
11284 if (CEPH_LOCK_SHARED == filelock.type)
11285 fl->l_type = F_RDLCK;
11286 else if (CEPH_LOCK_EXCL == filelock.type)
11287 fl->l_type = F_WRLCK;
11288 else
11289 fl->l_type = F_UNLCK;
11290
11291 fl->l_whence = SEEK_SET;
11292 fl->l_start = filelock.start;
11293 fl->l_len = filelock.length;
11294 fl->l_pid = filelock.pid;
11295 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
11296 ceph_lock_state_t *lock_state;
11297 if (lock_type == CEPH_LOCK_FCNTL) {
11298 if (!in->fcntl_locks)
11299 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11300 lock_state = in->fcntl_locks.get();
11301 } else if (lock_type == CEPH_LOCK_FLOCK) {
11302 if (!in->flock_locks)
11303 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11304 lock_state = in->flock_locks.get();
11305 } else {
11306 ceph_abort();
11307 return -CEPHFS_EINVAL;
11308 }
11309 _update_lock_state(fl, owner, lock_state);
11310
11311 if (!removing) {
11312 if (lock_type == CEPH_LOCK_FCNTL) {
11313 if (!fh->fcntl_locks)
11314 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11315 lock_state = fh->fcntl_locks.get();
11316 } else {
11317 if (!fh->flock_locks)
11318 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11319 lock_state = fh->flock_locks.get();
11320 }
11321 _update_lock_state(fl, owner, lock_state);
11322 }
11323 } else
11324 ceph_abort();
11325 }
11326 return ret;
11327 }
11328
11329 int Client::_interrupt_filelock(MetaRequest *req)
11330 {
11331 // Set abort code, but do not kick. The abort code prevents the request
11332 // from being re-sent.
11333 req->abort(-CEPHFS_EINTR);
11334 if (req->mds < 0)
11335 return 0; // haven't sent the request
11336
11337 Inode *in = req->inode();
11338
11339 int lock_type;
11340 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11341 lock_type = CEPH_LOCK_FLOCK_INTR;
11342 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11343 lock_type = CEPH_LOCK_FCNTL_INTR;
11344 else {
11345 ceph_abort();
11346 return -CEPHFS_EINVAL;
11347 }
11348
11349 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11350 filepath path;
11351 in->make_nosnap_relative_path(path);
11352 intr_req->set_filepath(path);
11353 intr_req->set_inode(in);
11354 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11355 intr_req->head.args.filelock_change.rule = lock_type;
11356 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11357
11358 UserPerm perms(req->get_uid(), req->get_gid());
11359 return make_request(intr_req, perms, NULL, NULL, -1);
11360 }
11361
11362 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11363 {
11364 if (!in->fcntl_locks && !in->flock_locks)
11365 return;
11366
11367 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11368 encode(nr_fcntl_locks, bl);
11369 if (nr_fcntl_locks) {
11370 auto &lock_state = in->fcntl_locks;
11371 for(auto p = lock_state->held_locks.begin();
11372 p != lock_state->held_locks.end();
11373 ++p)
11374 encode(p->second, bl);
11375 }
11376
11377 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11378 encode(nr_flock_locks, bl);
11379 if (nr_flock_locks) {
11380 auto &lock_state = in->flock_locks;
11381 for(auto p = lock_state->held_locks.begin();
11382 p != lock_state->held_locks.end();
11383 ++p)
11384 encode(p->second, bl);
11385 }
11386
11387 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11388 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11389 }
11390
11391 void Client::_release_filelocks(Fh *fh)
11392 {
11393 if (!fh->fcntl_locks && !fh->flock_locks)
11394 return;
11395
11396 Inode *in = fh->inode.get();
11397 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
11398
11399 list<ceph_filelock> activated_locks;
11400
11401 list<pair<int, ceph_filelock> > to_release;
11402
11403 if (fh->fcntl_locks) {
11404 auto &lock_state = fh->fcntl_locks;
11405 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11406 auto q = p++;
11407 if (in->flags & I_ERROR_FILELOCK) {
11408 lock_state->remove_lock(q->second, activated_locks);
11409 } else {
11410 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11411 }
11412 }
11413 lock_state.reset();
11414 }
11415 if (fh->flock_locks) {
11416 auto &lock_state = fh->flock_locks;
11417 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11418 auto q = p++;
11419 if (in->flags & I_ERROR_FILELOCK) {
11420 lock_state->remove_lock(q->second, activated_locks);
11421 } else {
11422 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11423 }
11424 }
11425 lock_state.reset();
11426 }
11427
11428 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11429 in->flags &= ~I_ERROR_FILELOCK;
11430
11431 if (to_release.empty())
11432 return;
11433
11434 struct flock fl;
11435 memset(&fl, 0, sizeof(fl));
11436 fl.l_whence = SEEK_SET;
11437 fl.l_type = F_UNLCK;
11438
11439 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11440 p != to_release.end();
11441 ++p) {
11442 fl.l_start = p->second.start;
11443 fl.l_len = p->second.length;
11444 fl.l_pid = p->second.pid;
11445 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11446 p->second.owner, true);
11447 }
11448 }
11449
11450 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11451 ceph_lock_state_t *lock_state)
11452 {
11453 int lock_cmd;
11454 if (F_RDLCK == fl->l_type)
11455 lock_cmd = CEPH_LOCK_SHARED;
11456 else if (F_WRLCK == fl->l_type)
11457 lock_cmd = CEPH_LOCK_EXCL;
11458 else
11459 lock_cmd = CEPH_LOCK_UNLOCK;;
11460
11461 ceph_filelock filelock;
11462 filelock.start = fl->l_start;
11463 filelock.length = fl->l_len;
11464 filelock.client = 0;
11465 // see comment in _do_filelock()
11466 filelock.owner = owner | (1ULL << 63);
11467 filelock.pid = fl->l_pid;
11468 filelock.type = lock_cmd;
11469
11470 if (filelock.type == CEPH_LOCK_UNLOCK) {
11471 list<ceph_filelock> activated_locks;
11472 lock_state->remove_lock(filelock, activated_locks);
11473 } else {
11474 bool r = lock_state->add_lock(filelock, false, false, NULL);
11475 ceph_assert(r);
11476 }
11477 }
11478
11479 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11480 {
11481 Inode *in = fh->inode.get();
11482 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11483 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11484 return ret;
11485 }
11486
11487 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11488 {
11489 Inode *in = fh->inode.get();
11490 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11491 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11492 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11493 return ret;
11494 }
11495
11496 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11497 {
11498 Inode *in = fh->inode.get();
11499 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11500
11501 int sleep = !(cmd & LOCK_NB);
11502 cmd &= ~LOCK_NB;
11503
11504 int type;
11505 switch (cmd) {
11506 case LOCK_SH:
11507 type = F_RDLCK;
11508 break;
11509 case LOCK_EX:
11510 type = F_WRLCK;
11511 break;
11512 case LOCK_UN:
11513 type = F_UNLCK;
11514 break;
11515 default:
11516 return -CEPHFS_EINVAL;
11517 }
11518
11519 struct flock fl;
11520 memset(&fl, 0, sizeof(fl));
11521 fl.l_type = type;
11522 fl.l_whence = SEEK_SET;
11523
11524 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11525 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11526 return ret;
11527 }
11528
11529 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11530 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11531 if (!mref_reader.is_state_satisfied()) {
11532 return -CEPHFS_ENOTCONN;
11533 }
11534
11535 std::scoped_lock lock(client_lock);
11536 InodeRef in;
11537 int r = Client::path_walk(path, &in, perms, true);
11538 if (r < 0) {
11539 return r;
11540 }
11541
11542 if (in->snapid == CEPH_NOSNAP) {
11543 return -CEPHFS_EINVAL;
11544 }
11545
11546 snap_info->id = in->snapid;
11547 snap_info->metadata = in->snap_metadata;
11548 return 0;
11549 }
11550
11551 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11552 {
11553 /* Since the only thing this does is wrap a call to statfs, and
11554 statfs takes a lock, it doesn't seem we have a need to split it
11555 out. */
11556 return statfs(0, stbuf, perms);
11557 }
11558
11559 void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
11560 {
11561 if (!args)
11562 return;
11563
11564 ldout(cct, 10) << __func__ << " cb " << args->handle
11565 << " invalidate_ino_cb " << args->ino_cb
11566 << " invalidate_dentry_cb " << args->dentry_cb
11567 << " switch_interrupt_cb " << args->switch_intr_cb
11568 << " remount_cb " << args->remount_cb
11569 << dendl;
11570 callback_handle = args->handle;
11571 if (args->ino_cb) {
11572 ino_invalidate_cb = args->ino_cb;
11573 async_ino_invalidator.start();
11574 }
11575 if (args->dentry_cb) {
11576 dentry_invalidate_cb = args->dentry_cb;
11577 async_dentry_invalidator.start();
11578 }
11579 if (args->switch_intr_cb) {
11580 switch_interrupt_cb = args->switch_intr_cb;
11581 interrupt_finisher.start();
11582 }
11583 if (args->remount_cb) {
11584 remount_cb = args->remount_cb;
11585 remount_finisher.start();
11586 }
11587 if (args->ino_release_cb) {
11588 ino_release_cb = args->ino_release_cb;
11589 async_ino_releasor.start();
11590 }
11591 if (args->umask_cb)
11592 umask_cb = args->umask_cb;
11593 }
11594
11595 // This is deprecated, use ll_register_callbacks2() instead.
11596 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11597 {
11598 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11599
11600 _ll_register_callbacks(args);
11601 }
11602
11603 int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11604 {
11605 if (is_mounting() || is_mounted() || is_unmounting())
11606 return -CEPHFS_EBUSY;
11607
11608 _ll_register_callbacks(args);
11609 return 0;
11610 }
11611
11612 std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
11613 {
11614 std::pair <int, bool> r(0, false);
11615
11616 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11617 if (!iref_reader.is_state_satisfied())
11618 return std::make_pair(-CEPHFS_ENOTCONN, false);
11619
11620 can_invalidate_dentries = can_invalidate;
11621
11622 /*
11623 * Force to use the old and slow method to invalidate the dcache
11624 * if the euid is non-root, or the remount may fail with return
11625 * code 1 or 32.
11626 */
11627 uid_t euid = geteuid();
11628 ldout(cct, 10) << "euid: " << euid << dendl;
11629 if (euid != 0) {
11630 can_invalidate_dentries = true;
11631 }
11632
11633 if (can_invalidate_dentries) {
11634 ceph_assert(dentry_invalidate_cb);
11635 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11636 } else {
11637 ceph_assert(remount_cb);
11638 ldout(cct, 1) << "using remount_cb" << dendl;
11639 r = _do_remount(false);
11640 }
11641
11642 return r;
11643 }
11644
// Flush all dirty file data and dirty caps and wait until both are stable.
// Caller must hold client_lock; it is temporarily dropped while waiting
// for the objectcacher flush.
int Client::_sync_fs()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // snapshot the flush tid now: we only wait for flushes issued up to this
  // point, not for ones started by other threads afterwards
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocking on the data flush so the flush
    // completion (and other client work) can make progress
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
11677
11678 int Client::sync_fs()
11679 {
11680 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11681 if (!mref_reader.is_state_satisfied())
11682 return -CEPHFS_ENOTCONN;
11683
11684 std::scoped_lock l(client_lock);
11685
11686 return _sync_fs();
11687 }
11688
11689 int64_t Client::drop_caches()
11690 {
11691 std::scoped_lock l(client_lock);
11692 return objectcacher->release_all();
11693 }
11694
11695 int Client::_lazyio(Fh *fh, int enable)
11696 {
11697 Inode *in = fh->inode.get();
11698 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11699
11700 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11701 return 0;
11702
11703 int orig_mode = fh->mode;
11704 if (enable) {
11705 fh->mode |= CEPH_FILE_MODE_LAZY;
11706 in->get_open_ref(fh->mode);
11707 in->put_open_ref(orig_mode);
11708 check_caps(in, CHECK_CAPS_NODELAY);
11709 } else {
11710 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11711 in->get_open_ref(fh->mode);
11712 in->put_open_ref(orig_mode);
11713 check_caps(in, 0);
11714 }
11715
11716 return 0;
11717 }
11718
11719 int Client::lazyio(int fd, int enable)
11720 {
11721 std::scoped_lock l(client_lock);
11722 Fh *f = get_filehandle(fd);
11723 if (!f)
11724 return -CEPHFS_EBADF;
11725
11726 return _lazyio(f, enable);
11727 }
11728
11729 int Client::ll_lazyio(Fh *fh, int enable)
11730 {
11731 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11732 tout(cct) << __func__ << std::endl;
11733
11734 std::scoped_lock lock(client_lock);
11735 return _lazyio(fh, enable);
11736 }
11737
// Propagate locally buffered lazy-io writes covering [offset, offset+count).
// Currently implemented as a full fsync of the handle; offset/count are
// accepted for API compatibility but not consulted.
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
        << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // for now
  // NOTE(review): the _fsync() result is discarded, so flush errors are
  // never reported to the caller — confirm this best-effort is intended.
  _fsync(f, true);

  return 0;
}
11753
11754 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11755 {
11756 std::scoped_lock l(client_lock);
11757 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11758 << ", " << offset << ", " << count << ")" << dendl;
11759
11760 Fh *f = get_filehandle(fd);
11761 if (!f)
11762 return -CEPHFS_EBADF;
11763 Inode *in = f->inode.get();
11764
11765 _fsync(f, true);
11766 if (_release(in)) {
11767 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11768 if (r < 0)
11769 return r;
11770 }
11771 return 0;
11772 }
11773
11774
11775 // =============================
11776 // snaps
11777
11778 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11779 mode_t mode, const std::map<std::string, std::string> &metadata)
11780 {
11781 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11782 if (!mref_reader.is_state_satisfied())
11783 return -CEPHFS_ENOTCONN;
11784
11785 std::scoped_lock l(client_lock);
11786
11787 filepath path(relpath);
11788 InodeRef in;
11789 int r = path_walk(path, &in, perm);
11790 if (r < 0)
11791 return r;
11792 if (cct->_conf->client_permissions) {
11793 r = may_create(in.get(), perm);
11794 if (r < 0)
11795 return r;
11796 }
11797 Inode *snapdir = open_snapdir(in.get());
11798 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11799 }
11800
11801 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11802 {
11803 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11804 if (!mref_reader.is_state_satisfied())
11805 return -CEPHFS_ENOTCONN;
11806
11807 std::scoped_lock l(client_lock);
11808
11809 filepath path(relpath);
11810 InodeRef in;
11811 int r = path_walk(path, &in, perms);
11812 if (r < 0)
11813 return r;
11814 Inode *snapdir = open_snapdir(in.get());
11815 if (cct->_conf->client_permissions) {
11816 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11817 if (r < 0)
11818 return r;
11819 }
11820 return _rmdir(snapdir, name, perms);
11821 }
11822
11823 // =============================
11824 // expose caps
11825
11826 int Client::get_caps_issued(int fd)
11827 {
11828 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11829 if (!mref_reader.is_state_satisfied())
11830 return -CEPHFS_ENOTCONN;
11831
11832 std::scoped_lock lock(client_lock);
11833
11834 Fh *f = get_filehandle(fd);
11835 if (!f)
11836 return -CEPHFS_EBADF;
11837
11838 return f->inode->caps_issued();
11839 }
11840
11841 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11842 {
11843 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11844 if (!mref_reader.is_state_satisfied())
11845 return -CEPHFS_ENOTCONN;
11846
11847 std::scoped_lock lock(client_lock);
11848
11849 filepath p(path);
11850 InodeRef in;
11851 int r = path_walk(p, &in, perms, true);
11852 if (r < 0)
11853 return r;
11854 return in->caps_issued();
11855 }
11856
11857 // =========================================
11858 // low level
11859
// Populate a snapdir inode's attributes by mirroring its parent directory.
// The snapdir shares the parent's ino but is tagged with CEPH_SNAPDIR.
void Client::refresh_snapdir_attrs(Inode *in, Inode *diri) {
  ldout(cct, 10) << __func__ << ": snapdir inode=" << *in
                 << ", inode=" << *diri << dendl;
  in->ino = diri->ino;
  in->snapid = CEPH_SNAPDIR;
  in->mode = diri->mode;
  in->uid = diri->uid;
  in->gid = diri->gid;
  in->nlink = 1;
  in->mtime = diri->mtime;
  in->ctime = diri->ctime;
  in->btime = diri->btime;
  in->atime = diri->atime;
  in->size = diri->size;
  in->change_attr = diri->change_attr;

  in->dirfragtree.clear();
  in->snapdir_parent = diri;
  // copy posix acls to snapshotted inode
  // (every "system." xattr is mirrored, which covers the posix acl keys)
  in->xattrs.clear();
  for (auto &[xattr_key, xattr_value] : diri->xattrs) {
    if (xattr_key.rfind("system.", 0) == 0) {
      in->xattrs[xattr_key] = xattr_value;
    }
  }
}
11886
11887 Inode *Client::open_snapdir(Inode *diri)
11888 {
11889 Inode *in;
11890 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11891 if (!inode_map.count(vino)) {
11892 in = new Inode(this, vino, &diri->layout);
11893 refresh_snapdir_attrs(in, diri);
11894 diri->flags |= I_SNAPDIR_OPEN;
11895 inode_map[vino] = in;
11896 if (use_faked_inos())
11897 _assign_faked_ino(in);
11898 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11899 } else {
11900 in = inode_map[vino];
11901 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11902 }
11903 return in;
11904 }
11905
11906 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11907 Inode **out, const UserPerm& perms)
11908 {
11909 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11910 if (!mref_reader.is_state_satisfied())
11911 return -CEPHFS_ENOTCONN;
11912
11913 vinodeno_t vparent = _get_vino(parent);
11914 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11915 tout(cct) << __func__ << std::endl;
11916 tout(cct) << name << std::endl;
11917
11918 std::scoped_lock lock(client_lock);
11919
11920 int r = 0;
11921 if (!fuse_default_permissions) {
11922 if (strcmp(name, ".") && strcmp(name, "..")) {
11923 r = may_lookup(parent, perms);
11924 if (r < 0)
11925 return r;
11926 }
11927 }
11928
11929 string dname(name);
11930 InodeRef in;
11931
11932 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11933 if (r < 0) {
11934 attr->st_ino = 0;
11935 goto out;
11936 }
11937
11938 ceph_assert(in);
11939 fill_stat(in, attr);
11940 _ll_get(in.get());
11941
11942 out:
11943 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11944 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11945 tout(cct) << attr->st_ino << std::endl;
11946 *out = in.get();
11947 return r;
11948 }
11949
// Resolve a vinodeno to an Inode*, taking an ll reference for the caller.
// Snapdir vinos are resolved via the head inode and then swapped for the
// cached snapdir inode.
int Client::ll_lookup_vino(
    vinodeno_t vino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // reserved vinos can never resolve to a real inode
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  std::scoped_lock lock(client_lock);
  ldout(cct, 3) << __func__ << " " << vino << dendl;

  // Check the cache first
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p != inode_map.end()) {
    *inode = p->second;
    _ll_get(*inode);
    return 0;
  }

  uint64_t snapid = vino.snapid;

  // for snapdir, find the non-snapped dir inode
  if (snapid == CEPH_SNAPDIR)
    vino.snapid = CEPH_NOSNAP;

  int r = _lookup_vino(vino, perms, inode);
  if (r)
    return r;
  ceph_assert(*inode != NULL);

  if (snapid == CEPH_SNAPDIR) {
    Inode *tmp = *inode;

    // open the snapdir and put the inode ref
    // (moves the caller's ll reference from the head inode to its snapdir)
    *inode = open_snapdir(tmp);
    _ll_forget(tmp, 1);
    _ll_get(*inode);
  }
  return 0;
}
11995
11996 int Client::ll_lookup_inode(
11997 struct inodeno_t ino,
11998 const UserPerm& perms,
11999 Inode **inode)
12000 {
12001 vinodeno_t vino(ino, CEPH_NOSNAP);
12002 return ll_lookup_vino(vino, perms, inode);
12003 }
12004
12005 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
12006 struct ceph_statx *stx, unsigned want, unsigned flags,
12007 const UserPerm& perms)
12008 {
12009 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12010 if (!mref_reader.is_state_satisfied())
12011 return -CEPHFS_ENOTCONN;
12012
12013 vinodeno_t vparent = _get_vino(parent);
12014 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
12015 tout(cct) << "ll_lookupx" << std::endl;
12016 tout(cct) << name << std::endl;
12017
12018 std::scoped_lock lock(client_lock);
12019
12020 int r = 0;
12021 if (!fuse_default_permissions) {
12022 r = may_lookup(parent, perms);
12023 if (r < 0)
12024 return r;
12025 }
12026
12027 string dname(name);
12028 InodeRef in;
12029
12030 unsigned mask = statx_to_mask(flags, want);
12031 r = _lookup(parent, dname, mask, &in, perms);
12032 if (r < 0) {
12033 stx->stx_ino = 0;
12034 stx->stx_mask = 0;
12035 } else {
12036 ceph_assert(in);
12037 fill_statx(in, mask, stx);
12038 _ll_get(in.get());
12039 }
12040
12041 ldout(cct, 3) << __func__ << " " << vparent << " " << name
12042 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12043 tout(cct) << stx->stx_ino << std::endl;
12044 *out = in.get();
12045 return r;
12046 }
12047
12048 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
12049 unsigned int want, unsigned int flags, const UserPerm& perms)
12050 {
12051 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12052 if (!mref_reader.is_state_satisfied())
12053 return -CEPHFS_ENOTCONN;
12054
12055 filepath fp(name, 0);
12056 InodeRef in;
12057 int rc;
12058 unsigned mask = statx_to_mask(flags, want);
12059
12060 ldout(cct, 3) << __func__ << " " << name << dendl;
12061 tout(cct) << __func__ << std::endl;
12062 tout(cct) << name << std::endl;
12063
12064 std::scoped_lock lock(client_lock);
12065 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
12066 if (rc < 0) {
12067 /* zero out mask, just in case... */
12068 stx->stx_mask = 0;
12069 stx->stx_ino = 0;
12070 *out = NULL;
12071 return rc;
12072 } else {
12073 ceph_assert(in);
12074 fill_statx(in, mask, stx);
12075 _ll_get(in.get());
12076 *out = in.get();
12077 return 0;
12078 }
12079 }
12080
// Take an ll (libcephfs/fuse-visible) reference on an inode.  The first
// ll ref also takes an internal iget ref, pins the parent dentry for a
// directory, and bumps the per-snapshot refcount.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->iget();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
12095
// Drop @num ll references from an inode.  When the ll refcount hits zero,
// undo everything _ll_get() set up on the first ref (dentry pin, snap ref)
// and release the internal inode ref.  Returns the remaining ll refcount.
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
12118
// Drop every outstanding ll reference (e.g. at unmount).  Inodes are held
// in to_be_put until the loop completes so put_inode() side effects cannot
// free them mid-iteration.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // advance before _ll_put(), which may erase the current map entry
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
12136
12137 bool Client::_ll_forget(Inode *in, uint64_t count)
12138 {
12139 inodeno_t ino = in->ino;
12140
12141 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
12142 tout(cct) << __func__ << std::endl;
12143 tout(cct) << ino.val << std::endl;
12144 tout(cct) << count << std::endl;
12145
12146 // Ignore forget if we're no longer mounted
12147 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12148 if (!mref_reader.is_state_satisfied())
12149 return true;
12150
12151 if (ino == 1) return true; // ignore forget on root.
12152
12153 bool last = false;
12154 if (in->ll_ref < count) {
12155 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
12156 << ", which only has ll_ref=" << in->ll_ref << dendl;
12157 _ll_put(in, in->ll_ref);
12158 last = true;
12159 } else {
12160 if (_ll_put(in, count) == 0)
12161 last = true;
12162 }
12163
12164 return last;
12165 }
12166
12167 bool Client::ll_forget(Inode *in, uint64_t count)
12168 {
12169 std::scoped_lock lock(client_lock);
12170 return _ll_forget(in, count);
12171 }
12172
12173 bool Client::ll_put(Inode *in)
12174 {
12175 /* ll_forget already takes the lock */
12176 return ll_forget(in, 1);
12177 }
12178
12179 int Client::ll_get_snap_ref(snapid_t snap)
12180 {
12181 std::scoped_lock lock(client_lock);
12182 auto p = ll_snap_ref.find(snap);
12183 if (p != ll_snap_ref.end())
12184 return p->second;
12185 return 0;
12186 }
12187
12188 snapid_t Client::ll_get_snapid(Inode *in)
12189 {
12190 std::scoped_lock lock(client_lock);
12191 return in->snapid;
12192 }
12193
12194 Inode *Client::ll_get_inode(ino_t ino)
12195 {
12196 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12197 if (!mref_reader.is_state_satisfied())
12198 return NULL;
12199
12200 std::scoped_lock lock(client_lock);
12201
12202 vinodeno_t vino = _map_faked_ino(ino);
12203 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12204 if (p == inode_map.end())
12205 return NULL;
12206 Inode *in = p->second;
12207 _ll_get(in);
12208 return in;
12209 }
12210
12211 Inode *Client::ll_get_inode(vinodeno_t vino)
12212 {
12213 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12214 if (!mref_reader.is_state_satisfied())
12215 return NULL;
12216
12217 if (is_reserved_vino(vino))
12218 return NULL;
12219
12220 std::scoped_lock lock(client_lock);
12221
12222 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12223 if (p == inode_map.end())
12224 return NULL;
12225 Inode *in = p->second;
12226 _ll_get(in);
12227 return in;
12228 }
12229
12230 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12231 {
12232 vinodeno_t vino = _get_vino(in);
12233
12234 ldout(cct, 8) << __func__ << " " << vino << dendl;
12235 tout(cct) << __func__ << std::endl;
12236 tout(cct) << vino.ino.val << std::endl;
12237
12238 if (vino.snapid < CEPH_NOSNAP)
12239 return 0;
12240 else
12241 return _getattr(in, caps, perms);
12242 }
12243
12244 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12245 {
12246 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12247 if (!mref_reader.is_state_satisfied())
12248 return -CEPHFS_ENOTCONN;
12249
12250 std::scoped_lock lock(client_lock);
12251
12252 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12253
12254 if (res == 0)
12255 fill_stat(in, attr);
12256 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12257 return res;
12258 }
12259
12260 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12261 unsigned int flags, const UserPerm& perms)
12262 {
12263 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12264 if (!mref_reader.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN;
12266
12267 std::scoped_lock lock(client_lock);
12268
12269 int res = 0;
12270 unsigned mask = statx_to_mask(flags, want);
12271
12272 if (mask && !in->caps_issued_mask(mask, true))
12273 res = _ll_getattr(in, mask, perms);
12274
12275 if (res == 0)
12276 fill_statx(in, mask, stx);
12277 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12278 return res;
12279 }
12280
// Core setattr: trace the request, enforce permissions (unless FUSE does
// it), strip the *_NOW convenience bits, and apply via __setattrx().
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // the *_NOW bits were only needed for the permission check above
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
12309
12310 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12311 const UserPerm& perms)
12312 {
12313 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12314 if (!mref_reader.is_state_satisfied())
12315 return -CEPHFS_ENOTCONN;
12316
12317 std::scoped_lock lock(client_lock);
12318
12319 InodeRef target(in);
12320 int res = _ll_setattrx(in, stx, mask, perms, &target);
12321 if (res == 0) {
12322 ceph_assert(in == target.get());
12323 fill_statx(in, in->caps_issued(), stx);
12324 }
12325
12326 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12327 return res;
12328 }
12329
12330 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12331 const UserPerm& perms)
12332 {
12333 struct ceph_statx stx;
12334 stat_to_statx(attr, &stx);
12335
12336 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12337 if (!mref_reader.is_state_satisfied())
12338 return -CEPHFS_ENOTCONN;
12339
12340 std::scoped_lock lock(client_lock);
12341
12342 InodeRef target(in);
12343 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12344 if (res == 0) {
12345 ceph_assert(in == target.get());
12346 fill_stat(in, attr);
12347 }
12348
12349 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12350 return res;
12351 }
12352
12353
12354 // ----------
12355 // xattrs
12356
12357 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12358 const UserPerm& perms)
12359 {
12360 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12361 if (!mref_reader.is_state_satisfied())
12362 return -CEPHFS_ENOTCONN;
12363
12364 std::scoped_lock lock(client_lock);
12365
12366 InodeRef in;
12367 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12368 if (r < 0)
12369 return r;
12370 return _getxattr(in, name, value, size, perms);
12371 }
12372
12373 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12374 const UserPerm& perms)
12375 {
12376 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12377 if (!mref_reader.is_state_satisfied())
12378 return -CEPHFS_ENOTCONN;
12379
12380 std::scoped_lock lock(client_lock);
12381
12382 InodeRef in;
12383 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12384 if (r < 0)
12385 return r;
12386 return _getxattr(in, name, value, size, perms);
12387 }
12388
12389 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12390 const UserPerm& perms)
12391 {
12392 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12393 if (!mref_reader.is_state_satisfied())
12394 return -CEPHFS_ENOTCONN;
12395
12396 std::scoped_lock lock(client_lock);
12397
12398 Fh *f = get_filehandle(fd);
12399 if (!f)
12400 return -CEPHFS_EBADF;
12401 return _getxattr(f->inode, name, value, size, perms);
12402 }
12403
12404 int Client::listxattr(const char *path, char *list, size_t size,
12405 const UserPerm& perms)
12406 {
12407 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12408 if (!mref_reader.is_state_satisfied())
12409 return -CEPHFS_ENOTCONN;
12410
12411 std::scoped_lock lock(client_lock);
12412
12413 InodeRef in;
12414 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12415 if (r < 0)
12416 return r;
12417 return Client::_listxattr(in.get(), list, size, perms);
12418 }
12419
12420 int Client::llistxattr(const char *path, char *list, size_t size,
12421 const UserPerm& perms)
12422 {
12423 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12424 if (!mref_reader.is_state_satisfied())
12425 return -CEPHFS_ENOTCONN;
12426
12427 std::scoped_lock lock(client_lock);
12428
12429 InodeRef in;
12430 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12431 if (r < 0)
12432 return r;
12433 return Client::_listxattr(in.get(), list, size, perms);
12434 }
12435
12436 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12437 {
12438 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12439 if (!mref_reader.is_state_satisfied())
12440 return -CEPHFS_ENOTCONN;
12441
12442 std::scoped_lock lock(client_lock);
12443
12444 Fh *f = get_filehandle(fd);
12445 if (!f)
12446 return -CEPHFS_EBADF;
12447 return Client::_listxattr(f->inode.get(), list, size, perms);
12448 }
12449
12450 int Client::removexattr(const char *path, const char *name,
12451 const UserPerm& perms)
12452 {
12453 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12454 if (!mref_reader.is_state_satisfied())
12455 return -CEPHFS_ENOTCONN;
12456
12457 std::scoped_lock lock(client_lock);
12458
12459 InodeRef in;
12460 int r = Client::path_walk(path, &in, perms, true);
12461 if (r < 0)
12462 return r;
12463 return _removexattr(in, name, perms);
12464 }
12465
12466 int Client::lremovexattr(const char *path, const char *name,
12467 const UserPerm& perms)
12468 {
12469 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12470 if (!mref_reader.is_state_satisfied())
12471 return -CEPHFS_ENOTCONN;
12472
12473 std::scoped_lock lock(client_lock);
12474
12475 InodeRef in;
12476 int r = Client::path_walk(path, &in, perms, false);
12477 if (r < 0)
12478 return r;
12479 return _removexattr(in, name, perms);
12480 }
12481
12482 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12483 {
12484 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12485 if (!mref_reader.is_state_satisfied())
12486 return -CEPHFS_ENOTCONN;
12487
12488 std::scoped_lock lock(client_lock);
12489
12490 Fh *f = get_filehandle(fd);
12491 if (!f)
12492 return -CEPHFS_EBADF;
12493 return _removexattr(f->inode, name, perms);
12494 }
12495
12496 int Client::setxattr(const char *path, const char *name, const void *value,
12497 size_t size, int flags, const UserPerm& perms)
12498 {
12499 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12500 if (!mref_reader.is_state_satisfied())
12501 return -CEPHFS_ENOTCONN;
12502
12503 _setxattr_maybe_wait_for_osdmap(name, value, size);
12504
12505 std::scoped_lock lock(client_lock);
12506
12507 InodeRef in;
12508 int r = Client::path_walk(path, &in, perms, true);
12509 if (r < 0)
12510 return r;
12511 return _setxattr(in, name, value, size, flags, perms);
12512 }
12513
12514 int Client::lsetxattr(const char *path, const char *name, const void *value,
12515 size_t size, int flags, const UserPerm& perms)
12516 {
12517 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12518 if (!mref_reader.is_state_satisfied())
12519 return -CEPHFS_ENOTCONN;
12520
12521 _setxattr_maybe_wait_for_osdmap(name, value, size);
12522
12523 std::scoped_lock lock(client_lock);
12524
12525 InodeRef in;
12526 int r = Client::path_walk(path, &in, perms, false);
12527 if (r < 0)
12528 return r;
12529 return _setxattr(in, name, value, size, flags, perms);
12530 }
12531
12532 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12533 int flags, const UserPerm& perms)
12534 {
12535 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12536 if (!mref_reader.is_state_satisfied())
12537 return -CEPHFS_ENOTCONN;
12538
12539 _setxattr_maybe_wait_for_osdmap(name, value, size);
12540
12541 std::scoped_lock lock(client_lock);
12542
12543 Fh *f = get_filehandle(fd);
12544 if (!f)
12545 return -CEPHFS_EBADF;
12546 return _setxattr(f->inode, name, value, size, flags, perms);
12547 }
12548
// Fetch one extended attribute of *in into value/size.  Virtual "ceph.*"
// attributes are resolved first via the vxattr tables (or forwarded to the
// MDS as a getvxattr op); everything else is served from the cached xattr
// map, refreshed through _getattr() when needed.  size == 0 is a
// length-only probe.  Returns the value length, or a negative CEPHFS error
// (-CEPHFS_ENODATA, -CEPHFS_ERANGE, ...).
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;
  const VXattr *vxattr = nullptr;

  vxattr = _match_vxattr(in, name);
  if (vxattr) {
    // NOTE(review): this assignment is dead — r is unconditionally
    // overwritten by the _getattr() call below.
    r = -CEPHFS_ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    char buf[256];
    // Run the getter only when there is no exists_cb, or the exists_cb
    // reports this vxattr as present on the inode.
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -CEPHFS_ENODATA;
    }

    // Copy out unless this was a length-only probe; a too-small caller
    // buffer yields -CEPHFS_ERANGE.
    if (size != 0) {
      if (r > (int)size) {
	r = -CEPHFS_ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // A "ceph.*" name with no local table entry: ask the MDS.
  if (!strncmp(name, "ceph.", 5)) {
    r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
    goto out;
  }

  // Without ACL support, "system.*" attributes are unavailable.
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -CEPHFS_EOPNOTSUPP;
    goto out;
  }

  // Regular xattr: force a fetch only if we have never seen an xattr
  // version for this inode, then look it up in the cached map.
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -CEPHFS_ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -CEPHFS_ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
12620
12621 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12622 const UserPerm& perms)
12623 {
12624 if (cct->_conf->client_permissions) {
12625 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12626 if (r < 0)
12627 return r;
12628 }
12629 return _getxattr(in.get(), name, value, size, perms);
12630 }
12631
12632 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12633 size_t size, const UserPerm& perms)
12634 {
12635 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12636 if (!mref_reader.is_state_satisfied())
12637 return -CEPHFS_ENOTCONN;
12638
12639 vinodeno_t vino = _get_vino(in);
12640
12641 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12642 tout(cct) << __func__ << std::endl;
12643 tout(cct) << vino.ino.val << std::endl;
12644 tout(cct) << name << std::endl;
12645
12646 std::scoped_lock lock(client_lock);
12647 if (!fuse_default_permissions) {
12648 int r = xattr_permission(in, name, MAY_READ, perms);
12649 if (r < 0)
12650 return r;
12651 }
12652
12653 return _getxattr(in, name, value, size, perms);
12654 }
12655
// Fill |name| with the NUL-separated xattr names of *in, skipping internal
// "ceph.*" names.  size == 0 requests only the total length needed.
// Returns bytes written (or needed), -CEPHFS_ERANGE when the buffer is too
// small, or a _getattr() error.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh the cached xattr map only if it has never been fetched.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    // Never report internal "ceph.*" xattrs to the user.
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue;
    }

    size_t this_len = xattr_name.length() + 1;  // +1 for the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    // Copy the name including its NUL and advance the output cursor.
    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  // NOTE: |size| logged here has already been decremented by the copies above.
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
12689
12690 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12691 const UserPerm& perms)
12692 {
12693 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12694 if (!mref_reader.is_state_satisfied())
12695 return -CEPHFS_ENOTCONN;
12696
12697 vinodeno_t vino = _get_vino(in);
12698
12699 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12700 tout(cct) << __func__ << std::endl;
12701 tout(cct) << vino.ino.val << std::endl;
12702 tout(cct) << size << std::endl;
12703
12704 std::scoped_lock lock(client_lock);
12705 return _listxattr(in, names, size, perms);
12706 }
12707
// Issue a CEPH_MDS_OP_SETXATTR request for *in.  A null |value| means
// remove; XATTR_CREATE/XATTR_REPLACE are translated to the corresponding
// wire flags.  Returns the MDS reply code.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The attribute value travels in the request's data payload.
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
12740
// Set an xattr on *in after client-side policy checks: snapshot inodes are
// read-only; only the user./security./trusted./ceph. namespaces (plus
// system.* when POSIX ACLs are enabled) are accepted; ACL values are
// validated and may be folded into the file mode; read-only vxattrs are
// rejected.  After setting a ceph.quota* value, verify a snaprealm was
// created for the quota inode.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: an empty value is allowed, NULL with a nonzero size is not.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same namespace restrictions as the kernel client.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// ret == 0 means the ACL is equivalent to the mode bits alone, so
	// the xattr itself can be dropped (value = NULL -> remove).
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE(review): stx is uninitialized apart from stx_mode;
	  // presumably _do_setattr reads only the fields selected by the
	  // CEPH_SETATTR_MODE mask — confirm.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	// ret == 0: empty ACL, store nothing (value = NULL -> remove).
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      // Setting (not removing) a quota requires a post-check below.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
12821
12822 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12823 size_t size, int flags, const UserPerm& perms)
12824 {
12825 if (cct->_conf->client_permissions) {
12826 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12827 if (r < 0)
12828 return r;
12829 }
12830 return _setxattr(in.get(), name, value, size, flags, perms);
12831 }
12832
// Validate the data pool named by a "layout"/"layout.pool" xattr value
// against |osdmap|.  Returns 0 when the value parses and any named pool
// exists, -CEPHFS_EINVAL on parse failure, and -CEPHFS_ENOENT when the
// pool is unknown to this osdmap.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout string: parse "key=value ..." pairs and pull out "pool".
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p; // create instance of parser
    std::map<string, string> m; // map to receive results
    if (!qi::parse(begin, end, p, m)) { // returns true if successful
      return -CEPHFS_EINVAL;
    }
    // Reject trailing unparsed input.
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // A numeric value is treated as a pool id ...
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      // ... otherwise it is resolved as a pool name.
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
12872
// Before a layout/layout.pool setxattr reaches the MDS, make sure our
// osdmap knows the named data pool; if it does not, block until the latest
// osdmap arrives so a freshly created pool is not spuriously rejected.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // Strip the "ceph.<type>." prefix: rest is "layout" or "layout.pool".
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Unknown pool: wait for a newer osdmap and let the MDS decide.
    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
12895
12896 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12897 size_t size, int flags, const UserPerm& perms)
12898 {
12899 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12900 if (!mref_reader.is_state_satisfied())
12901 return -CEPHFS_ENOTCONN;
12902
12903 _setxattr_maybe_wait_for_osdmap(name, value, size);
12904
12905 vinodeno_t vino = _get_vino(in);
12906
12907 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12908 tout(cct) << __func__ << std::endl;
12909 tout(cct) << vino.ino.val << std::endl;
12910 tout(cct) << name << std::endl;
12911
12912 std::scoped_lock lock(client_lock);
12913 if (!fuse_default_permissions) {
12914 int r = xattr_permission(in, name, MAY_WRITE, perms);
12915 if (r < 0)
12916 return r;
12917 }
12918 return _setxattr(in, name, value, size, flags, perms);
12919 }
12920
// Remove xattr |name| from *in via a CEPH_MDS_OP_RMXATTR request.
// Snapshot inodes are read-only; only the namespaces the kernel client
// accepts may be removed, and read-only vxattrs are rejected.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  // The attribute name travels as the request's second path.
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
12952
12953 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12954 {
12955 if (cct->_conf->client_permissions) {
12956 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12957 if (r < 0)
12958 return r;
12959 }
12960 return _removexattr(in.get(), name, perms);
12961 }
12962
12963 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12964 {
12965 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12966 if (!mref_reader.is_state_satisfied())
12967 return -CEPHFS_ENOTCONN;
12968
12969 vinodeno_t vino = _get_vino(in);
12970
12971 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12972 tout(cct) << "ll_removexattr" << std::endl;
12973 tout(cct) << vino.ino.val << std::endl;
12974 tout(cct) << name << std::endl;
12975
12976 std::scoped_lock lock(client_lock);
12977 if (!fuse_default_permissions) {
12978 int r = xattr_permission(in, name, MAY_WRITE, perms);
12979 if (r < 0)
12980 return r;
12981 }
12982
12983 return _removexattr(in, name, perms);
12984 }
12985
12986 bool Client::_vxattrcb_quota_exists(Inode *in)
12987 {
12988 return in->quota.is_enable() &&
12989 (in->snapid != CEPH_NOSNAP ||
12990 (in->snaprealm && in->snaprealm->ino == in->ino));
12991 }
12992 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12993 {
12994 return snprintf(val, size,
12995 "max_bytes=%lld max_files=%lld",
12996 (long long int)in->quota.max_bytes,
12997 (long long int)in->quota.max_files);
12998 }
12999 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
13000 {
13001 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
13002 }
13003 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
13004 {
13005 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
13006 }
13007
13008 bool Client::_vxattrcb_layout_exists(Inode *in)
13009 {
13010 return in->layout != file_layout_t();
13011 }
// Render the full layout string: "stripe_unit=N stripe_count=N
// object_size=N pool=<name|id>[ pool_namespace=NS]".
// NOTE(review): if r ever reached size, the size_t expression size - r
// below would underflow; callers pass a 256-byte scratch buffer, so this
// presumably never triggers — confirm.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  // Print the pool by name when the osdmap knows it, else numerically.
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Namespace is appended only when non-default.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
13032 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
13033 {
13034 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
13035 }
13036 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
13037 {
13038 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
13039 }
13040 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
13041 {
13042 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
13043 }
13044 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
13045 {
13046 size_t r;
13047 objecter->with_osdmap([&](const OSDMap& o) {
13048 if (o.have_pg_pool(in->layout.pool_id))
13049 r = snprintf(val, size, "%s", o.get_pool_name(
13050 in->layout.pool_id).c_str());
13051 else
13052 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
13053 });
13054 return r;
13055 }
13056 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
13057 {
13058 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
13059 }
13060 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
13061 {
13062 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
13063 }
13064 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
13065 {
13066 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
13067 }
13068 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
13069 {
13070 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
13071 }
13072 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
13073 {
13074 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
13075 }
13076 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
13077 {
13078 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
13079 }
13080 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
13081 {
13082 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
13083 }
13084 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
13085 {
13086 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
13087 }
13088 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
13089 {
13090 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
13091 }
13092 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
13093 {
13094 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
13095 (long)in->rstat.rctime.nsec());
13096 }
13097 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13098 {
13099 return in->dir_pin != -CEPHFS_ENODATA;
13100 }
13101 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13102 {
13103 return snprintf(val, size, "%ld", (long)in->dir_pin);
13104 }
13105
13106 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13107 {
13108 return !in->snap_btime.is_zero();
13109 }
13110
13111 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13112 {
13113 return snprintf(val, size, "%llu.%09lu",
13114 (long long unsigned)in->snap_btime.sec(),
13115 (long unsigned)in->snap_btime.nsec());
13116 }
13117
13118 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13119 {
13120 int issued;
13121
13122 in->caps_issued(&issued);
13123 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13124 }
13125
13126 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13127 {
13128 // checking one of the xattrs would suffice
13129 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13130 }
13131
// Compose "ceph.mirror.info" from the two underlying raw xattrs.
// NOTE(review): map operator[] default-inserts a missing key, so if only
// cluster_id is present an empty fs_id entry is created in in->xattrs —
// confirm this side effect is intended.
size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
		  in->xattrs["ceph.mirror.info.cluster_id"].length(),
		  in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
		  in->xattrs["ceph.mirror.info.fs_id"].length(),
		  in->xattrs["ceph.mirror.info.fs_id"].c_str());
}
13140
13141 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13142 {
13143 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13144 }
13145
13146 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13147 {
13148 auto name = messenger->get_myname();
13149 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
13150 }
13151
// Helpers for building entries of the vxattr tables below.
// CEPH_XATTR_NAME/CEPH_XATTR_NAME2 glue "ceph.<type>.<name>[.<name2>]";
// the *_FIELD/*_CEPH macros expand to a complete VXattr initializer
// (GNU designated-initializer syntax).
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only vxattr whose getter is _vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name, _flags)                   \
{                                                               \
  name: CEPH_XATTR_NAME(_type, _name),                          \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
  readonly: true,                                               \
  exists_cb: NULL,                                              \
  flags: _flags,                                                \
}
// Writable layout sub-field; exists only for a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)                \
{                                                               \
  name: CEPH_XATTR_NAME2(_type, _name, _field),                 \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,     \
  readonly: false,                                              \
  exists_cb: &Client::_vxattrcb_layout_exists,                  \
  flags: 0,                                                     \
}
// Writable quota sub-field; exists only when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)                         \
{                                                               \
  name: CEPH_XATTR_NAME(_type, _name),                          \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
  readonly: false,                                              \
  exists_cb: &Client::_vxattrcb_quota_exists,                   \
  flags: 0,                                                     \
}
13179
// Virtual xattrs exposed on directory inodes; consulted by _match_vxattr().
// The table must end with the empty-name terminator entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // FIXME
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // FIXME
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13245
// Virtual xattrs exposed on regular-file inodes; consulted by
// _match_vxattr().  The table must end with the empty-name terminator.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13275
// Virtual xattrs available on every inode type; checked by _match_vxattr()
// after the per-type tables.  Must end with the empty-name terminator.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13293
13294 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13295 {
13296 if (in->is_dir())
13297 return _dir_vxattrs;
13298 else if (in->is_file())
13299 return _file_vxattrs;
13300 return NULL;
13301 }
13302
13303 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13304 {
13305 if (strncmp(name, "ceph.", 5) == 0) {
13306 const VXattr *vxattr = _get_vxattrs(in);
13307 if (vxattr) {
13308 while (!vxattr->name.empty()) {
13309 if (vxattr->name == name)
13310 return vxattr;
13311 vxattr++;
13312 }
13313 }
13314
13315 // for common vxattrs
13316 vxattr = _common_vxattrs;
13317 while (!vxattr->name.empty()) {
13318 if (vxattr->name == name)
13319 return vxattr;
13320 vxattr++;
13321 }
13322 }
13323
13324 return NULL;
13325 }
13326
13327 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
13328 {
13329 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13330 if (!mref_reader.is_state_satisfied())
13331 return -CEPHFS_ENOTCONN;
13332
13333 vinodeno_t vino = _get_vino(in);
13334
13335 ldout(cct, 3) << "ll_readlink " << vino << dendl;
13336 tout(cct) << "ll_readlink" << std::endl;
13337 tout(cct) << vino.ino.val << std::endl;
13338
13339 std::scoped_lock lock(client_lock);
13340 for (auto dn : in->dentries) {
13341 touch_dn(dn);
13342 }
13343
13344 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
13345 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
13346 return r;
13347 }
13348
// Create a node named |name| under *dir with the given |mode| and |rdev|
// via a CEPH_MDS_OP_MKNOD request.  On success, *inp (if non-null) refers
// to the new inode.  Returns 0 or a negative CEPHFS error.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // Enforce the max_files quota of the enclosing quota realm.
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Dentry cap handling: drop FILE_SHARED unless FILE_EXCL is held (the
  // usual pattern for namespace-modifying requests).
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default POSIX ACLs from the parent; this may also adjust
  // |mode|, so it must run before mknod.mode is filled in.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Early-error path: release the request we allocated above.
  put_request(req);
  return res;
}
13402
// libcephfs low-level mknod: create a node under *parent and, on success,
// fill *attr with its stat and return a referenced inode in *out.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  // Check create permission here when the FUSE layer is not doing so.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Take an ll reference so the inode stays pinned for the caller.
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr->st_ino is whatever the caller passed in
  // and *out is set from the empty InodeRef — callers must check r first.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13439
// statx variant of ll_mknod(): create a node under *parent and, on
// success, fill *stx (fields selected by want/flags) and return a
// referenced inode in *out.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Translate the statx want/flags into a cap mask for fill_statx().
  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  // Check create permission here when the FUSE layer is not doing so.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    // Take an ll reference so the inode stays pinned for the caller.
    _ll_get(in.get());
  }
  // NOTE(review): on failure stx->stx_ino is whatever the caller passed in
  // and *out is set from the empty InodeRef — callers must check r first.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13480
/**
 * Create (and implicitly open) a regular file via CEPH_MDS_OP_CREATE.
 *
 * @param dir          parent directory inode
 * @param name         new file name (rejected if longer than NAME_MAX)
 * @param flags        POSIX open flags (translated to wire flags below)
 * @param mode         permission bits; S_IFREG is OR'd in before sending
 * @param inp          [out] inode of the created file
 * @param fhp          if non-NULL, an open Fh for the new file is stored here
 * @param stripe_unit  file layout: stripe unit (0 = default)
 * @param stripe_count file layout: stripe count (0 = default)
 * @param object_size  file layout: object size (0 = default)
 * @param data_pool    optional data pool name; resolved to a pool id
 * @param created      [out] filled by make_request(): whether the MDS actually
 *                     created the inode (vs. the name already existing)
 * @param perms        caller credentials
 * @param alternate_name alternate dentry name stored alongside `name`
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
                    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
                    int object_size, const char *data_pool, bool *created,
                    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  // snapshots are read-only: no creates under a snapped directory
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve the requested data pool name to a pool id, if one was given
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    // the wire field for the pool is only 32 bits wide
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  bufferlist xattrs_bl;
  // default ACLs inherited from the parent may adjust `mode` and produce
  // xattr data that is sent with the request
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  // make_request() consumes our ref on req, success or failure
  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
                << " layout " << stripe_unit
                << ' ' << stripe_count
                << ' ' << object_size
                <<") = " << res << dendl;
  return res;

  // only reached before make_request(); we still own the request ref here
 fail:
  put_request(req);
  return res;
}
13576
/**
 * Create a directory (CEPH_MDS_OP_MKDIR), or a snapshot when `dir` is the
 * magic snapdir (CEPH_MDS_OP_MKSNAP).
 *
 * @param dir      parent directory (or snapdir) inode
 * @param name     new directory / snapshot name (<= NAME_MAX)
 * @param mode     permission bits; S_IFDIR is OR'd in before sending
 * @param perm     caller credentials
 * @param inp      [out] resulting inode
 * @param metadata snapshot metadata, only used for the MKSNAP case
 * @param alternate_name alternate dentry name stored alongside `name`
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
                   InodeRef *inp, const std::map<std::string, std::string> &metadata,
                   std::string alternate_name)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
                << mode << dec << ", uid " << perm.uid()
                << ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // writable targets are only live dirs and the snapdir (for mksnap)
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
  MetaRequest *req = new MetaRequest(is_snap_op ?
                                     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->set_alternate_name(std::move(alternate_name));

  mode |= S_IFDIR;
  bufferlist bl;
  // inherited default ACLs may adjust `mode` and populate `bl`
  int res = _posix_acl_create(dir, &mode, bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (is_snap_op) {
    SnapPayload payload;
    // clear the bufferlist that may have been populated by the call
    // to _posix_acl_create(). MDS mksnap does not make use of it.
    // So, reuse it to pass metadata payload.
    bl.clear();
    payload.metadata = metadata;
    encode(payload, bl);
  }
  if (bl.length() > 0) {
    req->set_data(bl);
  }

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  // make_request() consumes our ref on req
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

  // only reached before make_request(); drop our request ref
 fail:
  put_request(req);
  return res;
}
13646
13647 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13648 struct stat *attr, Inode **out, const UserPerm& perm)
13649 {
13650 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13651 if (!mref_reader.is_state_satisfied())
13652 return -CEPHFS_ENOTCONN;
13653
13654 vinodeno_t vparent = _get_vino(parent);
13655
13656 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13657 tout(cct) << "ll_mkdir" << std::endl;
13658 tout(cct) << vparent.ino.val << std::endl;
13659 tout(cct) << name << std::endl;
13660 tout(cct) << mode << std::endl;
13661
13662 std::scoped_lock lock(client_lock);
13663
13664 if (!fuse_default_permissions) {
13665 int r = may_create(parent, perm);
13666 if (r < 0)
13667 return r;
13668 }
13669
13670 InodeRef in;
13671 int r = _mkdir(parent, name, mode, perm, &in);
13672 if (r == 0) {
13673 fill_stat(in, attr);
13674 _ll_get(in.get());
13675 }
13676 tout(cct) << attr->st_ino << std::endl;
13677 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13678 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13679 *out = in.get();
13680 return r;
13681 }
13682
13683 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13684 struct ceph_statx *stx, unsigned want, unsigned flags,
13685 const UserPerm& perms)
13686 {
13687 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13688 if (!mref_reader.is_state_satisfied())
13689 return -CEPHFS_ENOTCONN;
13690
13691 vinodeno_t vparent = _get_vino(parent);
13692
13693 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13694 tout(cct) << "ll_mkdirx" << std::endl;
13695 tout(cct) << vparent.ino.val << std::endl;
13696 tout(cct) << name << std::endl;
13697 tout(cct) << mode << std::endl;
13698
13699 std::scoped_lock lock(client_lock);
13700
13701 if (!fuse_default_permissions) {
13702 int r = may_create(parent, perms);
13703 if (r < 0)
13704 return r;
13705 }
13706
13707 InodeRef in;
13708 int r = _mkdir(parent, name, mode, perms, &in);
13709 if (r == 0) {
13710 fill_statx(in, statx_to_mask(flags, want), stx);
13711 _ll_get(in.get());
13712 } else {
13713 stx->stx_ino = 0;
13714 stx->stx_mask = 0;
13715 }
13716 tout(cct) << stx->stx_ino << std::endl;
13717 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13718 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13719 *out = in.get();
13720 return r;
13721 }
13722
/**
 * Create a symbolic link via CEPH_MDS_OP_SYMLINK.
 *
 * @param dir    parent directory inode
 * @param name   link name (<= NAME_MAX)
 * @param target link target path (sent verbatim in string2)
 * @param perms  caller credentials
 * @param alternate_name alternate dentry name stored alongside `name`
 * @param inp    [out] resulting inode
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
                     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
                << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
                << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  // the link target travels in the request's second string
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  // make_request() consumes our ref on req
  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

  // only reached before make_request(); drop our request ref
 fail:
  put_request(req);
  return res;
}
13769
13770 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13771 struct stat *attr, Inode **out, const UserPerm& perms)
13772 {
13773 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13774 if (!mref_reader.is_state_satisfied())
13775 return -CEPHFS_ENOTCONN;
13776
13777 vinodeno_t vparent = _get_vino(parent);
13778
13779 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13780 << dendl;
13781 tout(cct) << "ll_symlink" << std::endl;
13782 tout(cct) << vparent.ino.val << std::endl;
13783 tout(cct) << name << std::endl;
13784 tout(cct) << value << std::endl;
13785
13786 std::scoped_lock lock(client_lock);
13787
13788 if (!fuse_default_permissions) {
13789 int r = may_create(parent, perms);
13790 if (r < 0)
13791 return r;
13792 }
13793
13794 InodeRef in;
13795 int r = _symlink(parent, name, value, perms, "", &in);
13796 if (r == 0) {
13797 fill_stat(in, attr);
13798 _ll_get(in.get());
13799 }
13800 tout(cct) << attr->st_ino << std::endl;
13801 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13802 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13803 *out = in.get();
13804 return r;
13805 }
13806
13807 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13808 Inode **out, struct ceph_statx *stx, unsigned want,
13809 unsigned flags, const UserPerm& perms)
13810 {
13811 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13812 if (!mref_reader.is_state_satisfied())
13813 return -CEPHFS_ENOTCONN;
13814
13815 vinodeno_t vparent = _get_vino(parent);
13816
13817 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13818 << dendl;
13819 tout(cct) << "ll_symlinkx" << std::endl;
13820 tout(cct) << vparent.ino.val << std::endl;
13821 tout(cct) << name << std::endl;
13822 tout(cct) << value << std::endl;
13823
13824 std::scoped_lock lock(client_lock);
13825
13826 if (!fuse_default_permissions) {
13827 int r = may_create(parent, perms);
13828 if (r < 0)
13829 return r;
13830 }
13831
13832 InodeRef in;
13833 int r = _symlink(parent, name, value, perms, "", &in);
13834 if (r == 0) {
13835 fill_statx(in, statx_to_mask(flags, want), stx);
13836 _ll_get(in.get());
13837 }
13838 tout(cct) << stx->stx_ino << std::endl;
13839 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13840 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13841 *out = in.get();
13842 return r;
13843 }
13844
/**
 * Unlink a name from a directory via CEPH_MDS_OP_UNLINK.
 *
 * @param dir  parent directory inode
 * @param name entry to remove
 * @param perm caller credentials
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
                << " uid " << perm.uid() << " gid " << perm.gid()
                << ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being removed so delegations can be recalled and
  // its link caps dropped along with the request
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  // make_request() consumes our ref on req
  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

  // only reached before make_request(); drop our request ref
 fail:
  put_request(req);
  return res;
}
13894
13895 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13896 {
13897 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13898 if (!mref_reader.is_state_satisfied())
13899 return -CEPHFS_ENOTCONN;
13900
13901 vinodeno_t vino = _get_vino(in);
13902
13903 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13904 tout(cct) << "ll_unlink" << std::endl;
13905 tout(cct) << vino.ino.val << std::endl;
13906 tout(cct) << name << std::endl;
13907
13908 std::scoped_lock lock(client_lock);
13909
13910 if (!fuse_default_permissions) {
13911 int r = may_delete(in, name, perm);
13912 if (r < 0)
13913 return r;
13914 }
13915 return _unlink(in, name, perm);
13916 }
13917
13918 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13919 {
13920 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13921 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13922
13923 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13924 return -CEPHFS_EROFS;
13925 }
13926
13927 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13928 MetaRequest *req = new MetaRequest(op);
13929 filepath path;
13930 dir->make_nosnap_relative_path(path);
13931 path.push_dentry(name);
13932 req->set_filepath(path);
13933 req->set_inode(dir);
13934
13935 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13936 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13937 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13938
13939 InodeRef in;
13940
13941 Dentry *de;
13942 int res = get_or_create(dir, name, &de);
13943 if (res < 0)
13944 goto fail;
13945 if (op == CEPH_MDS_OP_RMDIR)
13946 req->set_dentry(de);
13947 else
13948 de->get();
13949
13950 res = _lookup(dir, name, 0, &in, perms);
13951 if (res < 0)
13952 goto fail;
13953
13954 if (op == CEPH_MDS_OP_RMSNAP) {
13955 unlink(de, true, true);
13956 de->put();
13957 }
13958 req->set_other_inode(in.get());
13959
13960 res = make_request(req, perms);
13961
13962 trim_cache();
13963 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13964 return res;
13965
13966 fail:
13967 put_request(req);
13968 return res;
13969 }
13970
13971 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13972 {
13973 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13974 if (!mref_reader.is_state_satisfied())
13975 return -CEPHFS_ENOTCONN;
13976
13977 vinodeno_t vino = _get_vino(in);
13978
13979 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13980 tout(cct) << "ll_rmdir" << std::endl;
13981 tout(cct) << vino.ino.val << std::endl;
13982 tout(cct) << name << std::endl;
13983
13984 std::scoped_lock lock(client_lock);
13985
13986 if (!fuse_default_permissions) {
13987 int r = may_delete(in, name, perms);
13988 if (r < 0)
13989 return r;
13990 }
13991
13992 return _rmdir(in, name, perms);
13993 }
13994
/**
 * Rename fromdir/fromname to todir/toname (CEPH_MDS_OP_RENAME), or rename a
 * snapshot (CEPH_MDS_OP_RENAMESNAP) when both directories are the same
 * snapdir.
 *
 * @param fromdir  source directory inode
 * @param fromname source entry name
 * @param todir    destination directory inode
 * @param toname   destination entry name
 * @param perm     caller credentials
 * @param alternate_name alternate dentry name for the destination
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  // cannot rename across snapshots
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // snapshot rename is only allowed within one snapdir
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }
  // when quotas are enabled, renames may not cross quota-root boundaries
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    // source inode: recall delegations, drop link caps with the request
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // destination may or may not exist; ENOENT is fine (plain rename),
    // an existing inode becomes the "other" inode being replaced
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  // make_request() consumes our ref on req
  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

  // only reached before make_request(); drop our request ref
 fail:
  put_request(req);
  return res;
}
14102
14103 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14104 const char *newname, const UserPerm& perm)
14105 {
14106 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14107 if (!mref_reader.is_state_satisfied())
14108 return -CEPHFS_ENOTCONN;
14109
14110 vinodeno_t vparent = _get_vino(parent);
14111 vinodeno_t vnewparent = _get_vino(newparent);
14112
14113 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14114 << vnewparent << " " << newname << dendl;
14115 tout(cct) << "ll_rename" << std::endl;
14116 tout(cct) << vparent.ino.val << std::endl;
14117 tout(cct) << name << std::endl;
14118 tout(cct) << vnewparent.ino.val << std::endl;
14119 tout(cct) << newname << std::endl;
14120
14121 std::scoped_lock lock(client_lock);
14122
14123 if (!fuse_default_permissions) {
14124 int r = may_delete(parent, name, perm);
14125 if (r < 0)
14126 return r;
14127 r = may_delete(newparent, newname, perm);
14128 if (r < 0 && r != -CEPHFS_ENOENT)
14129 return r;
14130 }
14131
14132 return _rename(parent, name, newparent, newname, perm, "");
14133 }
14134
/**
 * Create a hard link `dir`/`newname` to existing inode `in` via
 * CEPH_MDS_OP_LINK.
 *
 * @param in      existing inode to link to
 * @param dir     directory to create the new name in
 * @param newname new link name (<= NAME_MAX)
 * @param perm    caller credentials
 * @param alternate_name alternate dentry name stored alongside `newname`
 * @param inp     [out] resulting inode
 * @return 0 on success, negative CEPHFS_* error code on failure
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // neither end may live in a snapshot
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // recall any delegations on the target before linking
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  // second filepath identifies the existing inode being linked
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  // make_request() consumes our ref on req
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

  // only reached before make_request(); drop our request ref
 fail:
  put_request(req);
  return res;
}
14180
14181 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14182 const UserPerm& perm)
14183 {
14184 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14185 if (!mref_reader.is_state_satisfied())
14186 return -CEPHFS_ENOTCONN;
14187
14188 vinodeno_t vino = _get_vino(in);
14189 vinodeno_t vnewparent = _get_vino(newparent);
14190
14191 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
14192 newname << dendl;
14193 tout(cct) << "ll_link" << std::endl;
14194 tout(cct) << vino.ino.val << std::endl;
14195 tout(cct) << vnewparent << std::endl;
14196 tout(cct) << newname << std::endl;
14197
14198 InodeRef target;
14199
14200 std::scoped_lock lock(client_lock);
14201
14202 if (!fuse_default_permissions) {
14203 if (S_ISDIR(in->mode))
14204 return -CEPHFS_EPERM;
14205
14206 int r = may_hardlink(in, perm);
14207 if (r < 0)
14208 return r;
14209
14210 r = may_create(newparent, perm);
14211 if (r < 0)
14212 return r;
14213 }
14214
14215 return _link(in, newparent, newname, perm, "", &target);
14216 }
14217
14218 int Client::ll_num_osds(void)
14219 {
14220 std::scoped_lock lock(client_lock);
14221 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14222 }
14223
14224 int Client::ll_osdaddr(int osd, uint32_t *addr)
14225 {
14226 std::scoped_lock lock(client_lock);
14227
14228 entity_addr_t g;
14229 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14230 if (!o.exists(osd))
14231 return false;
14232 g = o.get_addrs(osd).front();
14233 return true;
14234 });
14235 if (!exists)
14236 return -1;
14237 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14238 *addr = ntohl(nb_addr);
14239 return 0;
14240 }
14241
14242 uint32_t Client::ll_stripe_unit(Inode *in)
14243 {
14244 std::scoped_lock lock(client_lock);
14245 return in->layout.stripe_unit;
14246 }
14247
14248 uint64_t Client::ll_snap_seq(Inode *in)
14249 {
14250 std::scoped_lock lock(client_lock);
14251 return in->snaprealm->seq;
14252 }
14253
14254 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14255 {
14256 std::scoped_lock lock(client_lock);
14257 *layout = in->layout;
14258 return 0;
14259 }
14260
14261 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14262 {
14263 return ll_file_layout(fh->inode.get(), layout);
14264 }
14265
14266 /* Currently we cannot take advantage of redundancy in reads, since we
14267 would have to go through all possible placement groups (a
14268 potentially quite large number determined by a hash), and use CRUSH
14269 to calculate the appropriate set of OSDs for each placement group,
14270 then index into that. An array with one entry per OSD is much more
14271 tractable and works for demonstration purposes. */
14272
/**
 * Return the primary OSD for the object that holds logical block `blockno`
 * of inode `in` under layout `layout`.
 *
 * NOTE(review): divides by stripes_per_object (object_size / stripe_unit)
 * without a zero check — assumes a sane, already-validated layout; confirm
 * at call sites.
 */
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count;    // which horizontal stripe (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  // map the object to its PG and ask the osdmap for the acting primary
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
        o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
14303
14304 /* Return the offset of the block, internal to the object */
14305
14306 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14307 {
14308 std::scoped_lock lock(client_lock);
14309 file_layout_t *layout=&(in->layout);
14310 uint32_t object_size = layout->object_size;
14311 uint32_t su = layout->stripe_unit;
14312 uint64_t stripes_per_object = object_size / su;
14313
14314 return (blockno % stripes_per_object) * su;
14315 }
14316
14317 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14318 const UserPerm& perms)
14319 {
14320 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14321 if (!mref_reader.is_state_satisfied())
14322 return -CEPHFS_ENOTCONN;
14323
14324 vinodeno_t vino = _get_vino(in);
14325
14326 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14327 tout(cct) << "ll_opendir" << std::endl;
14328 tout(cct) << vino.ino.val << std::endl;
14329
14330 std::scoped_lock lock(client_lock);
14331
14332 if (!fuse_default_permissions) {
14333 int r = may_open(in, flags, perms);
14334 if (r < 0)
14335 return r;
14336 }
14337
14338 int r = _opendir(in, dirpp, perms);
14339 tout(cct) << (uintptr_t)*dirpp << std::endl;
14340
14341 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14342 << dendl;
14343 return r;
14344 }
14345
14346 int Client::ll_releasedir(dir_result_t *dirp)
14347 {
14348 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14349 if (!mref_reader.is_state_satisfied())
14350 return -CEPHFS_ENOTCONN;
14351
14352 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14353 tout(cct) << "ll_releasedir" << std::endl;
14354 tout(cct) << (uintptr_t)dirp << std::endl;
14355
14356 std::scoped_lock lock(client_lock);
14357
14358 _closedir(dirp);
14359 return 0;
14360 }
14361
14362 int Client::ll_fsyncdir(dir_result_t *dirp)
14363 {
14364 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14365 if (!mref_reader.is_state_satisfied())
14366 return -CEPHFS_ENOTCONN;
14367
14368 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14369 tout(cct) << "ll_fsyncdir" << std::endl;
14370 tout(cct) << (uintptr_t)dirp << std::endl;
14371
14372 std::scoped_lock lock(client_lock);
14373 return _fsync(dirp->inode.get(), false);
14374 }
14375
/**
 * Low-level open of an existing inode. O_CREAT is not allowed through this
 * path — creation goes through ll_create()/ll_createx().
 *
 * @param in    inode to open
 * @param flags POSIX open flags (no O_CREAT)
 * @param fhp   [out] optional; receives the open file handle
 * @param perms caller credentials
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  std::scoped_lock lock(client_lock);

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // track the handle so a leaked fh can be cleaned up at unmount
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (uintptr_t)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
14412
/**
 * Shared implementation behind ll_create()/ll_createx(): look the name up,
 * create it if absent (and O_CREAT was given), and open it.
 *
 * @param parent parent directory inode
 * @param name   entry name
 * @param mode   permission bits for a newly created file
 * @param flags  POSIX open flags (O_CREAT/O_EXCL honored here)
 * @param in     [out] resulting inode ref
 * @param caps   caps mask to request from the lookup
 * @param fhp    [out] open file handle; *fhp is reset to NULL on entry
 * @param perms  caller credentials
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL demands that the name not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
        goto out;
    }
    // _create() also opens the file when fhp is non-NULL
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
                perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: check open permission and open it ourselves
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
        if (*fhp) {
          int release_r = _release_fh(*fhp);
          ceph_assert(release_r == 0);  // during create, no async data ops should have happened
        }
        goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
        goto out;
    }
  }

 out:
  // track the handle so a leaked fh can be cleaned up at unmount
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  // resolve the ino to report (faked or real) for the trace/log lines
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14494
14495 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14496 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14497 const UserPerm& perms)
14498 {
14499 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14500 if (!mref_reader.is_state_satisfied())
14501 return -CEPHFS_ENOTCONN;
14502
14503 std::scoped_lock lock(client_lock);
14504 InodeRef in;
14505
14506 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14507 fhp, perms);
14508 if (r >= 0) {
14509 ceph_assert(in);
14510
14511 // passing an Inode in outp requires an additional ref
14512 if (outp) {
14513 _ll_get(in.get());
14514 *outp = in.get();
14515 }
14516 fill_stat(in, attr);
14517 } else {
14518 attr->st_ino = 0;
14519 }
14520
14521 return r;
14522 }
14523
14524 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14525 int oflags, Inode **outp, Fh **fhp,
14526 struct ceph_statx *stx, unsigned want, unsigned lflags,
14527 const UserPerm& perms)
14528 {
14529 unsigned caps = statx_to_mask(lflags, want);
14530 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14531 if (!mref_reader.is_state_satisfied())
14532 return -CEPHFS_ENOTCONN;
14533
14534 std::scoped_lock lock(client_lock);
14535 InodeRef in;
14536
14537 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14538 if (r >= 0) {
14539 ceph_assert(in);
14540
14541 // passing an Inode in outp requires an additional ref
14542 if (outp) {
14543 _ll_get(in.get());
14544 *outp = in.get();
14545 }
14546 fill_statx(in, caps, stx);
14547 } else {
14548 stx->stx_ino = 0;
14549 stx->stx_mask = 0;
14550 }
14551
14552 return r;
14553 }
14554
14555 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14556 {
14557 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14558 if (!mref_reader.is_state_satisfied())
14559 return -CEPHFS_ENOTCONN;
14560
14561 tout(cct) << "ll_lseek" << std::endl;
14562 tout(cct) << offset << std::endl;
14563 tout(cct) << whence << std::endl;
14564
14565 std::scoped_lock lock(client_lock);
14566 return _lseek(fh, offset, whence);
14567 }
14568
14569 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14570 {
14571 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14572 if (!mref_reader.is_state_satisfied())
14573 return -CEPHFS_ENOTCONN;
14574
14575 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14576 tout(cct) << "ll_read" << std::endl;
14577 tout(cct) << (uintptr_t)fh << std::endl;
14578 tout(cct) << off << std::endl;
14579 tout(cct) << len << std::endl;
14580
14581 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14582 len = std::min(len, (loff_t)INT_MAX);
14583 std::scoped_lock lock(client_lock);
14584
14585 int r = _read(fh, off, len, bl);
14586 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14587 << dendl;
14588 return r;
14589 }
14590
14591 int Client::ll_read_block(Inode *in, uint64_t blockid,
14592 char *buf,
14593 uint64_t offset,
14594 uint64_t length,
14595 file_layout_t* layout)
14596 {
14597 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14598 if (!mref_reader.is_state_satisfied())
14599 return -CEPHFS_ENOTCONN;
14600
14601 vinodeno_t vino = _get_vino(in);
14602 object_t oid = file_object_t(vino.ino, blockid);
14603 C_SaferCond onfinish;
14604 bufferlist bl;
14605
14606 objecter->read(oid,
14607 object_locator_t(layout->pool_id),
14608 offset,
14609 length,
14610 vino.snapid,
14611 &bl,
14612 CEPH_OSD_FLAG_READ,
14613 &onfinish);
14614
14615 int r = onfinish.wait();
14616 if (r >= 0) {
14617 bl.begin().copy(bl.length(), buf);
14618 r = bl.length();
14619 }
14620
14621 return r;
14622 }
14623
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

// Write one object ("block") of a file directly through the Objecter,
// bypassing the page cache and capability machinery.
// Returns the length written on success, a negative CEPHFS error otherwise.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Zero-length writes are rejected outright.
  if (length == 0) {
    return -CEPHFS_EINVAL;
  }
  // NOTE(review): the "true ||" makes this unconditionally take the
  // synchronous path regardless of the sync argument — presumably a
  // deliberate safety measure, but worth confirming upstream.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // Build a snap context carrying only the caller-supplied seq.
  fakesnap.seq = snapseq;

  /* lock just in time */
  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Wait for the write to become safe before returning.
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
14681
// Commit previously written blocks for an inode range.
// The barrier implementation is disabled (kept below for reference),
// so this is currently a no-op that always reports success.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
14707
14708 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14709 {
14710 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14711 "~" << len << dendl;
14712 tout(cct) << "ll_write" << std::endl;
14713 tout(cct) << (uintptr_t)fh << std::endl;
14714 tout(cct) << off << std::endl;
14715 tout(cct) << len << std::endl;
14716
14717 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14718 if (!mref_reader.is_state_satisfied())
14719 return -CEPHFS_ENOTCONN;
14720
14721 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14722 len = std::min(len, (loff_t)INT_MAX);
14723 std::scoped_lock lock(client_lock);
14724
14725 int r = _write(fh, off, len, data, NULL, 0);
14726 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14727 << dendl;
14728 return r;
14729 }
14730
14731 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14732 {
14733 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14734 if (!mref_reader.is_state_satisfied())
14735 return -CEPHFS_ENOTCONN;
14736
14737 std::scoped_lock cl(client_lock);
14738 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
14739 }
14740
14741 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14742 {
14743 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14744 if (!mref_reader.is_state_satisfied())
14745 return -CEPHFS_ENOTCONN;
14746
14747 std::scoped_lock cl(client_lock);
14748 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
14749 }
14750
14751 int Client::ll_flush(Fh *fh)
14752 {
14753 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14754 if (!mref_reader.is_state_satisfied())
14755 return -CEPHFS_ENOTCONN;
14756
14757 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14758 tout(cct) << "ll_flush" << std::endl;
14759 tout(cct) << (uintptr_t)fh << std::endl;
14760
14761 std::scoped_lock lock(client_lock);
14762 return _flush(fh);
14763 }
14764
14765 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14766 {
14767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14768 if (!mref_reader.is_state_satisfied())
14769 return -CEPHFS_ENOTCONN;
14770
14771 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14772 tout(cct) << "ll_fsync" << std::endl;
14773 tout(cct) << (uintptr_t)fh << std::endl;
14774
14775 std::scoped_lock lock(client_lock);
14776 int r = _fsync(fh, syncdataonly);
14777 if (r) {
14778 // If we're returning an error, clear it from the FH
14779 fh->take_async_err();
14780 }
14781 return r;
14782 }
14783
14784 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14785 {
14786 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14787 if (!mref_reader.is_state_satisfied())
14788 return -CEPHFS_ENOTCONN;
14789
14790 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14791 tout(cct) << "ll_sync_inode" << std::endl;
14792 tout(cct) << (uintptr_t)in << std::endl;
14793
14794 std::scoped_lock lock(client_lock);
14795 return _fsync(in, syncdataonly);
14796 }
14797
// Core fallocate implementation; caller must hold client_lock.
// Supports only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (the
// latter only together with KEEP_SIZE). Returns 0 or -CEPHFS_* error.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // ---- argument / state validation ----
  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation on a full pool would fail; punching holes frees space so
  // it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // A plain allocate that grows the file must respect byte quotas.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  // Need WR caps; BUFFER is nice-to-have for the inline fast path.
  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
	(have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data + buffer cap: punch the hole locally by splicing
      // zeros into the inline blob (prefix, zeroed middle, suffix).
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  inline_iter.copy(offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len) {
	  inline_iter += size;
	  inline_iter.copy(len - offset - size, bl);
	}
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first (if needed), then zero the range on
      // the OSDs via the filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
	uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Drop the client lock while waiting for the OSD zero to finish.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate: just extend the file size metadata if needed.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
	check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If an uninline was kicked off above, wait for it (lock dropped) and
  // clear the inline state on success (or on ECANCELED: already done).
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
14919
14920 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14921 {
14922 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14923 if (!mref_reader.is_state_satisfied())
14924 return -CEPHFS_ENOTCONN;
14925
14926 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14927 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14928 tout(cct) << (uintptr_t)fh << std::endl;
14929
14930 std::scoped_lock lock(client_lock);
14931 return _fallocate(fh, mode, offset, length);
14932 }
14933
14934 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14935 {
14936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14937 if (!mref_reader.is_state_satisfied())
14938 return -CEPHFS_ENOTCONN;
14939
14940 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
14941
14942 std::scoped_lock lock(client_lock);
14943 Fh *fh = get_filehandle(fd);
14944 if (!fh)
14945 return -CEPHFS_EBADF;
14946 #if defined(__linux__) && defined(O_PATH)
14947 if (fh->flags & O_PATH)
14948 return -CEPHFS_EBADF;
14949 #endif
14950 return _fallocate(fh, mode, offset, length);
14951 }
14952
14953 int Client::ll_release(Fh *fh)
14954 {
14955 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14956 if (!mref_reader.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN;
14958
14959 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
14960 dendl;
14961 tout(cct) << __func__ << " (fh)" << std::endl;
14962 tout(cct) << (uintptr_t)fh << std::endl;
14963
14964 std::scoped_lock lock(client_lock);
14965
14966 if (ll_unclosed_fh_set.count(fh))
14967 ll_unclosed_fh_set.erase(fh);
14968 return _release_fh(fh);
14969 }
14970
14971 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14972 {
14973 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14974 if (!mref_reader.is_state_satisfied())
14975 return -CEPHFS_ENOTCONN;
14976
14977 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
14978 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
14979
14980 std::scoped_lock lock(client_lock);
14981 return _getlk(fh, fl, owner);
14982 }
14983
14984 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14985 {
14986 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14987 if (!mref_reader.is_state_satisfied())
14988 return -CEPHFS_ENOTCONN;
14989
14990 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14991 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14992
14993 std::scoped_lock lock(client_lock);
14994 return _setlk(fh, fl, owner, sleep);
14995 }
14996
14997 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14998 {
14999 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15000 if (!mref_reader.is_state_satisfied())
15001 return -CEPHFS_ENOTCONN;
15002
15003 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
15004 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
15005
15006 std::scoped_lock lock(client_lock);
15007 return _flock(fh, cmd, owner);
15008 }
15009
15010 int Client::set_deleg_timeout(uint32_t timeout)
15011 {
15012 std::scoped_lock lock(client_lock);
15013
15014 /*
15015 * The whole point is to prevent blocklisting so we must time out the
15016 * delegation before the session autoclose timeout kicks in.
15017 */
15018 if (timeout >= mdsmap->get_session_autoclose())
15019 return -CEPHFS_EINVAL;
15020
15021 deleg_timeout = timeout;
15022 return 0;
15023 }
15024
15025 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
15026 {
15027 int ret = -CEPHFS_EINVAL;
15028
15029 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15030 if (!mref_reader.is_state_satisfied())
15031 return -CEPHFS_ENOTCONN;
15032
15033 std::scoped_lock lock(client_lock);
15034
15035 Inode *inode = fh->inode.get();
15036
15037 switch(cmd) {
15038 case CEPH_DELEGATION_NONE:
15039 inode->unset_deleg(fh);
15040 ret = 0;
15041 break;
15042 default:
15043 try {
15044 ret = inode->set_deleg(fh, cmd, cb, priv);
15045 } catch (std::bad_alloc&) {
15046 ret = -CEPHFS_ENOMEM;
15047 }
15048 break;
15049 }
15050 return ret;
15051 }
15052
// Finisher context that interrupts an in-flight SETFILELOCK request.
// Holds a ref on the request from construction until finish() runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request until finish() releases it
  }
  void finish(int r) override {
    // Runs on the finisher thread; must take the client lock itself.
    std::scoped_lock l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
15068
15069 void Client::ll_interrupt(void *d)
15070 {
15071 MetaRequest *req = static_cast<MetaRequest*>(d);
15072 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15073 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
15074 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15075 }
15076
15077 // =========================================
15078 // layout
15079
15080 // expose file layouts
15081
15082 int Client::describe_layout(const char *relpath, file_layout_t *lp,
15083 const UserPerm& perms)
15084 {
15085 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15086 if (!mref_reader.is_state_satisfied())
15087 return -CEPHFS_ENOTCONN;
15088
15089 std::scoped_lock lock(client_lock);
15090
15091 filepath path(relpath);
15092 InodeRef in;
15093 int r = path_walk(path, &in, perms);
15094 if (r < 0)
15095 return r;
15096
15097 *lp = in->layout;
15098
15099 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
15100 return 0;
15101 }
15102
15103 int Client::fdescribe_layout(int fd, file_layout_t *lp)
15104 {
15105 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15106 if (!mref_reader.is_state_satisfied())
15107 return -CEPHFS_ENOTCONN;
15108
15109 std::scoped_lock lock(client_lock);
15110
15111 Fh *f = get_filehandle(fd);
15112 if (!f)
15113 return -CEPHFS_EBADF;
15114 Inode *in = f->inode.get();
15115
15116 *lp = in->layout;
15117
15118 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
15119 return 0;
15120 }
15121
15122 int64_t Client::get_default_pool_id()
15123 {
15124 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15125 if (!mref_reader.is_state_satisfied())
15126 return -CEPHFS_ENOTCONN;
15127
15128 std::scoped_lock lock(client_lock);
15129
15130 /* first data pool is the default */
15131 return mdsmap->get_first_data_pool();
15132 }
15133
15134 // expose osdmap
15135
15136 int64_t Client::get_pool_id(const char *pool_name)
15137 {
15138 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15139 if (!mref_reader.is_state_satisfied())
15140 return -CEPHFS_ENOTCONN;
15141
15142 std::scoped_lock lock(client_lock);
15143
15144 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15145 pool_name);
15146 }
15147
15148 string Client::get_pool_name(int64_t pool)
15149 {
15150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15151 if (!mref_reader.is_state_satisfied())
15152 return string();
15153
15154 std::scoped_lock lock(client_lock);
15155
15156 return objecter->with_osdmap([pool](const OSDMap& o) {
15157 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15158 });
15159 }
15160
15161 int Client::get_pool_replication(int64_t pool)
15162 {
15163 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15164 if (!mref_reader.is_state_satisfied())
15165 return -CEPHFS_ENOTCONN;
15166
15167 std::scoped_lock lock(client_lock);
15168
15169 return objecter->with_osdmap([pool](const OSDMap& o) {
15170 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
15171 });
15172 }
15173
15174 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
15175 {
15176 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15177 if (!mref_reader.is_state_satisfied())
15178 return -CEPHFS_ENOTCONN;
15179
15180 std::scoped_lock lock(client_lock);
15181
15182 Fh *f = get_filehandle(fd);
15183 if (!f)
15184 return -CEPHFS_EBADF;
15185 Inode *in = f->inode.get();
15186
15187 vector<ObjectExtent> extents;
15188 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
15189 ceph_assert(extents.size() == 1);
15190
15191 objecter->with_osdmap([&](const OSDMap& o) {
15192 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15193 o.pg_to_acting_osds(pg, osds);
15194 });
15195
15196 if (osds.empty())
15197 return -CEPHFS_EINVAL;
15198
15199 /*
15200 * Return the remainder of the extent (stripe unit)
15201 *
15202 * If length = 1 is passed to Striper::file_to_extents we get a single
15203 * extent back, but its length is one so we still need to compute the length
15204 * to the end of the stripe unit.
15205 *
15206 * If length = su then we may get 1 or 2 objects back in the extents vector
15207 * which would have to be examined. Even then, the offsets are local to the
15208 * object, so matching up to the file offset is extra work.
15209 *
15210 * It seems simpler to stick with length = 1 and manually compute the
15211 * remainder.
15212 */
15213 if (len) {
15214 uint64_t su = in->layout.stripe_unit;
15215 *len = su - (off % su);
15216 }
15217
15218 return 0;
15219 }
15220
15221 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15222 {
15223 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15224 if (!mref_reader.is_state_satisfied())
15225 return -CEPHFS_ENOTCONN;
15226
15227 std::scoped_lock lock(client_lock);
15228
15229 if (id < 0)
15230 return -CEPHFS_EINVAL;
15231 return objecter->with_osdmap([&](const OSDMap& o) {
15232 return o.crush->get_full_location_ordered(id, path);
15233 });
15234 }
15235
15236 int Client::get_file_stripe_address(int fd, loff_t offset,
15237 vector<entity_addr_t>& address)
15238 {
15239 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15240 if (!mref_reader.is_state_satisfied())
15241 return -CEPHFS_ENOTCONN;
15242
15243 std::scoped_lock lock(client_lock);
15244
15245 Fh *f = get_filehandle(fd);
15246 if (!f)
15247 return -CEPHFS_EBADF;
15248 Inode *in = f->inode.get();
15249
15250 // which object?
15251 vector<ObjectExtent> extents;
15252 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15253 in->truncate_size, extents);
15254 ceph_assert(extents.size() == 1);
15255
15256 // now we have the object and its 'layout'
15257 return objecter->with_osdmap([&](const OSDMap& o) {
15258 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15259 vector<int> osds;
15260 o.pg_to_acting_osds(pg, osds);
15261 if (osds.empty())
15262 return -CEPHFS_EINVAL;
15263 for (unsigned i = 0; i < osds.size(); i++) {
15264 entity_addr_t addr = o.get_addrs(osds[i]).front();
15265 address.push_back(addr);
15266 }
15267 return 0;
15268 });
15269 }
15270
15271 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15272 {
15273 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15274 if (!mref_reader.is_state_satisfied())
15275 return -CEPHFS_ENOTCONN;
15276
15277 std::scoped_lock lock(client_lock);
15278
15279 return objecter->with_osdmap([&](const OSDMap& o) {
15280 if (!o.exists(osd))
15281 return -CEPHFS_ENOENT;
15282
15283 addr = o.get_addrs(osd).front();
15284 return 0;
15285 });
15286 }
15287
15288 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15289 loff_t length, loff_t offset)
15290 {
15291 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15292 if (!mref_reader.is_state_satisfied())
15293 return -CEPHFS_ENOTCONN;
15294
15295 std::scoped_lock lock(client_lock);
15296
15297 Fh *f = get_filehandle(fd);
15298 if (!f)
15299 return -CEPHFS_EBADF;
15300 Inode *in = f->inode.get();
15301
15302 // map to a list of extents
15303 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15304
15305 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15306 return 0;
15307 }
15308
15309
15310 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
15311 int Client::get_local_osd()
15312 {
15313 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15314 if (!mref_reader.is_state_satisfied())
15315 return -CEPHFS_ENOTCONN;
15316
15317 std::scoped_lock lock(client_lock);
15318
15319 objecter->with_osdmap([this](const OSDMap& o) {
15320 if (o.get_epoch() != local_osd_epoch) {
15321 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
15322 local_osd_epoch = o.get_epoch();
15323 }
15324 });
15325 return local_osd;
15326 }
15327
15328
15329
15330
15331
15332
15333 // ===============================
15334
// Messenger callback: a connection was established. Nothing to do here
// beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
15339
// Messenger callback: our side of a connection was reset. We only log;
// returning false presumably leaves the reset to default handling —
// TODO confirm against the Dispatcher contract.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15345
// Messenger callback: the peer reset the connection. For MDS peers we
// locate the matching session and react based on its state.
void Client::ms_handle_remote_reset(Connection *con)
{
  std::scoped_lock lock(client_lock);
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSessionRef s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = p.second;
	}
      }
      if (mds >= 0) {
	ceph_assert(s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were tearing the session down anyway; treat as closed.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s.get());
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Restart the open: close the half-open session, reopen it,
	    // and carry the pending open-waiters over to the new one.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s.get());
	    auto news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blocklisted */
	    // Either close so the reconnect logic rebuilds the session,
	    // or just mark it stale, depending on configuration.
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s.get());
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing to do for sessions that never became active.
	  break;
	}
      }
    }
    break;
  }
}
15404
// Messenger callback: the peer refused our connection attempt. Log only;
// returning false presumably defers to default handling — TODO confirm.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15410
15411 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15412 {
15413 Inode *quota_in = root_ancestor;
15414 SnapRealm *realm = in->snaprealm;
15415
15416 if (!cct->_conf.get_val<bool>("client_quota"))
15417 return NULL;
15418
15419 while (realm) {
15420 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15421 if (realm->ino != in->ino) {
15422 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15423 if (p == inode_map.end())
15424 break;
15425
15426 if (p->second->quota.is_enable()) {
15427 quota_in = p->second;
15428 break;
15429 }
15430 }
15431 realm = realm->pparent;
15432 }
15433 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15434 return quota_in;
15435 }
15436
15437 /**
15438 * Traverse quota ancestors of the Inode, return true
15439 * if any of them passes the passed function
15440 */
15441 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15442 std::function<bool (const Inode &in)> test)
15443 {
15444 if (!cct->_conf.get_val<bool>("client_quota"))
15445 return false;
15446
15447 while (true) {
15448 ceph_assert(in != NULL);
15449 if (test(*in)) {
15450 return true;
15451 }
15452
15453 if (in == root_ancestor) {
15454 // We're done traversing, drop out
15455 return false;
15456 } else {
15457 // Continue up the tree
15458 in = get_quota_root(in, perms);
15459 }
15460 }
15461
15462 return false;
15463 }
15464
15465 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15466 {
15467 return check_quota_condition(in, perms,
15468 [](const Inode &in) {
15469 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15470 });
15471 }
15472
15473 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15474 const UserPerm& perms)
15475 {
15476 return check_quota_condition(in, perms,
15477 [&new_bytes](const Inode &in) {
15478 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15479 > in.quota.max_bytes;
15480 });
15481 }
15482
15483 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15484 {
15485 ceph_assert(in->size >= in->reported_size);
15486 const uint64_t size = in->size - in->reported_size;
15487 return check_quota_condition(in, perms,
15488 [&size](const Inode &in) {
15489 if (in.quota.max_bytes) {
15490 if (in.rstat.rbytes >= in.quota.max_bytes) {
15491 return true;
15492 }
15493
15494 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15495 return (space >> 4) < size;
15496 } else {
15497 return false;
15498 }
15499 });
15500 }
15501
// Cached result bits for per-pool permission checks (see check_pool_perm()).
enum {
  POOL_CHECKED = 1,   // a check for this pool has completed
  POOL_CHECKING = 2,  // a check is in flight; other callers wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
15508
// Verify the client's OSD capabilities allow the access in `need`
// (CEPH_CAP_FILE_RD/WR) on the inode's data pool. Results are cached
// per (pool, namespace) in pool_perms; concurrent checkers wait on
// waiting_for_pool_perm. Caller must hold client_lock.
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-check, wait for it and
  // re-examine once signalled.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the check so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read permission with a stat on the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Probe write permission with a non-exclusive create.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while waiting on the OSD round trips.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // ENOENT still proves read access (object simply absent);
    // EPERM is the expected "no permission" answer.
    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // EEXIST still proves write access (create raced with an existing
    // object).
    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Cache the verdict and wake anyone who was waiting on the check.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
15617
15618 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15619 {
15620 if (acl_type == POSIX_ACL) {
15621 if (in->xattrs.count(ACL_EA_ACCESS)) {
15622 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15623
15624 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15625 }
15626 }
15627 return -CEPHFS_EAGAIN;
15628 }
15629
15630 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15631 {
15632 if (acl_type == NO_ACL)
15633 return 0;
15634
15635 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15636 if (r < 0)
15637 goto out;
15638
15639 if (acl_type == POSIX_ACL) {
15640 if (in->xattrs.count(ACL_EA_ACCESS)) {
15641 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15642 bufferptr acl(access_acl.c_str(), access_acl.length());
15643 r = posix_acl_access_chmod(acl, mode);
15644 if (r < 0)
15645 goto out;
15646 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15647 } else {
15648 r = 0;
15649 }
15650 }
15651 out:
15652 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15653 return r;
15654 }
15655
15656 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15657 const UserPerm& perms)
15658 {
15659 if (acl_type == NO_ACL)
15660 return 0;
15661
15662 if (S_ISLNK(*mode))
15663 return 0;
15664
15665 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15666 if (r < 0)
15667 goto out;
15668
15669 if (acl_type == POSIX_ACL) {
15670 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15671 map<string, bufferptr> xattrs;
15672
15673 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15674 bufferptr acl(default_acl.c_str(), default_acl.length());
15675 r = posix_acl_inherit_mode(acl, mode);
15676 if (r < 0)
15677 goto out;
15678
15679 if (r > 0) {
15680 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15681 if (r < 0)
15682 goto out;
15683 if (r > 0)
15684 xattrs[ACL_EA_ACCESS] = acl;
15685 }
15686
15687 if (S_ISDIR(*mode))
15688 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15689
15690 r = xattrs.size();
15691 if (r > 0)
15692 encode(xattrs, xattrs_bl);
15693 } else {
15694 if (umask_cb)
15695 *mode &= ~umask_cb(callback_handle);
15696 r = 0;
15697 }
15698 }
15699 out:
15700 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15701 return r;
15702 }
15703
15704 void Client::set_filer_flags(int flags)
15705 {
15706 std::scoped_lock l(client_lock);
15707 ceph_assert(flags == 0 ||
15708 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15709 objecter->add_global_op_flags(flags);
15710 }
15711
15712 void Client::clear_filer_flags(int flags)
15713 {
15714 std::scoped_lock l(client_lock);
15715 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15716 objecter->clear_global_op_flag(flags);
15717 }
15718
15719 // called before mount
15720 void Client::set_uuid(const std::string& uuid)
15721 {
15722 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15723 ceph_assert(iref_reader.is_state_satisfied());
15724
15725 std::scoped_lock l(client_lock);
15726 ceph_assert(!uuid.empty());
15727
15728 metadata["uuid"] = uuid;
15729 _close_sessions();
15730 }
15731
15732 // called before mount. 0 means infinite
15733 void Client::set_session_timeout(unsigned timeout)
15734 {
15735 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15736 ceph_assert(iref_reader.is_state_satisfied());
15737
15738 std::scoped_lock l(client_lock);
15739
15740 metadata["timeout"] = stringify(timeout);
15741 }
15742
// called before mount
// Reclaim the MDS sessions of a previous client instance identified by
// 'uuid' (as registered via set_uuid()).  Walks every in-map MDS rank,
// asks each one to reclaim the old session, then — unless only a reset
// was requested — waits for the reclaim OSD epoch and checks the OSD
// blocklist to make sure the old instance cannot still do I/O.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // refuse to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // wait until we have a usable mdsmap before iterating ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // NOTE: 'mds' only advances once reclaim has completed on that rank;
  // the waiting branches re-examine the same rank after being woken.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // handle_client_reclaim_reply() updates reclaim_state and signals us
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // prefer the MDS-reported error if one was recorded
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  // drop client_lock while blocking on the objecter
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  // remembered until finish_reclaim() promotes it to our own uuid
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15843
15844 void Client::finish_reclaim()
15845 {
15846 auto it = metadata.find("reclaiming_uuid");
15847 if (it == metadata.end()) {
15848 for (auto &p : mds_sessions)
15849 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15850 return;
15851 }
15852
15853 for (auto &p : mds_sessions) {
15854 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15855 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15856 p.second->con->send_message2(std::move(m));
15857 }
15858
15859 metadata["uuid"] = it->second;
15860 metadata.erase(it);
15861 }
15862
15863 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15864 {
15865 mds_rank_t from = mds_rank_t(reply->get_source().num());
15866 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15867
15868 std::scoped_lock cl(client_lock);
15869 auto session = _get_mds_session(from, reply->get_connection().get());
15870 if (!session) {
15871 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15872 return;
15873 }
15874
15875 if (reply->get_result() >= 0) {
15876 session->reclaim_state = MetaSession::RECLAIM_OK;
15877 if (reply->get_epoch() > reclaim_osd_epoch)
15878 reclaim_osd_epoch = reply->get_epoch();
15879 if (!reply->get_addrs().empty())
15880 reclaim_target_addrs = reply->get_addrs();
15881 } else {
15882 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15883 reclaim_errno = reply->get_result();
15884 }
15885
15886 signal_cond_list(waiting_for_reclaim);
15887 }
15888
15889 /**
15890 * This is included in cap release messages, to cause
15891 * the MDS to wait until this OSD map epoch. It is necessary
15892 * in corner cases where we cancel RADOS ops, so that
15893 * nobody else tries to do IO to the same objects in
15894 * the same epoch as the cancelled ops.
15895 */
15896 void Client::set_cap_epoch_barrier(epoch_t e)
15897 {
15898 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15899 cap_epoch_barrier = e;
15900 }
15901
15902 const char** Client::get_tracked_conf_keys() const
15903 {
15904 static const char* keys[] = {
15905 "client_cache_size",
15906 "client_cache_mid",
15907 "client_acl_type",
15908 "client_deleg_timeout",
15909 "client_deleg_break_on_open",
15910 "client_oc_size",
15911 "client_oc_max_objects",
15912 "client_oc_max_dirty",
15913 "client_oc_target_dirty",
15914 "client_oc_max_dirty_age",
15915 "client_caps_release_delay",
15916 "client_mount_timeout",
15917 NULL
15918 };
15919 return keys;
15920 }
15921
// md_config_obs_t hook: apply runtime configuration changes.  Only keys
// returned by get_tracked_conf_keys() are delivered in 'changed'.
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::scoped_lock lock(client_lock);

  if (changed.count("client_cache_mid")) {
    // midpoint of the inode LRU's hot/warm split
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // any value other than "posix_acl" disables ACL handling
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  // propagate object-cacher sizing and dirty thresholds
  if (changed.count("client_oc_size")) {
    objectcacher->set_max_size(cct->_conf->client_oc_size);
  }
  if (changed.count("client_oc_max_objects")) {
    objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
  }
  if (changed.count("client_oc_max_dirty")) {
    objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
  }
  if (changed.count("client_oc_target_dirty")) {
    objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
  }
  if (changed.count("client_oc_max_dirty_age")) {
    objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
  }
  if (changed.count("client_collect_and_send_global_metrics")) {
    _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
      "client_collect_and_send_global_metrics");
  }
  if (changed.count("client_caps_release_delay")) {
    caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
      "client_caps_release_delay");
  }
  if (changed.count("client_mount_timeout")) {
    mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
      "client_mount_timeout");
  }
}
15963
// boost::intrusive_ptr hook (found via ADL): take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
15968
// boost::intrusive_ptr hook (found via ADL): drop a reference via the
// owning client's put_inode().
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15973
15974 mds_rank_t Client::_get_random_up_mds() const
15975 {
15976 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15977
15978 std::set<mds_rank_t> up;
15979 mdsmap->get_up_mds_set(up);
15980
15981 if (up.empty())
15982 return MDS_RANK_NONE;
15983 std::set<mds_rank_t>::const_iterator p = up.begin();
15984 for (int n = rand() % up.size(); n; n--)
15985 ++p;
15986 return *p;
15987 }
15988
15989
// A StandaloneClient owns its own Objecter (allocated here, deleted in
// the destructor), unlike a Client constructed with a borrowed one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
15997
StandaloneClient::~StandaloneClient()
{
  // the Objecter was allocated by our constructor, so we delete it
  delete objecter;
  objecter = nullptr;
}
16003
// Initialize a standalone client: bring up the objecter, register the
// message dispatchers, and connect to the monitors.  On success the
// client transitions to CLIENT_INITIALIZED; on monitor failure all
// partially initialized services are torn down again.
// Returns 0 on success or the (negative) monclient init error.
int StandaloneClient::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  // register dispatchers: objecter first, then ourselves
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    // client_lock must be dropped before shutting the services down
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
16040
// Shut down the base client first, then the objecter and monitor client
// that this standalone instance owns/uses.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}