]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #ifndef _WIN32
27 #include <sys/utsname.h>
28 #endif
29 #include <sys/uio.h>
30
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
33
34 #include "common/async/waiter.h"
35
36 #if defined(__FreeBSD__)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
39 #elif !defined(_WIN32)
40 #include <sys/xattr.h>
41 #endif
42
43 #if defined(__linux__)
44 #include <linux/falloc.h>
45 #endif
46
47 #include <sys/statvfs.h>
48
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
52
53 #include "mon/MonClient.h"
54
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
72
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "mds/snap.h"
76 #include "osd/OSDMap.h"
77 #include "osdc/Filer.h"
78
79 #include "common/Cond.h"
80 #include "common/perf_counters.h"
81 #include "common/admin_socket.h"
82 #include "common/errno.h"
83 #include "include/str_list.h"
84
85 #define dout_subsys ceph_subsys_client
86
87 #include "include/lru.h"
88 #include "include/compat.h"
89 #include "include/stringify.h"
90 #include "include/random.h"
91
92 #include "Client.h"
93 #include "Inode.h"
94 #include "Dentry.h"
95 #include "Delegation.h"
96 #include "Dir.h"
97 #include "ClientSnapRealm.h"
98 #include "Fh.h"
99 #include "MetaSession.h"
100 #include "MetaRequest.h"
101 #include "ObjecterWriteback.h"
102 #include "posix_acl.h"
103
104 #include "include/ceph_assert.h"
105 #include "include/stat.h"
106
107 #include "include/cephfs/ceph_ll_client.h"
108
109 #if HAVE_GETGROUPLIST
110 #include <grp.h>
111 #include <pwd.h>
112 #include <unistd.h>
113 #endif
114
115 #undef dout_prefix
116 #define dout_prefix *_dout << "client." << whoami << " "
117
118 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119
120 // FreeBSD fails to define this
121 #ifndef O_DSYNC
122 #define O_DSYNC 0x0
123 #endif
124 // Darwin fails to define this
125 #ifndef O_RSYNC
126 #define O_RSYNC 0x0
127 #endif
128
129 #ifndef O_DIRECT
130 #define O_DIRECT 0x0
131 #endif
132
133 // Windows doesn't define those values. While the Posix compatibilty layer
134 // doesn't support those values, the Windows native functions do provide
135 // similar flags. Special care should be taken if we're going to use those
136 // flags in ceph-dokan. The current values are no-ops, while propagating
137 // them to the rest of the code might cause the Windows functions to reject
138 // them as invalid.
139 #ifndef O_NOFOLLOW
140 #define O_NOFOLLOW 0x0
141 #endif
142
143 #ifndef O_SYNC
144 #define O_SYNC 0x0
145 #endif
146
147 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
148
149 #ifndef S_IXUGO
150 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
151 #endif
152
153 using std::dec;
154 using std::hex;
155 using std::list;
156 using std::oct;
157 using std::pair;
158 using std::string;
159 using std::vector;
160
161 using namespace TOPNSPC::common;
162
163 namespace bs = boost::system;
164 namespace ca = ceph::async;
165
166 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
167 {
168 Client *client = static_cast<Client*>(p);
169 client->flush_set_callback(oset);
170 }
171
172 bool Client::is_reserved_vino(vinodeno_t &vino) {
173 if (MDS_IS_PRIVATE_INO(vino.ino)) {
174 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
175 return true;
176 }
177 return false;
178 }
179
180 // running average and standard deviation -- presented in
181 // Donald Knuth's TAoCP, Volume II.
// Incremental running mean (Knuth, TAoCP Vol. II):
//   avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
// The first sample (count == 1) is the mean itself.
double calc_average(double old_avg, double value, uint64_t count) {
  if (count == 1)
    return value;
  return old_avg + (value - old_avg) / count;
}
192
// Welford-style update of the running sum of squared deviations:
//   S_n = S_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
// A single sample has zero spread.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  if (count == 1)
    return 0.0;
  return old_sq_sum + (value - old_mean) * (value - new_mean);
}
204
205 // -------------
206
207 Client::CommandHook::CommandHook(Client *client) :
208 m_client(client)
209 {
210 }
211
212 int Client::CommandHook::call(
213 std::string_view command,
214 const cmdmap_t& cmdmap,
215 const bufferlist&,
216 Formatter *f,
217 std::ostream& errss,
218 bufferlist& out)
219 {
220 f->open_object_section("result");
221 {
222 std::scoped_lock l{m_client->client_lock};
223 if (command == "mds_requests")
224 m_client->dump_mds_requests(f);
225 else if (command == "mds_sessions") {
226 bool cap_dump = false;
227 cmd_getval(cmdmap, "cap_dump", cap_dump);
228 m_client->dump_mds_sessions(f, cap_dump);
229 } else if (command == "dump_cache")
230 m_client->dump_cache(f);
231 else if (command == "kick_stale_sessions")
232 m_client->_kick_stale_sessions();
233 else if (command == "status")
234 m_client->dump_status(f);
235 else
236 ceph_abort_msg("bad command registered");
237 }
238 f->close_section();
239 return 0;
240 }
241
242
243 // -------------
244
245 int Client::get_fd_inode(int fd, InodeRef *in) {
246 int r = 0;
247 if (fd == CEPHFS_AT_FDCWD) {
248 *in = cwd;
249 } else {
250 Fh *f = get_filehandle(fd);
251 if (!f) {
252 r = -CEPHFS_EBADF;
253 } else {
254 *in = f->inode;
255 }
256 }
257 return r;
258 }
259
// Construct a directory read handle for readdir on 'in', capturing the
// caller's credentials for later permission checks.  next_offset starts
// at 2 — offsets 0 and 1 appear reserved for the "." / ".." entries
// (NOTE(review): inferred from the offset scheme; confirm in Client.h).
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
265
266 void Client::_reset_faked_inos()
267 {
268 ino_t start = 1024;
269 free_faked_inos.clear();
270 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
271 last_used_faked_ino = 0;
272 last_used_faked_root = 0;
273 #ifdef _WIN32
274 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
275 // Windows structures, including Dokan ones, are using 64B identifiers.
276 _use_faked_inos = false;
277 #else
278 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
279 #endif
280 }
281
// Allocate the next free faked inode number for 'in', scanning the
// free_faked_inos interval set upward from the last number handed out
// and wrapping around when the top of the range is reached.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  // Find the first free interval at or above last_used_faked_ino + 1.
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Ran off the top of the range: wrap and rescan from just above the
    // window reserved for faked roots.
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // If this fires, every faked ino is in use.
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // There is a gap before the next free interval: jump to its start.
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Still inside the current free interval: take the next number.
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Mark the number as used and record the faked -> real vino mapping so
  // _map_faked_ino() can translate it back.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
303
304 /*
305 * In the faked mode, if you export multiple subdirectories,
306 * you will see that the inode numbers of the exported subdirectories
307 * are the same. so we distinguish the mount point by reserving
308 * the "fake ids" between "1024~2048" and combining the last
309 * 10bits(0x3ff) of the "root inodes".
310 */
311 void Client::_assign_faked_root(Inode *in)
312 {
313 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
314 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
315 last_used_faked_root = 0;
316 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
317 }
318 ceph_assert(it != free_faked_inos.end());
319 vinodeno_t inode_info = in->vino();
320 uint64_t inode_num = (uint64_t)inode_info.ino;
321 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
322 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
323 ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);
324
325 in->faked_ino = last_used_faked_root;
326 free_faked_inos.erase(in->faked_ino);
327 faked_ino_map[in->faked_ino] = in->vino();
328 }
329
330 void Client::_release_faked_ino(Inode *in)
331 {
332 free_faked_inos.insert(in->faked_ino);
333 faked_ino_map.erase(in->faked_ino);
334 }
335
336 vinodeno_t Client::_map_faked_ino(ino_t ino)
337 {
338 vinodeno_t vino;
339 if (ino == 1)
340 vino = root->vino();
341 else if (faked_ino_map.count(ino))
342 vino = faked_ino_map[ino];
343 else
344 vino = vinodeno_t(0, CEPH_NOSNAP);
345 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
346 return vino;
347 }
348
349 vinodeno_t Client::map_faked_ino(ino_t ino)
350 {
351 std::scoped_lock lock(client_lock);
352 return _map_faked_ino(ino);
353 }
354
355 // cons/des
356
/**
 * Construct a Client on top of an existing Messenger, MonClient and
 * Objecter (none of which it owns, though it does take a ref on the
 * CephContext via cct_deleter).  Sets up config-derived knobs, the
 * faked-ino allocator, the fd allocator and the object cacher; no
 * network activity happens until init()/mount().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // Snapshot config values that are consulted on hot paths.
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  // fds below 10 are reserved (never allocated to callers).
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
415
416
/**
 * Destructor.  Normally shutdown()/unmount() has already run; this is
 * the safety net for the case where the process aborted before getting
 * the chance.  Stops the upkeep (tick) thread first, then tears down
 * the metadata cache under client_lock.
 */
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or aborted and didn't get any chance to run
  // umount/shutdown, stop the tick thread here.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  // Join outside client_lock so the upkeep thread can take it to exit.
  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that it's lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
438
/**
 * Force-release every open file handle and directory handle, then drain
 * the dentry LRU and drop the root inode, asserting that nothing is
 * left pinned in inode_map.  Caller must hold client_lock (see ~Client).
 */
void Client::tear_down_cache()
{
  // fd's: release each handle; the map itself is cleared afterwards.
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // _closedir removes the dir_result_t from opened_dirs, so keep taking
  // the first element until the set is empty.
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  // Only root (plus any root_parents pinned while mounting subtrees)
  // should remain in inode_map at this point.
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
469
470 inodeno_t Client::get_root_ino()
471 {
472 std::scoped_lock l(client_lock);
473 if (use_faked_inos())
474 return root->faked_ino;
475 else
476 return root->ino;
477 }
478
479 Inode *Client::get_root()
480 {
481 std::scoped_lock l(client_lock);
482 root->ll_get();
483 return root.get();
484 }
485
486
487 // debug crapola
488
489 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
490 {
491 filepath path;
492 in->make_long_path(path);
493 ldout(cct, 1) << "dump_inode: "
494 << (disconnected ? "DISCONNECTED ":"")
495 << "inode " << in->ino
496 << " " << path
497 << " ref " << in->get_nref()
498 << " " << *in << dendl;
499
500 if (f) {
501 f->open_object_section("inode");
502 f->dump_stream("path") << path;
503 if (disconnected)
504 f->dump_int("disconnected", 1);
505 in->dump(f);
506 f->close_section();
507 }
508
509 did.insert(in);
510 if (in->dir) {
511 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
512 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
513 it != in->dir->dentries.end();
514 ++it) {
515 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
516 if (f) {
517 f->open_object_section("dentry");
518 it->second->dump(f);
519 f->close_section();
520 }
521 if (it->second->inode)
522 dump_inode(f, it->second->inode.get(), did, false);
523 }
524 }
525 }
526
527 void Client::dump_cache(Formatter *f)
528 {
529 set<Inode*> did;
530
531 ldout(cct, 1) << __func__ << dendl;
532
533 if (f)
534 f->open_array_section("cache");
535
536 if (root)
537 dump_inode(f, root.get(), did, true);
538
539 // make a second pass to catch anything disconnected
540 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
541 it != inode_map.end();
542 ++it) {
543 if (did.count(it->second))
544 continue;
545 dump_inode(f, it->second, did, true);
546 }
547
548 if (f)
549 f->close_section();
550 }
551
/**
 * Emit the client's overall status (identity, addresses, cache counts,
 * map epochs, blocklist state) through the Formatter.  Backs the
 * "status" admin-socket command; caller must hold client_lock.
 */
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // Session metadata as sent to the MDS (hostname, mount point, ...).
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
583
// First stage of initialization, shared by init paths: start the timer,
// the objecter finisher (which the Filer depends on), and the object
// cacher's flusher thread.
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}
593
/**
 * Public one-shot initialization: transitions initialize_state
 * NEW -> INITIALIZING -> INITIALIZED.  The RWRef writer guard ensures
 * only the first caller performs the work.  Returns 0.
 */
int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    // Start receiving messages only after _pre_init() set everything up.
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
608
609 void Client::_finish_init()
610 {
611 {
612 std::scoped_lock l{client_lock};
613 // logger
614 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
615 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
616 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
617 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
618 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
619 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
620 // average, standard deviation mds/r/w/ latencies
621 plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
622 plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
623 plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
624 plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
625 plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
626 plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
627 plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
628 plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
629 plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
630 logger.reset(plb.create_perf_counters());
631 cct->get_perfcounters_collection()->add(logger.get());
632 }
633
634 cct->_conf.add_observer(this);
635
636 AdminSocket* admin_socket = cct->get_admin_socket();
637 int ret = admin_socket->register_command("mds_requests",
638 &m_command_hook,
639 "show in-progress mds requests");
640 if (ret < 0) {
641 lderr(cct) << "error registering admin socket command: "
642 << cpp_strerror(-ret) << dendl;
643 }
644 ret = admin_socket->register_command("mds_sessions "
645 "name=cap_dump,type=CephBool,req=false",
646 &m_command_hook,
647 "show mds session state");
648 if (ret < 0) {
649 lderr(cct) << "error registering admin socket command: "
650 << cpp_strerror(-ret) << dendl;
651 }
652 ret = admin_socket->register_command("dump_cache",
653 &m_command_hook,
654 "show in-memory metadata cache contents");
655 if (ret < 0) {
656 lderr(cct) << "error registering admin socket command: "
657 << cpp_strerror(-ret) << dendl;
658 }
659 ret = admin_socket->register_command("kick_stale_sessions",
660 &m_command_hook,
661 "kick sessions that were remote reset");
662 if (ret < 0) {
663 lderr(cct) << "error registering admin socket command: "
664 << cpp_strerror(-ret) << dendl;
665 }
666 ret = admin_socket->register_command("status",
667 &m_command_hook,
668 "show overall client status");
669 if (ret < 0) {
670 lderr(cct) << "error registering admin socket command: "
671 << cpp_strerror(-ret) << dendl;
672 }
673 }
674
/**
 * Orderly teardown of the client: stop the tick thread, close MDS
 * sessions, unregister observers/commands, drain every callback
 * finisher, stop the object cacher, then flip initialize_state back to
 * CLIENT_NEW (blocking new readers) and wait for in-flight ones before
 * shutting down the timer, objecter finisher and perf counters.
 */
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // To make sure the tick thread will be stoppped before
    // destructing the Client, just in case like the _mount()
    // failed but didn't not get a chance to stop the tick
    // thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher is drained (wait_for_empty) before stopping so queued
  // callbacks run to completion rather than being dropped.
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shuting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
754
755 void Client::update_io_stat_metadata(utime_t latency) {
756 auto lat_nsec = latency.to_nsec();
757 // old values are used to compute new ones
758 auto o_avg = logger->tget(l_c_md_avg).to_nsec();
759 auto o_sqsum = logger->get(l_c_md_sqsum);
760
761 auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
762 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
763 nr_metadata_request);
764
765 logger->tinc(l_c_lat, latency);
766 logger->tinc(l_c_reply, latency);
767
768 utime_t avg;
769 avg.set_from_double(n_avg / 1000000000);
770 logger->tset(l_c_md_avg, avg);
771 logger->set(l_c_md_sqsum, n_sqsum);
772 logger->set(l_c_md_ops, nr_metadata_request);
773 }
774
775 void Client::update_io_stat_read(utime_t latency) {
776 auto lat_nsec = latency.to_nsec();
777 // old values are used to compute new ones
778 auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
779 auto o_sqsum = logger->get(l_c_rd_sqsum);
780
781 auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
782 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
783 nr_read_request);
784
785 logger->tinc(l_c_read, latency);
786
787 utime_t avg;
788 avg.set_from_double(n_avg / 1000000000);
789 logger->tset(l_c_rd_avg, avg);
790 logger->set(l_c_rd_sqsum, n_sqsum);
791 logger->set(l_c_rd_ops, nr_read_request);
792 }
793
794 void Client::update_io_stat_write(utime_t latency) {
795 auto lat_nsec = latency.to_nsec();
796 // old values are used to compute new ones
797 auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
798 auto o_sqsum = logger->get(l_c_wr_sqsum);
799
800 auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
801 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
802 nr_write_request);
803
804 logger->tinc(l_c_wrlat, latency);
805
806 utime_t avg;
807 avg.set_from_double(n_avg / 1000000000);
808 logger->tset(l_c_wr_avg, avg);
809 logger->set(l_c_wr_sqsum, n_sqsum);
810 logger->set(l_c_wr_ops, nr_write_request);
811 }
812
813 // ===================
814 // metadata cache stuff
815
/**
 * Trim the dentry LRU down to client_cache_size (or completely, when
 * unmounting).  Optionally asks the kernel to drop its dcache too, and
 * releases the root inode if nothing else references it.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Fixed-point loop: stop once a full pass fails to shrink the LRU
  // (lru_get_next_expire can decline to give up pinned entries).
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // When unmounting we trim everything, not just down to 'max'.
    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
843
844 void Client::trim_cache_for_reconnect(MetaSession *s)
845 {
846 mds_rank_t mds = s->mds_num;
847 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
848
849 int trimmed = 0;
850 list<Dentry*> skipped;
851 while (lru.lru_get_size() > 0) {
852 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
853 if (!dn)
854 break;
855
856 if ((dn->inode && dn->inode->caps.count(mds)) ||
857 dn->dir->parent_inode->caps.count(mds)) {
858 trim_dentry(dn);
859 trimmed++;
860 } else
861 skipped.push_back(dn);
862 }
863
864 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
865 lru.lru_insert_mid(*p);
866
867 ldout(cct, 20) << __func__ << " mds." << mds
868 << " trimmed " << trimmed << " dentries" << dendl;
869
870 if (s->caps.size() > 0)
871 _invalidate_kernel_dcache();
872 }
873
// Evict one dentry from the cache: mark its parent directory no longer
// complete/ordered (since we are forgetting one of its entries), then
// unlink it from both the dir and the inode.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
886
887
/**
 * Apply MDS-reported file size / truncation state to an inode.
 *
 * truncate_seq orders competing size updates: a strictly newer seq (or
 * the same seq with a larger size) wins.  On a seq bump the cached file
 * data past the new size and any inline data are invalidated/trimmed.
 * truncate_size is applied separately for any seq >= ours.
 */
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, size, prior_size - size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      // truncate_size is only meaningful for regular files.
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
929
930 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
931 utime_t ctime, utime_t mtime, utime_t atime)
932 {
933 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
934 << " ctime " << ctime << " mtime " << mtime << dendl;
935
936 if (time_warp_seq > in->time_warp_seq)
937 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
938 << " is higher than local time_warp_seq "
939 << in->time_warp_seq << dendl;
940
941 int warn = false;
942 // be careful with size, mtime, atime
943 if (issued & (CEPH_CAP_FILE_EXCL|
944 CEPH_CAP_FILE_WR|
945 CEPH_CAP_FILE_BUFFER|
946 CEPH_CAP_AUTH_EXCL|
947 CEPH_CAP_XATTR_EXCL)) {
948 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
949 if (ctime > in->ctime)
950 in->ctime = ctime;
951 if (time_warp_seq > in->time_warp_seq) {
952 //the mds updated times, so take those!
953 in->mtime = mtime;
954 in->atime = atime;
955 in->time_warp_seq = time_warp_seq;
956 } else if (time_warp_seq == in->time_warp_seq) {
957 //take max times
958 if (mtime > in->mtime)
959 in->mtime = mtime;
960 if (atime > in->atime)
961 in->atime = atime;
962 } else if (issued & CEPH_CAP_FILE_EXCL) {
963 //ignore mds values as we have a higher seq
964 } else warn = true;
965 } else {
966 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
967 if (time_warp_seq >= in->time_warp_seq) {
968 in->ctime = ctime;
969 in->mtime = mtime;
970 in->atime = atime;
971 in->time_warp_seq = time_warp_seq;
972 } else warn = true;
973 }
974 if (warn) {
975 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
976 << time_warp_seq << " is lower than local time_warp_seq "
977 << in->time_warp_seq
978 << dendl;
979 }
980 }
981
982 void Client::_fragmap_remove_non_leaves(Inode *in)
983 {
984 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
985 if (!in->dirfragtree.is_leaf(p->first))
986 in->fragmap.erase(p++);
987 else
988 ++p;
989 }
990
991 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
992 {
993 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
994 if (p->second == mds)
995 in->fragmap.erase(p++);
996 else
997 ++p;
998 }
999
/*
 * add_update_inode - create or refresh a cached Inode from an MDS InodeStat.
 *
 * Fields are overwritten only when the incoming info is newer than what we
 * hold (version check) or when the matching SHARED cap was newly issued,
 * and never while we hold the corresponding EXCL cap (our local state is
 * then more authoritative).  Also installs/updates the cap carried in
 * st->cap for this session.
 *
 * st:            decoded inode attributes from the MDS reply
 * from:          time the originating request was sent (lease/cap base time)
 * session:       MDS session the reply arrived on
 * request_perms: credentials of the triggering request
 *
 * Returns the cached (possibly newly allocated) Inode.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    // already cached
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever cached becomes root (and the initial cwd)
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      // while mounting we may discover ancestors above the current root;
      // keep the chain so the true root can be resolved
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold (issued or dirty); fields covered by an EXCL
  // cap below are locally authoritative and must not be clobbered
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  bool need_snapdir_attr_refresh = false;
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
    in->fscrypt_auth = st->fscrypt_auth;
    need_snapdir_attr_refresh = true;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    need_snapdir_attr_refresh = true;
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    in->fscrypt_file = st->fscrypt_file;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
    need_snapdir_attr_refresh = true;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  ldout(cct, 12) << __func__ << " client inode change_attr: " << in->change_attr << " , mds inodestat change_attr: " << st->change_attr << dendl;
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    // max_size/rstat only trusted when the reply came via the auth cap's session
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes carry no live caps; just accumulate the bits
    in->snap_caps |= st->cap.caps;
  }

  // keep a cached snapdir's attributes in sync with its live parent
  if (need_snapdir_attr_refresh && in->is_dir() && in->snapid == CEPH_NOSNAP) {
    vinodeno_t vino(in->ino, CEPH_SNAPDIR);
    if (inode_map.count(vino)) {
      refresh_snapdir_attrs(inode_map[vino], in);
    }
  }

  return in;
}
1170
1171
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * Looks up dname in dir; if an existing dentry points at a different inode
 * it is unlinked first.  old_dentry, when supplied (rename), is unlinked
 * and any rename waiters are signaled once the new link is in place.
 * Finishes by refreshing the dentry lease from dlease.
 *
 * Returns the (re)linked Dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // dentry already points at the right inode; just refresh its LRU position
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale link to a different inode; drop it before relinking below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link dance so 'in' cannot be freed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories: source dir ordering is now stale
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);

    if (old_dentry) {
      // rename complete; wake anyone blocked on the in-flight rename
      dn->is_renaming = false;
      signal_cond_list(waiting_for_rename);
    }
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1223
1224 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1225 {
1226 utime_t dttl = from;
1227 dttl += (float)dlease->duration_ms / 1000.0;
1228
1229 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
1230
1231 ceph_assert(dn);
1232
1233 if (dlease->mask & CEPH_LEASE_VALID) {
1234 if (dttl > dn->lease_ttl) {
1235 ldout(cct, 10) << "got dentry lease on " << dn->name
1236 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1237 dn->lease_ttl = dttl;
1238 dn->lease_mds = session->mds_num;
1239 dn->lease_seq = dlease->seq;
1240 dn->lease_gen = session->cap_gen;
1241 }
1242 }
1243 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1244 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1245 dn->mark_primary();
1246 dn->alternate_name = std::move(dlease->alternate_name);
1247 }
1248
1249
1250 /*
1251 * update MDS location cache for a single inode
1252 */
1253 void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
1254 {
1255 // auth
1256 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1257 if (dst->auth >= 0) {
1258 in->fragmap[dst->frag] = dst->auth;
1259 } else {
1260 in->fragmap.erase(dst->frag);
1261 }
1262 if (!in->dirfragtree.is_leaf(dst->frag)) {
1263 in->dirfragtree.force_to_leaf(cct, dst->frag);
1264 _fragmap_remove_non_leaves(in);
1265 }
1266
1267 // replicated, only update from auth mds reply
1268 if (from == dst->auth) {
1269 in->dir_replicated = !dst->dist.empty();
1270 if (!dst->dist.empty())
1271 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1272 else
1273 in->frag_repmap.erase(dst->frag);
1274 }
1275 }
1276
1277 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1278 {
1279 if (complete)
1280 diri->dir_release_count++;
1281 else
1282 diri->dir_ordered_count++;
1283 if (diri->flags & I_COMPLETE) {
1284 if (complete) {
1285 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1286 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1287 } else {
1288 if (diri->flags & I_DIR_ORDERED) {
1289 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1290 diri->flags &= ~I_DIR_ORDERED;
1291 }
1292 }
1293 if (diri->dir)
1294 diri->dir->readdir_cache.clear();
1295 }
1296 }
1297
/*
 * insert results from readdir, lssnap or readdir_snapdiff into the
 * metadata cache, filling both the per-fd dir_result_t buffer and
 * (when still valid) the shared readdir cache on the Dir.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session,
				    Inode *diri, Inode *diri_other) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    // new-style reply encoding implies all feature bits for decoding
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir, lssnap and
  // readdir_snapdiff replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }
    // snapdiff carries its frag/offset in a different args union member
    bool snapdiff_req = request->head.op == CEPH_MDS_OP_READDIR_SNAPDIFF;
    frag_t fg;
    unsigned offset_hash;
    if (snapdiff_req) {
      fg = (unsigned)request->head.args.snapdiff.frag;
      offset_hash = (unsigned)request->head.args.snapdiff.offset_hash;
    } else {
      fg = (unsigned)request->head.args.readdir.frag;
      offset_hash = (unsigned)request->head.args.readdir.offset_hash;
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);
    //open opponent dir for snapdiff if any
    Dir *dir_other = nullptr;
    if (snapdiff_req) {
      ceph_assert(diri_other);
      dir_other = diri_other->open_dir();
      ceph_assert(dir_other);
    }

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // offset 2 is the first real entry; an empty last_name implies we
    // are at the start of the fragment (see assert)
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = offset_hash;
      }
    }

    if (fg != dst.frag) {
      // the frag split/merged since we sent the request; adopt the frag
      // the mds actually answered for and restart it
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // a listing starting from the very beginning may (re)prime the
    // shared readdir cache; snapshot the dir's change counters so we
    // can tell later whether the cache stayed valid
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      // for snapdiff, entries belonging to the other snapshot are linked
      // under the other directory
      auto *effective_dir = dir;
      auto *effective_diri = diri;

      if (snapdiff_req && in->snapid != diri->snapid) {
	ceph_assert(diri_other);
	ceph_assert(dir_other);
	effective_diri = diri_other;
	effective_dir = dir_other;
      }
      Dentry *dn;
      if (effective_dir->dentries.count(dname)) {
	Dentry *olddn = effective_dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(effective_dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(effective_dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// in hash order, offsets restart at 2 whenever the hash changes
	unsigned hash = ceph_frag_value(effective_diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (!snapdiff_req &&
	  dirp->release_count == effective_diri->dir_release_count &&
	  dirp->ordered_count == effective_diri->dir_ordered_count &&
	  dirp->start_shared_gen == effective_diri->shared_gen) {
	if (dirp->cache_index == effective_dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  effective_dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < effective_dir->readdir_cache.size()) {
	  // a complete+ordered dir's cache must already agree with us
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(effective_dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    effective_dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    // drop Dir structs we didn't end up populating
    if (dir->is_empty())
      close_dir(dir);
    if (dir_other && dir_other->is_empty())
      close_dir(dir_other);
  }
}
1490
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A trace may carry a directory inode, a dentry and a target inode;
 * whatever is present is folded into the local cache.  Readdir-style
 * ops additionally hand their extra payload to insert_readdir_results().
 * Returns the target inode, or NULL for traceless / duplicate-unsafe
 * replies.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the unsafe reply was already applied; the safe reply has no trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we can only invalidate what we may have cached
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    // new-style reply encoding implies all feature bits for decoding
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(session, reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug-mode sanity check: if we asked for xattrs, the mds must
      // have included them in the reply
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target: drop any stale link, and keep a negative
      // dentry around only if we were granted a lease on it
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    // no lease for the synthesized dentry
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request,
			     session,
			     in,
			     nullptr);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    } else if (op == CEPH_MDS_OP_READDIR_SNAPDIFF) {
      // provide both request's inode (aka snapA) and traced one (snapB)
      // to properly match snapdiff results
      insert_readdir_results(request,
			     session,
			     request->inode(),
			     in);
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1675
1676 // -------
1677
/*
 * choose_target_mds - pick the mds rank a request should be sent to.
 *
 * Preference order: an explicit resend_mds (forwarded request), a random
 * mds if configured, the mds owning the dirfrag the dentry name hashes
 * into (or one of its replicas when auth is not required), the inode's
 * auth cap session, any cap session, and finally a random up mds.
 * *phash_diri is set when the choice came from the fragmap so the caller
 * can prune that entry if the mds turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;
  int issued = 0;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    // a forward pinned the target; honor it exactly once
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to find the dirfrag it lives in
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash its name within the parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes hold no caps; climb to the nearest non-snap parent
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << __func__ << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (req->get_op() == CEPH_MDS_OP_GETATTR)
      issued = req->inode()->caps_issued();

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best(issued)) {
	// auth not required: spread load by picking a random replica
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	req->send_to_auth = true;
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to whichever session we hold caps on
    if (in->auth_cap && req->auth_is_best(issued)) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1792
1793 void Client::connect_mds_targets(mds_rank_t mds)
1794 {
1795 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1796 ceph_assert(mds_sessions.count(mds));
1797 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1798 for (const auto &rank : info.export_targets) {
1799 if (mds_sessions.count(rank) == 0 &&
1800 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
1801 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1802 << " export target mds." << rank << dendl;
1803
1804 auto session = _get_or_open_mds_session(rank);
1805 if (session->state == MetaSession::STATE_OPENING ||
1806 session->state == MetaSession::STATE_OPEN)
1807 continue;
1808
1809 _open_mds_session(rank);
1810 }
1811 }
1812 }
1813
1814 void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
1815 {
1816 f->dump_int("id", get_nodeid().v);
1817 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1818 f->dump_object("inst", inst);
1819 f->dump_stream("inst_str") << inst;
1820 f->dump_stream("addr_str") << inst.addr;
1821 f->open_array_section("sessions");
1822 for (const auto &p : mds_sessions) {
1823 f->open_object_section("session");
1824 p.second->dump(f, cap_dump);
1825 f->close_section();
1826 }
1827 f->close_section();
1828 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1829 }
1830
1831 void Client::dump_mds_requests(Formatter *f)
1832 {
1833 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1834 p != mds_requests.end();
1835 ++p) {
1836 f->open_object_section("request");
1837 p->second->dump(f);
1838 f->close_section();
1839 }
1840 }
1841
/*
 * verify_reply_trace - resolve the target inode of a (possibly traceless)
 * mutation reply.
 *
 * Extracts the created ino from the reply's extra payload (a bare u64, or
 * an openc_response_t when the session supports delegated inos) and sets
 * *pcreated.  If the reply carried no trace, falls back to a lookup (by
 * dentry name) or getattr (by inode) to find the target, cross-checking
 * it against the created ino.  Returns r, possibly downgraded to
 * -CEPHFS_EINTR when the created ino and the looked-up target disagree.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the trace already resolved and pinned the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // no trace target; try the created ino in the local inode cache first
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry either; refresh the request's inode directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -CEPHFS_EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1926
1927
1928 /**
1929 * make a request
1930 *
1931 * Blocking helper to make an MDS request.
1932 *
1933 * If the ptarget flag is set, behavior changes slightly: the caller
1934 * expects to get a pointer to the inode we are creating or operating
1935 * on. As a result, we will follow up any traceless mutation reply
1936 * with a getattr or lookup to transparently handle a traceless reply
1937 * from the MDS (as when the MDS restarts and the client has to replay
1938 * a request).
1939 *
1940 * @param request the MetaRequest to execute
1941 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1942 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1943 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1944 * @param use_mds [optional] prefer a specific mds (-1 for default)
1945 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1946 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl,
			 size_t feature_needed)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();
  request->created = ceph::coarse_mono_clock::now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock can block indefinitely, so don't let it hold back the
  // oldest_tid watermark reported to the mds
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, make sure its session is open, send, and
  // wait; loop again on forward/kick until a reply or abort
  MetaSessionRef session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists in the map; forget the stale
	// fragmap hint (or pick a random mds) and retry immediately
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-CEPHFS_EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // refuse up front if the mds can't handle this op at all
    if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) {
      request->abort(-CEPHFS_EOPNOTSUPP);
      break;
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // adopt the already-held client_lock so the condvar wait can drop
    // and reacquire it; release() afterwards hands ownership back to
    // the caller without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted: drop our bookkeeping and report the abort code
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // resolve the target inode (handles traceless replies) if requested
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;

  ++nr_metadata_request;
  update_io_stat_metadata(lat);

  put_request(request);
  return r;
}
2100
2101 void Client::unregister_request(MetaRequest *req)
2102 {
2103 mds_requests.erase(req->tid);
2104 if (req->tid == oldest_tid) {
2105 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2106 while (true) {
2107 if (p == mds_requests.end()) {
2108 oldest_tid = 0;
2109 break;
2110 }
2111 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2112 oldest_tid = p->first;
2113 break;
2114 }
2115 ++p;
2116 }
2117 }
2118 put_request(req);
2119 }
2120
2121 void Client::put_request(MetaRequest *request)
2122 {
2123 if (request->_put()) {
2124 int op = -1;
2125 if (request->success)
2126 op = request->get_op();
2127 InodeRef other_in;
2128 request->take_other_inode(&other_in);
2129 delete request;
2130
2131 if (other_in &&
2132 (op == CEPH_MDS_OP_RMDIR ||
2133 op == CEPH_MDS_OP_RENAME ||
2134 op == CEPH_MDS_OP_RMSNAP)) {
2135 _try_to_trim_inode(other_in.get(), false);
2136 }
2137 }
2138 }
2139
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  // Try to encode a cap release for 'in' into 'req', destined for 'mds'.
  // Caps in 'drop' are released unless any cap in 'unless' is currently
  // issued, and never if they are dirty or in use.  Returns nonzero when
  // a release record was appended to req->cap_releases (which also
  // happens when 'force' is set even though nothing was dropped).
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or currently in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      // Nothing droppable; still emit a release record if forced.
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      // If this is the auth cap and we no longer want any file write
      // caps, stop asking the MDS to grow max_size.
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // Fill in the wire-format release record and queue it on the
      // request; dname fields stay zero here and are populated later by
      // encode_dentry_release() for dentry releases.
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
2185
2186 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2187 mds_rank_t mds, int drop, int unless)
2188 {
2189 ldout(cct, 20) << __func__ << " enter(dn:"
2190 << dn << ")" << dendl;
2191 int released = 0;
2192 if (dn->dir)
2193 released = encode_inode_release(dn->dir->parent_inode, req,
2194 mds, drop, unless, 1);
2195 if (released && dn->lease_mds == mds) {
2196 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2197 auto& rel = req->cap_releases.back();
2198 rel.item.dname_len = dn->name.length();
2199 rel.item.dname_seq = dn->lease_seq;
2200 rel.dname = dn->name;
2201 dn->lease_mds = -1;
2202 }
2203 ldout(cct, 25) << __func__ << " exit(dn:"
2204 << dn << ")" << dendl;
2205 }
2206
2207
/*
 * This requires the MetaRequest's inode/dentry members to be set.
 * It will error out horribly without them.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
2214 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2215 {
2216 ldout(cct, 20) << __func__ << " enter (req: "
2217 << req << ", mds: " << mds << ")" << dendl;
2218 if (req->inode_drop && req->inode())
2219 encode_inode_release(req->inode(), req,
2220 mds, req->inode_drop,
2221 req->inode_unless);
2222
2223 if (req->old_inode_drop && req->old_inode())
2224 encode_inode_release(req->old_inode(), req,
2225 mds, req->old_inode_drop,
2226 req->old_inode_unless);
2227 if (req->other_inode_drop && req->other_inode())
2228 encode_inode_release(req->other_inode(), req,
2229 mds, req->other_inode_drop,
2230 req->other_inode_unless);
2231
2232 if (req->dentry_drop && req->dentry())
2233 encode_dentry_release(req->dentry(), req,
2234 mds, req->dentry_drop,
2235 req->dentry_unless);
2236
2237 if (req->old_dentry_drop && req->old_dentry())
2238 encode_dentry_release(req->old_dentry(), req,
2239 mds, req->old_dentry_drop,
2240 req->old_dentry_unless);
2241 ldout(cct, 25) << __func__ << " exit (req: "
2242 << req << ", mds " << mds <<dendl;
2243 }
2244
2245 bool Client::have_open_session(mds_rank_t mds)
2246 {
2247 const auto &it = mds_sessions.find(mds);
2248 return it != mds_sessions.end() &&
2249 (it->second->state == MetaSession::STATE_OPEN ||
2250 it->second->state == MetaSession::STATE_STALE);
2251 }
2252
2253 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2254 {
2255 const auto &it = mds_sessions.find(mds);
2256 if (it == mds_sessions.end() || it->second->con != con) {
2257 return NULL;
2258 } else {
2259 return it->second;
2260 }
2261 }
2262
2263 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2264 {
2265 auto it = mds_sessions.find(mds);
2266 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2267 }
2268
2269 /**
2270 * Populate a map of strings with client-identifying metadata,
2271 * such as the hostname. Call this once at initialization.
2272 */
2273 void Client::populate_metadata(const std::string &mount_root)
2274 {
2275 // Hostname
2276 #ifdef _WIN32
2277 // TODO: move this to compat.h
2278 char hostname[64];
2279 DWORD hostname_sz = 64;
2280 GetComputerNameA(hostname, &hostname_sz);
2281 metadata["hostname"] = hostname;
2282 #else
2283 struct utsname u;
2284 int r = uname(&u);
2285 if (r >= 0) {
2286 metadata["hostname"] = u.nodename;
2287 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2288 } else {
2289 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2290 }
2291 #endif
2292
2293 metadata["pid"] = stringify(getpid());
2294
2295 // Ceph entity id (the '0' in "client.0")
2296 metadata["entity_id"] = cct->_conf->name.get_id();
2297
2298 // Our mount position
2299 if (!mount_root.empty()) {
2300 metadata["root"] = mount_root;
2301 }
2302
2303 // Ceph version
2304 metadata["ceph_version"] = pretty_version_to_str();
2305 metadata["ceph_sha1"] = git_version_to_str();
2306
2307 // Apply any metadata from the user's configured overrides
2308 std::vector<std::string> tokens;
2309 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2310 for (const auto &i : tokens) {
2311 auto eqpos = i.find("=");
2312 // Throw out anything that isn't of the form "<str>=<str>"
2313 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2314 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2315 continue;
2316 }
2317 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2318 }
2319 }
2320
2321 /**
2322 * Optionally add or override client metadata fields.
2323 */
2324 void Client::update_metadata(std::string const &k, std::string const &v)
2325 {
2326 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2327 ceph_assert(iref_reader.is_state_satisfied());
2328
2329 std::scoped_lock l(client_lock);
2330
2331 auto it = metadata.find(k);
2332 if (it != metadata.end()) {
2333 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2334 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2335 }
2336
2337 metadata[k] = v;
2338 }
2339
2340 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2341 {
2342 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2343 auto addrs = mdsmap->get_addrs(mds);
2344 auto em = mds_sessions.emplace(std::piecewise_construct,
2345 std::forward_as_tuple(mds),
2346 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2347 ceph_assert(em.second); /* not already present */
2348 auto session = em.first->second;
2349
2350 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2351 m->metadata = metadata;
2352 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2353 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2354 session->con->send_message2(std::move(m));
2355 return session;
2356 }
2357
2358 void Client::_close_mds_session(MetaSession *s)
2359 {
2360 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2361 s->state = MetaSession::STATE_CLOSING;
2362 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2363 }
2364
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  // Finalize a session the MDS has closed or rejected: record the final
  // state, drop the connection, wake waiters, release the session's
  // caps and kick its outstanding requests.
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // A rejection only overrides the state if we were not already in the
  // middle of a voluntary close.
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // CLOSED sessions are forgotten entirely; REJECTED ones stay in
  // mds_sessions so the rejection can be observed.  Erasing may drop
  // the last reference to *s, so this must remain the final statement.
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2381
2382 static void reinit_mds_features(MetaSession *session,
2383 const MConstRef<MClientSession>& m) {
2384 session->mds_features = std::move(m->supported_features);
2385 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
2386 }
2387
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  // Dispatch a session-control message from an MDS (open/close/renew/
  // stale/recall/flush/force-ro/reject) to the matching MetaSession.
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  // Ignore messages whose connection no longer matches our session.
  auto session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      if (session->state == MetaSession::STATE_OPEN) {
        ldout(cct, 10) << "mds." << from << " already opened, ignore it"
                       << dendl;
        // The MDS could send a client_session(open) message even when
        // the session state is STATE_OPEN. Normally, its fine to
        // ignore this message, but, if the MDS sent this message just
        // after it got upgraded, the MDS feature bits could differ
        // than the one before the upgrade - so, refresh the feature
        // bits the client holds.
        reinit_mds_features(session.get(), m);
        return;
      }
      /*
       * The connection maybe broken and the session in client side
       * has been reinitialized, need to update the seq anyway.
       */
      if (!session->seq && m->get_seq())
        session->seq = m->get_seq();

      reinit_mds_features(session.get(), m);

      renew_caps(session.get());
      session->state = MetaSession::STATE_OPEN;
      if (is_unmounting())
        mount_cond.notify_all();
      else
        connect_mds_targets(from);
      // Wake anyone in make_request() waiting for this session to open.
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session.get());
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the ack that matches our most recent renew request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      // If the session had lapsed, wake cap waiters now that it's live.
      if (was_stale)
        wake_up_session_caps(session.get(), false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // Expire the cap TTL immediately (now - 1) and try to renew.
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session.get());
    break;

  case CEPH_SESSION_RECALL_STATE:
    /*
     * Call the renew caps and flush cap releases just before
     * triming the caps in case the tick() won't get a chance
     * to run them, which could cause the client to be blocklisted
     * and MDS daemons trying to recall the caps again and
     * again.
     *
     * In most cases it will do nothing, and the new cap releases
     * added by trim_caps() followed will be deferred flushing
     * by tick().
     */
    renew_and_flush_cap_releases();
    trim_caps(session.get(), m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NOTE: the inner 'm' shadows the incoming message; it is the
    // queued cap-release message pending on this session.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session.get());
    break;

  case CEPH_SESSION_REJECT:
    {
      // Surface the MDS-supplied reason if one was included.
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2502
2503 bool Client::_any_stale_sessions() const
2504 {
2505 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2506
2507 for (const auto &p : mds_sessions) {
2508 if (p.second->state == MetaSession::STATE_STALE) {
2509 return true;
2510 }
2511 }
2512
2513 return false;
2514 }
2515
2516 void Client::_kick_stale_sessions()
2517 {
2518 ldout(cct, 1) << __func__ << dendl;
2519
2520 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2521 auto s = it->second;
2522 if (s->state == MetaSession::STATE_REJECTED) {
2523 mds_sessions.erase(it->first);
2524 continue;
2525 }
2526 if (s->state == MetaSession::STATE_STALE)
2527 _closed_mds_session(s.get());
2528 }
2529 }
2530
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // Build a wire MClientRequest from this MetaRequest and ship it to
  // the session's MDS, recording bookkeeping (sent_stamp, mds,
  // sent_on_mseq) consulted when the reply or a forward arrives.
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request, mds);
  // build_client_request() returns null when it aborted the request
  // (retry-count overflow); nothing to send in that case.
  if (!r)
    return;

  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // We already got an unsafe reply: this is a replay of an op the MDS
    // applied but never committed.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may change the file layout; pin the osdmap epoch so the
    // MDS can validate the referenced pools.
    objecter->with_osdmap([r](const OSDMap& o) {
      r->set_osdmap_epoch(o.get_epoch());
    });
  }

  // Only stamp the first send (mds == -1 means never sent); resends
  // keep the original timestamp so latency is measured end-to-end.
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap migration seq in effect when we sent — presumably
  // used to detect replies that raced with cap migration (TODO confirm
  // against the reply path).
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2582
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_rank_t mds)
{
  // Translate a MetaRequest into a wire MClientRequest for the given
  // MDS.  Returns nullptr after aborting the request when its retry
  // count would overflow what the target MDS can represent.
  auto session = mds_sessions.at(mds);
  bool old_version = !session->mds_features.test(CEPHFS_FEATURE_32BITS_RETRY_FWD);

  /*
   * Avoid inifinite retrying after overflow.
   *
   * The client will increase the retry count and if the MDS is
   * old version, so we limit to retry at most 256 times.
   */
  if (request->retry_attempt) {
    // Capacity of the legacy num_retry field: sizeof is 1 byte, so
    // 1 << 8 == 256 distinct values.
    int old_max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry);
    old_max_retry = 1 << (old_max_retry * CHAR_BIT);
    // NOTE(review): "(uint32_t)x >= UINT32_MAX" is only true when the
    // cast value equals UINT32_MAX exactly; it guards the 32-bit
    // wrap-around case for new-version MDSes.
    if ((old_version && request->retry_attempt >= old_max_retry) ||
        (uint32_t)request->retry_attempt >= UINT32_MAX) {
      request->abort(-CEPHFS_EMULTIHOP);
      request->caller_cond->notify_all();
      ldout(cct, 1) << __func__ << " request tid " << request->tid
                    << " retry seq overflow" << ", abort it" << dendl;
      return nullptr;
    }
  }

  auto req = make_message<MClientRequest>(request->get_op(), session->mds_features);
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  // Derive the path from the inode, the dentry's inode, or the dentry's
  // parent directory plus the dentry name, in that order of preference.
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or appropriately-endowed dentry given!"
                         << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or dentry given!"
                         << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_alternate_name(request->alternate_name);
  req->set_data(request->data);
  req->fscrypt_auth = request->fscrypt_auth;
  req->fscrypt_file = request->fscrypt_file;
  // Stamp this attempt's retry number, then bump it for the next resend.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.ext_num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2645
2646
2647
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  // An MDS forwarded our request to another rank: validate the fwd
  // count, retarget the pending request, and wake its caller so it
  // resends to the new MDS.
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());

  std::scoped_lock cl(client_lock);
  // Ignore forwards from connections that no longer match our session.
  auto session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  /*
   * Avoid inifinite retrying after overflow.
   *
   * The MDS will increase the fwd count and in client side
   * if the num_fwd is less than the one saved in request
   * that means the MDS is an old version and overflowed of
   * 8 bits.
   */
  auto num_fwd = fwd->get_num_fwd();
  if (num_fwd <= request->num_fwd || (uint32_t)num_fwd >= UINT32_MAX) {
    request->abort(-CEPHFS_EMULTIHOP);
    request->caller_cond->notify_all();
    ldout(cct, 0) << __func__ << " request tid " << tid << " new num_fwd "
      << num_fwd << " old num_fwd " << request->num_fwd << ", fwd seq overflow"
      << ", abort it" << dendl;
    return;
  }

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  // Detach the request from the old session and point it at the new
  // rank; the caller blocked in make_request() notices resend_mds >= 0
  // when woken and resends there.
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = num_fwd;
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->notify_all();
}
2702
2703 bool Client::is_dir_operation(MetaRequest *req)
2704 {
2705 int op = req->get_op();
2706 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2707 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2708 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2709 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2710 return true;
2711 return false;
2712 }
2713
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  // Attach an MDS reply to its pending MetaRequest, hand it off to the
  // caller blocked in make_request(), and clean up once the safe
  // (committed) reply has arrived.
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  // correct sessions ?
  if (request->mds != mds_num) {
    ldout(cct, 0) << "got a stale reply from mds." << mds_num
                  << " instead of mds." << request->mds << dendl;
    return;
  }

  // A second unsafe reply for the same tid is a duplicate; drop it.
  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session.get());

  // Handle unsafe reply: link the request everywhere we may need to
  // wait for the safe commit later (session, parent dir, target inode).
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    // Block (adopting the already-held client_lock for the wait) until
    // the caller has consumed the reply and cleared dispatch_cond.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    // Keep holding client_lock; the scoped_lock above still owns it.
    l.release();
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (is_unmounting())
    mount_cond.notify_all();
}
2807
2808 void Client::_handle_full_flag(int64_t pool)
2809 {
2810 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2811 << "on " << pool << dendl;
2812 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2813 // to do this rather than blocking, because otherwise when we fill up we
2814 // potentially lock caps forever on files with dirty pages, and we need
2815 // to be able to release those caps to the MDS so that it can delete files
2816 // and free up space.
2817 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2818
2819 // For all inodes with layouts in this pool and a pending flush write op
2820 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2821 // from ObjectCacher so that it doesn't re-issue the write in response to
2822 // the ENOSPC error.
2823 // Fortunately since we're cancelling everything in a given pool, we don't
2824 // need to know which ops belong to which ObjectSet, we can just blow all
2825 // the un-flushed cached data away and mark any dirty inodes' async_err
2826 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2827 // affecting this pool, and all the objectsets we're purging were also
2828 // in this pool.
2829 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2830 i != inode_map.end(); ++i)
2831 {
2832 Inode *inode = i->second;
2833 if (inode->oset.dirty_or_tx
2834 && (pool == -1 || inode->layout.pool_id == pool)) {
2835 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2836 << " has dirty objects, purging and setting ENOSPC" << dendl;
2837 objectcacher->purge_set(&inode->oset);
2838 inode->set_async_err(-CEPHFS_ENOSPC);
2839 }
2840 }
2841
2842 if (cancelled_epoch != (epoch_t)-1) {
2843 set_cap_epoch_barrier(cancelled_epoch);
2844 }
2845 }
2846
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  // React to a new OSD map: detect (and recover from) blocklisting of
  // this client, and cancel writes that can no longer succeed because
  // the cluster or individual pools are full.
  std::scoped_lock cl(client_lock);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blocklist = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.is_blocklisted(myaddrs);
    });

  // Transition into the blocklisted state: fail MDS sessions and all
  // pending OSD writes.
  if (new_blocklist && !blocklisted) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
    blocklisted = true;

    _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);

  }

  if (blocklisted) {
    // Handle case where we were blocklisted but no longer are
    blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
      return o.is_blocklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blocklisted client
  // until this client is not blocklisted.
  if (blocklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool (pool == -1).
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
      for (const auto& kv : o.get_pools()) {
        if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
          full_pools.push_back(kv.first);
        }
      }
    });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2913
2914
2915 // ------------------------
2916 // incoming messages
2917
2918
bool Client::ms_dispatch2(const MessageRef &m)
{
  // Messenger entry point: route each incoming message type to its
  // handler.  Returns false for message types this dispatcher does not
  // consume, so the messenger can offer them to other dispatchers.
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied()) {
    // Not (or no longer) initialized: consume and drop everything.
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Command replies are only ours when they come from an MDS;
    // otherwise let another dispatcher handle them.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, every dispatched message doubles as a chance to
  // trim the cache; poke unmount() whenever the trim made progress.
  std::scoped_lock cl(client_lock);
  if (is_unmounting()) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    uint64_t size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
3001
3002 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
3003 {
3004 std::scoped_lock cl(client_lock);
3005 fsmap.reset(new FSMap(m->get_fsmap()));
3006
3007 signal_cond_list(waiting_for_fsmap);
3008
3009 monclient->sub_got("fsmap", fsmap->get_epoch());
3010 }
3011
3012 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
3013 {
3014 std::scoped_lock cl(client_lock);
3015 fsmap_user.reset(new FSMapUser);
3016 *fsmap_user = m->get_fsmap();
3017
3018 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
3019 signal_cond_list(waiting_for_fsmap);
3020 }
3021
3022 // Cancel all the commands for missing or laggy GIDs
void Client::cancel_commands(const MDSMap& newmap)
{
  // Collect the tids to cancel first, then erase them afterwards, so
  // command_table is never mutated while being iterated.
  std::vector<ceph_tid_t> cancel_ops;

  std::scoped_lock cmd_lock(command_lock);
  auto &commands = command_table.get_commands();
  for (const auto &[tid, op] : commands) {
    const mds_gid_t op_mds_gid = op.mds_gid;
    // Cancel if the MDS is gone from the new map or marked laggy.
    if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
      cancel_ops.push_back(tid);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      /*
       * No need to make the con->mark_down under
       * client_lock here, because the con will
       * has its own lock.
       */
      op.con->mark_down();
      // Report the cancellation to anyone waiting on the command.
      if (op.on_finish)
        op.on_finish->complete(-CEPHFS_ETIMEDOUT);
    }
  }

  for (const auto &tid : cancel_ops)
    command_table.erase(tid);
}
3053
/*
 * Handle a new MDSMap epoch: decode it, cancel commands aimed at MDSs
 * that disappeared, then walk our sessions and react to each rank's
 * state change (reconnect, replay/active transitions, closing, ...).
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // Drop client_lock while decoding and cancelling commands:
  // cancel_commands() takes command_lock and must not nest inside
  // client_lock.
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After this swap, _mdsmap holds the OLD map and mdsmap the new one.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSessionRef session = p->second;
    // Advance before the body runs: _closed_mds_session() may erase the
    // current entry.
    ++p;

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        // A new incarnation is effectively a brand-new daemon; forget
        // the old state so the transition logic below treats it as such.
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session.get());
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session.get());
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session.get());
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session.get());
          kick_flushing_caps(session.get());
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session.get(), true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session.get());
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
3131
/*
 * Send an MClientReconnect to an MDS in reconnect state, describing
 * every cap and snaprealm we hold from it so the recovering MDS can
 * rebuild its session state.  With the MULTI_RECONNECT feature the
 * payload may be split across several messages.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // Newer MDSs let us split an oversized reconnect into multiple messages.
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // Flush the current message and start a fresh one before it grows
      // past what a single message can safely carry.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        // stale gen: we can only safely claim the PIN cap
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3225
3226
3227 void Client::kick_requests(MetaSession *session)
3228 {
3229 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3230 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3231 p != mds_requests.end();
3232 ++p) {
3233 MetaRequest *req = p->second;
3234 if (req->got_unsafe)
3235 continue;
3236 if (req->aborted()) {
3237 if (req->caller_cond) {
3238 req->kick = true;
3239 req->caller_cond->notify_all();
3240 }
3241 continue;
3242 }
3243 if (req->retry_attempt > 0)
3244 continue; // new requests only
3245 if (req->mds == session->mds_num) {
3246 send_request(p->second, session);
3247 }
3248 }
3249 }
3250
3251 void Client::resend_unsafe_requests(MetaSession *session)
3252 {
3253 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3254 !iter.end();
3255 ++iter)
3256 send_request(*iter, session);
3257
3258 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3259 // process completed requests in clientreplay stage.
3260 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3261 p != mds_requests.end();
3262 ++p) {
3263 MetaRequest *req = p->second;
3264 if (req->got_unsafe)
3265 continue;
3266 if (req->aborted())
3267 continue;
3268 if (req->retry_attempt == 0)
3269 continue; // old requests only
3270 if (req->mds == session->mds_num)
3271 send_request(req, session, true);
3272 }
3273 }
3274
3275 void Client::wait_unsafe_requests()
3276 {
3277 list<MetaRequest*> last_unsafe_reqs;
3278 for (const auto &p : mds_sessions) {
3279 const auto s = p.second;
3280 if (!s->unsafe_requests.empty()) {
3281 MetaRequest *req = s->unsafe_requests.back();
3282 req->get();
3283 last_unsafe_reqs.push_back(req);
3284 }
3285 }
3286
3287 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3288 p != last_unsafe_reqs.end();
3289 ++p) {
3290 MetaRequest *req = *p;
3291 if (req->unsafe_item.is_on_list())
3292 wait_on_list(req->waitfor_safe);
3293 put_request(req);
3294 }
3295 }
3296
/*
 * A session is being torn down: abort every pending request targeted at
 * its MDS.  Blocked callers are woken (via req->kick); unsafe requests
 * are dropped with EIO recorded on the affected inodes, since we can no
 * longer know whether the MDS durably applied them.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    // Advance before the body: unregister_request() may erase this entry.
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        if (is_dir_operation(req)) {
          Inode *dir = req->inode();
          ceph_assert(dir);
          // Record the loss so a later fsync()/close() on the directory
          // reports it.
          dir->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     << dir->ino << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        if (req->target) {
          InodeRef &in = req->target;
          in->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     << in->ino << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
3336
3337
3338
3339
3340 /************
3341 * leases
3342 */
3343
3344 void Client::got_mds_push(MetaSession *s)
3345 {
3346 s->seq++;
3347 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3348 if (s->state == MetaSession::STATE_CLOSING) {
3349 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3350 }
3351 }
3352
/*
 * Handle a dentry-lease revocation pushed by an MDS: invalidate the
 * local lease (if we still know the inode/dentry) and always send a
 * RELEASE acknowledgment back.
 */
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // The MDS only ever pushes revocations at clients.
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // Invalidate: this dentry can no longer be trusted without a round
    // trip to the MDS.
    dn->lease_mds = -1;
  }

 revoke:
  // Always ack, even when we no longer know the inode or dentry.
  {
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3396
/*
 * Drop @n references from @in.  If afterwards only the inode_map's
 * reference remains, tear the inode down: release caps, evict cached
 * objects, remove it from the maps and drop the final reference.
 */
void Client::_put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  int left = in->get_nref();
  // The caller must hold at least n refs, plus the inode_map's one.
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // All dirty data must have been flushed by now.
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (root == nullptr) {
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    // Drop the final (inode_map) reference; this frees the inode.
    in->iput();
  }
}
3425
3426 void Client::delay_put_inodes(bool wakeup)
3427 {
3428 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3429
3430 std::map<Inode*,int> release;
3431 {
3432 std::scoped_lock dl(delay_i_lock);
3433 release.swap(delay_i_release);
3434 }
3435
3436 if (release.empty())
3437 return;
3438
3439 for (auto &[in, cnt] : release)
3440 _put_inode(in, cnt);
3441
3442 if (wakeup)
3443 mount_cond.notify_all();
3444 }
3445
/*
 * Queue @n reference drops for @in.  The actual _put_inode() happens
 * later in delay_put_inodes() under client_lock; deferring here avoids
 * lock-ordering problems at call sites that cannot take client_lock.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;

  std::scoped_lock dl(delay_i_lock);
  delay_i_release[in] += n;
}
3453
/*
 * Destroy an empty Dir object and unpin the dentry and inode that were
 * keeping it alive.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in); // unpin inode
}
3468
/**
 * Attach a dentry for @name under @dir, optionally linking inode @in.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    if(in) {
      ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << *in
                     << " dn " << *dn << " (new dn)" << dendl;
    } else {
      ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' "
                     << " dn " << *dn << " (new dn)" << dendl;
    }
  } else {
    // re-using a pre-created (negative) dentry
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << *dn << " (old dn)" << dendl;
  }

  if (in) { // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true); // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3514
/*
 * Detach dentry @dn from its inode and, unless @keepdentry, from its
 * directory too.  @keepdir prevents closing the Dir even when this was
 * its last entry.
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Hold a ref so the inode survives until we're done logging/cleanup.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // Keep the (now negative) dentry, but its lease is meaningless.
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3545
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pins the inode until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // Record the error so a later fsync()/close() can report it.
      inode->set_async_err(r);
    }
  }
};
3567
3568
3569 /****
3570 * caps
3571 */
3572
3573 void Client::get_cap_ref(Inode *in, int cap)
3574 {
3575 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3576 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3577 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3578 in->iget();
3579 }
3580 if ((cap & CEPH_CAP_FILE_CACHE) &&
3581 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3582 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3583 in->iget();
3584 }
3585 in->get_cap_ref(cap);
3586 }
3587
/*
 * Drop cap references taken via get_cap_ref().  When the last reference
 * of a kind is dropped this may finish a pending cap_snap, flush snap
 * data, release the extra inode pins held for BUFFER/CACHE, and ask the
 * MDS to adjust our caps via check_caps().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // Caps whose last ref just dropped and which are no longer issued.
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // All buffered data has been written out; cap_snaps no longer
        // have dirty buffered data to wait for.
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;

        if (!in->cap_snaps.empty()) {
          flush_snaps(in);
        }
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3625
// get caps for a given file handle -- the inode should have @need caps
// issued by the mds and @want caps not revoked (or not under revocation).
// this routine blocks till the cap requirement is satisfied. also account
// (track) for capability hit when required (when cap requirement succeedes).
//
// Returns 0 with *phave set on success, or a negative CEPHFS_* error.
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  // Loop until we can take the needed refs or hit a hard error; each
  // iteration may block on waitfor_caps/waitfor_commit.
  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // A stale generation means the fd was invalidated (e.g. by remount).
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // Ask the MDS for a larger max_size before we need it.
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // Success: nothing we want is currently being revoked.
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) &&
        ((in->auth_cap && in->auth_cap->session->readonly) ||
         // userland clients are only allowed to read if fscrypt enabled
         in->is_fscrypt_enabled()))
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        // The MDS forgot what we want; re-request and retry.
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3736
3737 int Client::get_caps_used(Inode *in)
3738 {
3739 unsigned used = in->caps_used();
3740 if (!(used & CEPH_CAP_FILE_CACHE) &&
3741 !objectcacher->set_is_empty(&in->oset))
3742 used |= CEPH_CAP_FILE_CACHE;
3743 return used;
3744 }
3745
3746 void Client::cap_delay_requeue(Inode *in)
3747 {
3748 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3749
3750 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
3751 delayed_list.push_back(&in->delay_cap_item);
3752 }
3753
/*
 * Build and send an MClientCaps UPDATE for one cap.
 *
 * @flags     MClientCaps::FLAG_* bits to set on the message
 * @used      caps currently in active use
 * @want      caps we would like to keep
 * @retain    caps we are willing to keep holding (the rest are released)
 * @flush     dirty caps being flushed, if any
 * @flush_tid tid identifying this flush for later ack matching
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  // Never retain something that is being revoked.
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  // Attribute the flush to whoever dirtied the caps, not to us.
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  m->fscrypt_auth = in->fscrypt_auth;
  m->fscrypt_file = in->fscrypt_file;

  // Tell the MDS a capsnap is still pending flush for this inode.
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3873
3874 static bool is_max_size_approaching(Inode *in)
3875 {
3876 /* mds will adjust max size according to the reported size */
3877 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3878 return false;
3879 if (in->size >= in->max_size)
3880 return true;
3881 /* half of previous max_size increment has been used */
3882 if (in->max_size > in->reported_size &&
3883 (in->size << 1) >= in->max_size + in->reported_size)
3884 return true;
3885 return false;
3886 }
3887
3888 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3889 {
3890 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3891 return used;
3892 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3893 return used;
3894
3895 if (issued & CEPH_CAP_FILE_LAZYIO) {
3896 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3897 used &= ~CEPH_CAP_FILE_CACHE;
3898 used |= CEPH_CAP_FILE_LAZYIO;
3899 }
3900 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3901 used &= ~CEPH_CAP_FILE_BUFFER;
3902 used |= CEPH_CAP_FILE_LAZYIO;
3903 }
3904 } else {
3905 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3906 used &= ~CEPH_CAP_FILE_CACHE;
3907 used |= CEPH_CAP_FILE_LAZYIO;
3908 }
3909 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3910 used &= ~CEPH_CAP_FILE_BUFFER;
3911 used |= CEPH_CAP_FILE_LAZYIO;
3912 }
3913 }
3914 return used;
3915 }
3916
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_* bits)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Work out which caps we'd like to keep holding.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // If a revoked Fc/LAZYIO is only pinned by clean cached data, release
  // the cached objects now so the revocation can be acknowledged.
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  for (auto &[mds, cap] : in->caps) {
    auto session = mds_sessions.at(mds);

    cap_used = used;
    // Caps covered by the auth MDS don't count as "used" toward replicas.
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap.issued)
             << " implemented " << ccap_string(cap.implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // Need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    // Batch the release unless the caller demanded an immediate check.
    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session.get());
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    in->delay_cap_item.remove_myself();
    send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
4061
4062
// Capture a snapshot of this inode's state before the snap context changes.
// If there are dirty caps or the file is open for write, record a CapSnap
// keyed by the old snap context's seq, to be flushed to the MDS later via
// flush_snaps(). No-op if a pending (still-writing) cap_snap already exists.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  // only one in-progress (writing) cap_snap at a time; the newest is at rbegin()
  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (dirty || (used & CEPH_CAP_FILE_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = dirty;

    // remember whether buffered dirty data must be flushed for this snap
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata that belongs to the old snap context
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // still open for write: size/times may keep changing, finalize later
      // in finish_cap_snap()
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
4102
// Finalize a pending CapSnap once writes have stopped: record the final
// size/time/change attrs, then either trigger a flush of dirty buffered
// data (completion re-enters via C_Client_FlushComplete) or flush the
// snap caps immediately.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  // pick up anything dirtied since queue_cap_snap()
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, trigger to flush dirty buffer" << dendl;

    /* trigger to flush the buffer */
    _flush(in, new C_Client_FlushComplete(this, in));
  } else {
    // no buffered data: the capsnap can be flushed to the MDS right away
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
4136
4137 void Client::send_flush_snap(Inode *in, MetaSession *session,
4138 snapid_t follows, CapSnap& capsnap)
4139 {
4140 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4141 in->ino, in->snaprealm->ino, 0,
4142 in->auth_cap->mseq, cap_epoch_barrier);
4143 m->caller_uid = capsnap.cap_dirtier_uid;
4144 m->caller_gid = capsnap.cap_dirtier_gid;
4145
4146 m->set_client_tid(capsnap.flush_tid);
4147 m->head.snap_follows = follows;
4148
4149 m->head.caps = capsnap.issued;
4150 m->head.dirty = capsnap.dirty;
4151
4152 m->head.uid = capsnap.uid;
4153 m->head.gid = capsnap.gid;
4154 m->head.mode = capsnap.mode;
4155 m->btime = capsnap.btime;
4156
4157 m->size = capsnap.size;
4158
4159 m->head.xattr_version = capsnap.xattr_version;
4160 encode(capsnap.xattrs, m->xattrbl);
4161
4162 m->ctime = capsnap.ctime;
4163 m->btime = capsnap.btime;
4164 m->mtime = capsnap.mtime;
4165 m->atime = capsnap.atime;
4166 m->time_warp_seq = capsnap.time_warp_seq;
4167 m->change_attr = capsnap.change_attr;
4168
4169 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4170 m->inline_version = in->inline_version;
4171 m->inline_data = in->inline_data;
4172 }
4173
4174 ceph_assert(!session->flushing_caps_tids.empty());
4175 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4176
4177 session->con->send_message2(std::move(m));
4178 }
4179
// Flush all not-yet-flushed cap_snaps on this inode to the auth MDS, in
// snap order. Stops at the first capsnap that is still accumulating data.
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // snaps must be flushed in order; stop at the first one that is still
    // writing or has unflushed buffered data
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // allocate a flush tid and track it on both the session and the inode
    // (value 0 in flushing_cap_tids marks a capsnap flush, see
    // kick_flushing_caps)
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
4214
// Park the calling thread on a private condition variable registered in
// `ls` until signal_cond_list() wakes it. client_lock must be held on
// entry; the adopt_lock/release() pair lets the condvar release and
// reacquire client_lock without this function taking over its ownership.
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();  // leave client_lock held on return
  ls.remove(&cond);
}
4224
4225 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4226 {
4227 for (auto cond : ls) {
4228 cond->notify_all();
4229 }
4230 }
4231
// Like wait_on_list(), but registers a C_Cond Context in `ls`; the waiter
// wakes when another path complete()s that context (e.g. via
// signal_context_list()). client_lock must be held on entry and is still
// held on return.
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // completion result written by C_Cond; not examined here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();  // keep client_lock held on return
}
4242
4243 void Client::signal_context_list(list<Context*>& ls)
4244 {
4245 while (!ls.empty()) {
4246 ls.front()->complete(0);
4247 ls.pop_front();
4248 }
4249 }
4250
// Wake cap waiters for every inode with a cap on session s. On reconnect,
// the per-inode max_size negotiation state is reset; otherwise caps whose
// gen is stale (not re-issued by the MDS) are downgraded to PIN and the
// inode flagged so the MDS is re-told what we want.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
4270
4271
4272 // flush dirty data (from objectcache)
4273
// Deferred cache-invalidate upcall. Snapshots the inode's vino at
// construction time (the Inode* may be gone by the time the finisher
// runs) and invokes Client::_async_invalidate() outside client_lock.
class C_Client_CacheInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t ino;        // captured vino (faked ino if enabled)
  int64_t offset, length; // byte range to invalidate; (0,0) = everything
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
4293
// Finisher-thread half of cache invalidation: forwards the range to the
// registered ino_invalidate_cb unless the client has left the
// mounting/mounted state.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
4303
4304 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4305
4306 if (ino_invalidate_cb)
4307 // we queue the invalidate, which calls the callback and decrements the ref
4308 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4309 }
4310
// Drop the entire userspace object cache for an inode and schedule the
// kernel page-cache invalidation upcall ((0,0) = whole file).
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
4324
// Range variant: discard cached extents (including in-flight writeback)
// covering [off, off+len) and schedule the matching kernel invalidation.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto object extents per the inode's striping layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
4338
4339 bool Client::_release(Inode *in)
4340 {
4341 ldout(cct, 20) << "_release " << *in << dendl;
4342 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4343 _invalidate_inode_cache(in);
4344 return true;
4345 }
4346 return false;
4347 }
4348
// Start flushing an inode's dirty buffered data. Completes `onfinish`
// immediately (and returns true) when there is nothing to flush, or when
// the pool is full — in that case the dirty data is purged and onfinish
// gets -CEPHFS_ENOSPC. Otherwise returns objectcacher->flush_set()'s
// result, with onfinish called when the flush finishes.
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // full pool: writeback would hang forever, drop the data instead
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-CEPHFS_ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
4370
// Synchronously flush buffered writes in [offset, offset+size).
// client_lock must be held; it is dropped and re-taken while waiting for
// the objectcacher flush to complete.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  // file_flush() returns true when the range was already clean/flushed
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
4389
4390 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4391 {
4392 // std::scoped_lock l(client_lock);
4393 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4394 Inode *in = static_cast<Inode *>(oset->parent);
4395 ceph_assert(in);
4396 _flushed(in);
4397 }
4398
// Buffer flush finished: drop the cap references that pinned the cache
// while dirty data was outstanding.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4405
4406
4407
4408 // checks common to add_update_cap, handle_cap_grant
4409 void Client::check_cap_issue(Inode *in, unsigned issued)
4410 {
4411 unsigned had = in->caps_issued();
4412
4413 if ((issued & CEPH_CAP_FILE_CACHE) &&
4414 !(had & CEPH_CAP_FILE_CACHE))
4415 in->cache_gen++;
4416
4417 if ((issued & CEPH_CAP_FILE_SHARED) !=
4418 (had & CEPH_CAP_FILE_SHARED)) {
4419 if (issued & CEPH_CAP_FILE_SHARED)
4420 in->shared_gen++;
4421 if (in->is_dir())
4422 clear_dir_complete_and_ordered(in, true);
4423 }
4424 }
4425
// Insert or refresh the cap for inode `in` held via `mds_session`.
// Handles first-cap snaprealm attachment, realm moves signalled by the
// auth MDS, stale-generation resets, auth-cap migration bookkeeping, and
// finally updates the cap's issued/wanted/seq fields. Wakes cap waiters
// if new bits were granted.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: attach it to its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // the auth MDS says the inode now belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // cap for this MDS already existed
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // a brand-new cap pins the inode
    inc_pinned_icaps();
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if it is newer (by mseq) than the
    // current one, migrating flushing/dirty-list membership to the new
    // auth session
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap) {
        if (in->flushing_cap_item.is_on_list()) {
          ldout(cct, 10) << __func__ << " changing auth cap: "
                         << "add myself to new auth MDS' flushing caps list" << dendl;
          adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
        }
        if (in->dirty_cap_item.is_on_list()) {
          ldout(cct, 10) << __func__ << " changing auth cap: "
                         << "add myself to new auth MDS' dirty caps list" << dendl;
          mds_session->get_dirty_list().push_back(&in->dirty_cap_item);
        }
      }

      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer mseq replaces wanted outright; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4532
// Tear down one cap. If queue_release, a cap-release record is queued on
// the session for the MDS; otherwise the pinned-icap counter is dropped.
// Clears auth_cap/flushing-list state when this was the auth cap, and
// closes the snaprealm linkage when the inode's last cap goes away.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  } else {
    dec_pinned_icaps();
  }


  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys *cap; null the local pointer so it
  // cannot be used after free
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4571
4572 void Client::remove_all_caps(Inode *in)
4573 {
4574 while (!in->caps.empty())
4575 remove_cap(&in->caps.begin()->second, true);
4576 }
4577
// Drop every cap held via session s (session reset/close/blocklist).
// Dirty and flushing state is abandoned (err describes why), file locks
// are flagged as errored, and cached data is purged or released depending
// on whether we were blocklisted.
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // keep the inode alive during teardown
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // data/metadata that was never flushed is lost; reset the flushing
      // bookkeeping and drop the ref that dirty state held on the inode
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -CEPHFS_EBLOCKLISTED) {
        // blocklisted: dirty data can never be written back, purge it and
        // surface the error asynchronously
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4627
4628 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4629 {
4630 uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure");
4631 bool abort_on_failure = false;
4632
4633 errno = 0;
4634 int r = remount_cb(callback_handle);
4635 if (r == 0) {
4636 retries_on_invalidate = 0;
4637 } else {
4638 int e = errno;
4639 client_t whoami = get_nodeid();
4640 if (r == -1) {
4641 lderr(cct) <<
4642 "failed to remount (to trim kernel dentries): "
4643 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4644 } else {
4645 lderr(cct) <<
4646 "failed to remount (to trim kernel dentries): "
4647 "return code = " << r << dendl;
4648 }
4649 bool should_abort =
4650 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4651 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4652 !(retry_on_error && (++retries_on_invalidate < max_retries));
4653 if (should_abort && !is_unmounting()) {
4654 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4655 abort_on_failure = true;
4656 }
4657 }
4658 return std::make_pair(r, abort_on_failure);
4659 }
4660
// Finisher context that performs the remount upcall off the client_lock;
// aborts the process when _do_remount() reports that repeated failures
// are fatal.
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    auto result = client->_do_remount(true);
    if (result.second) {
      ceph_abort();
    }
  }
};
4674
4675 void Client::_invalidate_kernel_dcache()
4676 {
4677 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4678 if (!mref_reader.is_state_satisfied())
4679 return;
4680
4681 if (can_invalidate_dentries) {
4682 if (dentry_invalidate_cb && root->dir) {
4683 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4684 p != root->dir->dentries.end();
4685 ++p) {
4686 if (p->second->inode)
4687 _schedule_invalidate_dentry_callback(p->second, false);
4688 }
4689 }
4690 } else if (remount_cb) {
4691 // Hacky:
4692 // when remounting a file system, linux kernel trims all unused dentries in the fs
4693 remount_finisher.queue(new C_Client_Remount(this));
4694 }
4695 }
4696
// If every dentry in the directory is a null (negative) dentry, unlink all
// expireable ones and close the dir once empty, so the inode's caps can be
// released. Recurses into an open ".snap" dir when one exists.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink(): the current entry may be erased
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4721
// Deferred inode-release upcall: captures the vino at construction time
// and calls Client::_async_inode_release() from the finisher thread,
// outside client_lock.
class C_Client_CacheRelease : public Context  {
private:
  Client *client;
  vinodeno_t ino;  // captured vino (faked ino if enabled)
public:
  C_Client_CacheRelease(Client *c, Inode *in) :
    client(c) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_inode_release(ino);
  }
};
4739
// Finisher-thread half of inode release: forwards to the registered
// ino_release_cb unless the client has left the mounting/mounted state.
void Client::_async_inode_release(vinodeno_t ino)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << dendl;
  ino_release_cb(callback_handle, ino);
}
4749
4750 void Client::_schedule_ino_release_callback(Inode *in) {
4751
4752 if (ino_release_cb)
4753 // we queue the invalidate, which calls the callback and decrements the ref
4754 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4755 }
4756
// The MDS asked us to hold at most `max` caps on session s. Drop
// disposable non-auth caps outright, and trim expireable dentries so the
// remaining inodes' caps can be released; finally poke the kernel dcache
// if we are still over budget.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
                 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // pin the inode while we work on it

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        // comma-expression: remove the cap, then null the dangling pointer
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // stays true only if every dentry was expireable
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      if (all && in->ino != CEPH_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4827
4828 void Client::force_session_readonly(MetaSession *s)
4829 {
4830 s->readonly = true;
4831 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4832 auto &in = (*p)->inode;
4833 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4834 signal_cond_list(in.waitfor_caps);
4835 }
4836 }
4837
// Move the inode's dirty caps into "flushing" state: allocate a flush
// tid, record it on both the inode and the auth session, and return the
// cap bits being flushed (the tid is written to *ptid). The caller sends
// the actual flush message.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4865
4866 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4867 {
4868 for (auto &p : in->cap_snaps) {
4869 CapSnap &capsnap = p.second;
4870 if (capsnap.flush_tid > 0) {
4871 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4872 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4873 }
4874 }
4875 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4876 it != in->flushing_cap_tids.end();
4877 ++it) {
4878 old_s->flushing_caps_tids.erase(it->first);
4879 new_s->flushing_caps_tids.insert(it->first);
4880 }
4881 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4882 }
4883
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance before check_caps(), which may unlink the inode from
      // dirty_list and invalidate the current position; the look-ahead
      // also tells us whether this is the session's last dirty inode
      ++p;
      if (p.end())
        flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4906
// Block until every cap flush on `in` with tid <= want has been acked by
// the MDS. flushing_cap_tids is an ordered map, so its first entry is the
// oldest outstanding flush.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything up to `want` has been flushed
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4920
// Block until every session's outstanding cap flushes up to tid `want`
// have completed. Restarts the scan from the top after each wakeup, since
// sessions and their tid sets may change while sleeping on sync_cond.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    auto s = p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      // client_lock is already held; adopt it so the condvar can release
      // and reacquire it, then relinquish ownership bookkeeping again
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4941
// Re-send this inode's in-flight cap flushes and capsnap flushes to the
// auth MDS (e.g. after reconnect). Entries in flushing_cap_tids with a
// zero value are capsnap flushes (see flush_snaps()); cap flushes older
// than the newest capsnap flush are tagged PENDING_CAPSNAP so the MDS
// preserves ordering.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest pending capsnap flush tid (value 0 == capsnap flush)
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // ordinary cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // capsnap flush; cap_snaps and the zero-valued tids stay in lockstep
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4975
// Re-send flushes for every inode on this session that was marked
// I_KICK_FLUSH (set by early_kick_flushing_caps()).
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4989
// Called during reconnect, before the reconnect message is sent. For each
// inode with in-flight flushes on this session: if none of the flushing
// caps were revoked, just mark the inode I_KICK_FLUSH (the flush is
// re-sent later by kick_flushing_caps(session)); otherwise re-send the
// flush now so the MDS processes it before issuing those caps elsewhere.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
5017
5018 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
5019 {
5020 list<SnapRealm*> q;
5021 q.push_back(realm);
5022
5023 while (!q.empty()) {
5024 realm = q.front();
5025 q.pop_front();
5026
5027 ldout(cct, 10) << __func__ << " " << *realm << dendl;
5028 realm->invalidate_cache();
5029
5030 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5031 p != realm->pchildren.end();
5032 ++p)
5033 q.push_back(*p);
5034 }
5035 }
5036
// Look up — or create — the snap realm for ino r and take a reference.
// The global snaprealm gets one extra ref at creation so it is not
// released until unmount.
SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  SnapRealm *realm = snap_realms[r];  // operator[] default-inserts nullptr if absent

  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
                 << (realm ? realm->nref : 0) << dendl;
  if (!realm) {
    snap_realms[r] = realm = new SnapRealm(r);

    // Do not release the global snaprealm until unmounting.
    if (r == CEPH_INO_GLOBAL_SNAPREALM)
      realm->nref++;
  }

  realm->nref++;
  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
                 << realm->nref << dendl;
  return realm;
}
5056
5057 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
5058 {
5059 if (snap_realms.count(r) == 0) {
5060 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
5061 return NULL;
5062 }
5063 SnapRealm *realm = snap_realms[r];
5064 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
5065 realm->nref++;
5066 return realm;
5067 }
5068
5069 void Client::put_snap_realm(SnapRealm *realm)
5070 {
5071 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
5072 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
5073 if (--realm->nref == 0) {
5074 snap_realms.erase(realm->ino);
5075 if (realm->pparent) {
5076 realm->pparent->pchildren.erase(realm);
5077 put_snap_realm(realm->pparent);
5078 }
5079 delete realm;
5080 }
5081 }
5082
5083 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
5084 {
5085 if (realm->parent != parent) {
5086 ldout(cct, 10) << __func__ << " " << *realm
5087 << " " << realm->parent << " -> " << parent << dendl;
5088 realm->parent = parent;
5089 if (realm->pparent) {
5090 realm->pparent->pchildren.erase(realm);
5091 put_snap_realm(realm->pparent);
5092 }
5093 realm->pparent = get_snap_realm(parent);
5094 realm->pparent->pchildren.insert(realm);
5095 return true;
5096 }
5097 return false;
5098 }
5099
5100 static bool has_new_snaps(const SnapContext& old_snapc,
5101 const SnapContext& new_snapc)
5102 {
5103 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
5104 }
5105
// Extra snap-realm metadata carried by SnapRealmInfoNew (only sent by
// MDSs with CEPHFS_FEATURE_NEW_SNAPREALM_INFO); absent on older MDSs,
// hence wrapped in std::optional by get_snap_realm_info().
struct SnapRealmInfoMeta {
  SnapRealmInfoMeta(utime_t last_modified, uint64_t change_attr)
    : last_modified(last_modified),
      change_attr(change_attr) {
  }

  utime_t last_modified;  // realm's last modification time
  uint64_t change_attr;   // realm's change counter
};
5115
5116 static std::pair<SnapRealmInfo, std::optional<SnapRealmInfoMeta>> get_snap_realm_info(
5117 MetaSession *session, bufferlist::const_iterator &p) {
5118 if (session->mds_features.test(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
5119 SnapRealmInfoNew ninfo;
5120 decode(ninfo, p);
5121 return std::make_pair(ninfo.info, SnapRealmInfoMeta(ninfo.last_modified, ninfo.change_attr));
5122 } else {
5123 SnapRealmInfo info;
5124 decode(info, p);
5125 return std::make_pair(info, std::nullopt);
5126 }
5127 }
5128
5129
5130 void Client::update_snap_trace(MetaSession *session, const bufferlist& bl, SnapRealm **realm_ret, bool flush)
5131 {
5132 SnapRealm *first_realm = NULL;
5133 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
5134
5135 map<SnapRealm*, SnapContext> dirty_realms;
5136
5137 auto p = bl.cbegin();
5138 while (!p.end()) {
5139 auto [info, realm_info_meta] = get_snap_realm_info(session, p);
5140 SnapRealm *realm = get_snap_realm(info.ino());
5141
5142 bool invalidate = false;
5143
5144 if (info.seq() > realm->seq ||
5145 (realm_info_meta && (*realm_info_meta).change_attr > realm->change_attr)) {
5146 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
5147 << dendl;
5148
5149 if (flush) {
5150 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
5151 // flush me + children
5152 list<SnapRealm*> q;
5153 q.push_back(realm);
5154 while (!q.empty()) {
5155 SnapRealm *realm = q.front();
5156 q.pop_front();
5157
5158 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5159 p != realm->pchildren.end();
5160 ++p)
5161 q.push_back(*p);
5162
5163 if (dirty_realms.count(realm) == 0) {
5164 realm->nref++;
5165 dirty_realms[realm] = realm->get_snap_context();
5166 }
5167 }
5168 }
5169
5170 // update
5171 realm->seq = info.seq();
5172 realm->created = info.created();
5173 realm->parent_since = info.parent_since();
5174 realm->prior_parent_snaps = info.prior_parent_snaps;
5175 if (realm_info_meta) {
5176 realm->last_modified = (*realm_info_meta).last_modified;
5177 realm->change_attr = (*realm_info_meta).change_attr;
5178 }
5179 realm->my_snaps = info.my_snaps;
5180 invalidate = true;
5181 }
5182
5183 // _always_ verify parent
5184 if (adjust_realm_parent(realm, info.parent()))
5185 invalidate = true;
5186
5187 if (invalidate) {
5188 invalidate_snaprealm_and_children(realm);
5189 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
5190 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5191 } else {
5192 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
5193 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5194 }
5195
5196 if (!first_realm)
5197 first_realm = realm;
5198 else
5199 put_snap_realm(realm);
5200 }
5201
5202 for (auto &[realm, snapc] : dirty_realms) {
5203 // if there are new snaps ?
5204 if (has_new_snaps(snapc, realm->get_snap_context())) {
5205 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
5206 for (auto&& in : realm->inodes_with_caps) {
5207 queue_cap_snap(in, snapc);
5208 }
5209 } else {
5210 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5211 }
5212 put_snap_realm(realm);
5213 }
5214
5215 if (realm_ret)
5216 *realm_ret = first_realm;
5217 else
5218 put_snap_realm(first_realm);
5219 }
5220
// Handle an MClientSnap notification from an MDS. For a SPLIT op, the
// inodes listed in split_inos are migrated out of their current realm
// into the newly announced realm (unless their realm is newer), and the
// child realms in split_realms are re-parented under it. The embedded
// snap trace is then applied via update_snap_trace().
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  // inodes to migrate into the split realm, keyed with their pre-split
  // snap context (used below to decide whether cap snaps must be queued)
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    auto p = m->bl.cbegin();
    auto [info, _] = get_snap_realm_info(session.get(), p);
    // the first realm in the trace must be the one being split off
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // never pull an inode out of a realm created after the split realm
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; skip flushing dirty caps for DESTROY
  update_snap_trace(session.get(), m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split realm, queueing snap
    // writeback where the move exposed new snapshots
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
5292
5293 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5294 {
5295 mds_rank_t mds = mds_rank_t(m->get_source().num());
5296
5297 std::scoped_lock cl(client_lock);
5298 auto session = _get_mds_session(mds, m->get_connection().get());
5299 if (!session) {
5300 return;
5301 }
5302
5303 got_mds_push(session.get());
5304
5305 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5306
5307 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5308 if (inode_map.count(vino)) {
5309 Inode *in = NULL;
5310 in = inode_map[vino];
5311
5312 if (in) {
5313 in->quota = m->quota;
5314 in->rstat = m->rstat;
5315 }
5316 }
5317 }
5318
// Entry point for all MClientCaps messages from an MDS. Applies any OSD
// epoch barrier, then dispatches on the cap operation. When the message
// refers to an inode (or a per-MDS cap) we no longer hold, an immediate
// cap release is sent back instead so an MDS waiting on it can proceed.
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session.get());

  bool do_cap_release = false;
  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;

    /* MDS maybe waiting for cap release with increased seq */
    switch (m->get_op()) {
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT:
        if (!in->caps.count(mds)) {
          do_cap_release = true;
          ldout(cct, 5) << __func__ << " vino " << vino << " don't have cap "
                        << m->get_cap_id() << " op " << m->get_op()
                        << ", immediately releasing" << dendl;
        }
    }
  } else {
    /* MDS maybe waiting for cap release with increased seq */
    switch (m->get_op()) {
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT:
        do_cap_release = true;
        ldout(cct, 5) << __func__ << " don't have vino " << vino << " op "
                      << m->get_op() << ", immediately releasing" << dendl;
        break;
      default:
        // nothing the MDS could be blocked on; just drop the message
        ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
        return;
    }
  }

  // In case the mds is waiting on e.g. a revocation
  if (do_cap_release) {
    session->enqueue_cap_release(
      m->get_ino(),
      m->get_cap_id(),
      m->get_seq(),
      m->get_mseq(),
      cap_epoch_barrier);

    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
    // IMPORT deliberately falls through: after the cap is installed the
    // grant half is handled via the caps.find() dispatch below
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
5408
// Handle CEPH_CAP_OP_IMPORT: this MDS has imported the inode's caps
// (e.g. after subtree migration from m->peer.mds). Apply the bundled
// snap trace, install/update the cap as the auth cap for this session,
// then drop the now-stale cap still held from the exporting peer.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  // remember the cap we hold from the exporting MDS (if any) so it can
  // be removed once the imported cap is installed
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(session, m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
                 issued, wanted, m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exported cap, only if it still matches the peer's cap id
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // the importing MDS doesn't know about our earlier max_size request;
    // clear it so check_caps() re-requests when needed
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
        in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5451
// Handle CEPH_CAP_OP_EXPORT: mds is giving up this inode's caps. If the
// message names a peer (m->peer.cap_id != 0), transfer/merge our cap
// state onto the peer's session; otherwise just record that caps were
// dropped. The exporting MDS's cap is removed in all cases.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // only act if the export refers to the cap we actually hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        const auto peer_mds = mds_rank_t(m->peer.mds);
        auto tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          // we already hold a cap from the target MDS: fold the exported
          // bits into it, but only if the peer's view is newer (seq-wise)
          Cap &tcap = it->second;
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            // in-flight flushes must follow the auth cap to the new session
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession.get());
          }
        } else {
          // no cap from the target yet: create one carrying the exported bits
          add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the caps simply go away; remember that we lost them
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5496
5497 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5498 {
5499 mds_rank_t mds = session->mds_num;
5500 ceph_assert(in->caps.count(mds));
5501
5502 uint64_t size = m->get_size();
5503 if (in->is_fscrypt_enabled()) {
5504 size = std::stoll(std::string(std::rbegin(m->fscrypt_file),
5505 std::rend(m->fscrypt_file)));
5506 }
5507 ldout(cct, 10) << __func__ << " on ino " << *in
5508 << " size " << in->size << " -> " << m->get_size()
5509 << dendl;
5510
5511 int issued;
5512 in->caps_issued(&issued);
5513 issued |= in->caps_dirty();
5514 update_inode_file_size(in, issued, size, m->get_truncate_seq(),
5515 m->get_truncate_size());
5516 }
5517
5518 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5519 {
5520 ceph_tid_t flush_ack_tid = m->get_client_tid();
5521 int dirty = m->get_dirty();
5522 int cleaned = 0;
5523 int flushed = 0;
5524
5525 auto it = in->flushing_cap_tids.begin();
5526 if (it->first < flush_ack_tid) {
5527 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5528 << " got unexpected flush ack tid " << flush_ack_tid
5529 << " expected is " << it->first << dendl;
5530 }
5531 for (; it != in->flushing_cap_tids.end(); ) {
5532 if (!it->second) {
5533 // cap snap
5534 ++it;
5535 continue;
5536 }
5537 if (it->first == flush_ack_tid)
5538 cleaned = it->second;
5539 if (it->first <= flush_ack_tid) {
5540 session->flushing_caps_tids.erase(it->first);
5541 in->flushing_cap_tids.erase(it++);
5542 ++flushed;
5543 continue;
5544 }
5545 cleaned &= ~it->second;
5546 if (!cleaned)
5547 break;
5548 ++it;
5549 }
5550
5551 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5552 << " cleaned " << ccap_string(cleaned) << " on " << *in
5553 << " with " << ccap_string(dirty) << dendl;
5554
5555 if (flushed) {
5556 signal_cond_list(in->waitfor_caps);
5557 if (session->flushing_caps_tids.empty() ||
5558 *session->flushing_caps_tids.begin() > flush_ack_tid)
5559 sync_cond.notify_all();
5560 }
5561
5562 if (!dirty) {
5563 in->cap_dirtier_uid = -1;
5564 in->cap_dirtier_gid = -1;
5565 }
5566
5567 if (!cleaned) {
5568 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5569 } else {
5570 if (in->flushing_caps) {
5571 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5572 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5573 in->flushing_caps &= ~cleaned;
5574 if (in->flushing_caps == 0) {
5575 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5576 num_flushing_caps--;
5577 if (in->flushing_cap_tids.empty())
5578 in->flushing_cap_item.remove_myself();
5579 }
5580 if (!in->caps_dirty())
5581 put_inode(in);
5582 }
5583 }
5584 }
5585
5586
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS has persisted the cap snap
// identified by `follows`. Drop the matching CapSnap (when the flush
// tid matches) and wake waiters; a mismatched or unknown tid is treated
// as a duplicate ack and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // hold a ref: erasing the cap snap may drop the inode's last ref
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
                    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
        in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake fsync()-style waiters once nothing older remains in flight
      if (session->flushing_caps_tids.empty() ||
          *session->flushing_caps_tids.begin() > flush_ack_tid)
        sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
                  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5619
5620 class C_Client_DentryInvalidate : public Context {
5621 private:
5622 Client *client;
5623 vinodeno_t dirino;
5624 vinodeno_t ino;
5625 string name;
5626 public:
5627 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5628 client(c), name(dn->name) {
5629 if (client->use_faked_inos()) {
5630 dirino.ino = dn->dir->parent_inode->faked_ino;
5631 if (del)
5632 ino.ino = dn->inode->faked_ino;
5633 } else {
5634 dirino = dn->dir->parent_inode->vino();
5635 if (del)
5636 ino = dn->inode->vino();
5637 }
5638 if (!del)
5639 ino.ino = inodeno_t();
5640 }
5641 void finish(int r) override {
5642 // _async_dentry_invalidate is responsible for its own locking
5643 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5644 client->_async_dentry_invalidate(dirino, ino, name);
5645 }
5646 };
5647
5648 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5649 {
5650 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5651 if (!mref_reader.is_state_satisfied())
5652 return;
5653
5654 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5655 << " in dir " << dirino << dendl;
5656 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5657 }
5658
5659 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5660 {
5661 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5662 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5663 }
5664
// Try to drop cached references pinning `in` so it can be trimmed:
// expire child dentries (for directories), recurse into an open snapdir,
// and finally unlink the inode's own dentries, optionally scheduling
// kernel dcache invalidation (sched_inval).
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance first: unlink() below may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released one reference on `in`
    }
  }

  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    // an open ".snap" dir also pins us; trim it too
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    // still referenced: drop the inode's own dentries
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance first: unlink() below erases the current dentry
      if( in->ll_ref > 0 && sched_inval) {
        // FIXME: we play lots of unlink/link tricks when handling MDS replies,
        // so in->dentries doesn't always reflect the state of kernel's dcache.
        _schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5711
// Handle CEPH_CAP_OP_GRANT / CEPH_CAP_OP_REVOKE (and the grant half of
// IMPORT): refresh the cap's seq/gen, absorb the shared metadata carried
// by the message (mode/owner, nlink, xattrs, dirstat, times, size/layout,
// inline data, change_attr, max_size), then reconcile the issued cap
// bits. On revocation, dirty/cached file data is flushed or released
// before the revoke is acked via check_caps().
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();
  int flags = 0;

  const unsigned new_caps = m->get_caps();
  // a cap whose gen lags the session's went stale at some point
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // accept shared auth fields only if we don't hold the EXCL cap
  // (our locally dirtied values would be newer than the MDS's)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0)
      deleted_inode = true;  // trimmed at the very end, caps settled first
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
                           m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
                           m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
        !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // revoking the cache cap: drop cached data before acking
      if (_release(in)) {
        check = true;
        flags = CHECK_CAPS_NODELAY;
      }
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
      flags = CHECK_CAPS_NODELAY;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
        if (&p.second == cap)
          continue;
        if (p.second.implemented & ~p.second.issued & new_caps) {
          check = true;
          break;
        }
      }
    }
  }

  // just in case the caps was released just before we get the revoke msg
  if (!check && m->get_op() == CEPH_CAP_OP_REVOKE) {
    cap->wanted = 0; // don't let check_caps skip sending a response to MDS
    check = true;
    flags = CHECK_CAPS_NODELAY;
  }

  if (check)
    check_caps(in, flags);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5877
5878 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5879 {
5880 if (perms.uid() == 0) {
5881 // For directories, DACs are overridable.
5882 // For files, Read/write DACs are always overridable but executable DACs are
5883 // overridable when there is at least one exec bit set
5884 if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
5885 return -CEPHFS_EACCES;
5886 return 0;
5887 }
5888
5889 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5890 int ret = _posix_acl_permission(in, perms, want);
5891 if (ret != -CEPHFS_EAGAIN)
5892 return ret;
5893 }
5894
5895 // check permissions before doing anything else
5896 if (!in->check_mode(perms, want))
5897 return -CEPHFS_EACCES;
5898 return 0;
5899 }
5900
5901 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5902 const UserPerm& perms)
5903 {
5904 int r = _getattr_for_perm(in, perms);
5905 if (r < 0)
5906 goto out;
5907
5908 r = 0;
5909 if (strncmp(name, "system.", 7) == 0) {
5910 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5911 r = -CEPHFS_EPERM;
5912 } else {
5913 r = inode_permission(in, perms, want);
5914 }
5915 out:
5916 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5917 return r;
5918 }
5919
5920 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5921 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5922 return out;
5923 }
5924
// Permission check for setattr: verifies `perms` may apply the changes
// selected by `mask` to `in`. Returns 0 on success, -CEPHFS_EPERM /
// -CEPHFS_EACCES (or a getattr error) otherwise. Side effect: may clear
// S_ISGID from stx->stx_mode for unprivileged chmod.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " stx_mode: "
		 << hex << stx->stx_mode << " mask:" << mask << dec << dendl;
  // refresh uid/gid/mode from the MDS before checking against them
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -CEPHFS_EPERM;
  // chown: only root may change the owner (the owner may only "change"
  // the uid to itself)
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: the owner may switch to a group they belong to; root may do anything
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    uint32_t m = ~stx->stx_mode & in->mode; // mode bits removed
    ldout(cct, 20) << __func__ << " " << *in << " = " << hex << m << dec << dendl;
    if (perms.uid() != 0 && perms.uid() != in->uid &&
	/*
	 * Currently the kernel fuse and libfuse code is buggy and
	 * won't pass the ATTR_KILL_SUID/ATTR_KILL_SGID to ceph-fuse.
	 * But will just set the ATTR_MODE and at the same time by
	 * clearing the suid/sgid bits.
	 *
	 * Only allow unprivileged users to clear S_ISUID and S_ISUID.
	 */
	(m & ~(S_ISUID | S_ISGID)))
      goto out;

    // drop setgid when the caller isn't in the file's (possibly new) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only set "now"-style timestamps, and then only
      // with write permission (utimes(2) semantics)
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5993
5994 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5995 {
5996 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5997 unsigned want = 0;
5998
5999 if ((flags & O_ACCMODE) == O_WRONLY)
6000 want = MAY_WRITE;
6001 else if ((flags & O_ACCMODE) == O_RDWR)
6002 want = MAY_READ | MAY_WRITE;
6003 else if ((flags & O_ACCMODE) == O_RDONLY)
6004 want = MAY_READ;
6005 if (flags & O_TRUNC)
6006 want |= MAY_WRITE;
6007
6008 int r = 0;
6009 switch (in->mode & S_IFMT) {
6010 case S_IFLNK:
6011 r = -CEPHFS_ELOOP;
6012 goto out;
6013 case S_IFDIR:
6014 if (want & MAY_WRITE) {
6015 r = -CEPHFS_EISDIR;
6016 goto out;
6017 }
6018 break;
6019 }
6020
6021 r = _getattr_for_perm(in, perms);
6022 if (r < 0)
6023 goto out;
6024
6025 r = inode_permission(in, perms, want);
6026 out:
6027 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
6028 return r;
6029 }
6030
6031 int Client::may_lookup(Inode *dir, const UserPerm& perms)
6032 {
6033 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
6034 int r = _getattr_for_perm(dir, perms);
6035 if (r < 0)
6036 goto out;
6037
6038 r = inode_permission(dir, perms, MAY_EXEC);
6039 out:
6040 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6041 return r;
6042 }
6043
6044 int Client::may_create(Inode *dir, const UserPerm& perms)
6045 {
6046 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
6047 int r = _getattr_for_perm(dir, perms);
6048 if (r < 0)
6049 goto out;
6050
6051 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
6052 out:
6053 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6054 return r;
6055 }
6056
6057 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
6058 {
6059 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
6060 int r = _getattr_for_perm(dir, perms);
6061 if (r < 0)
6062 goto out;
6063
6064 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
6065 if (r < 0)
6066 goto out;
6067
6068 /* 'name == NULL' means rmsnap w/o permission checks */
6069 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
6070 InodeRef otherin;
6071 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
6072 if (r < 0)
6073 goto out;
6074 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
6075 r = -CEPHFS_EPERM;
6076 }
6077 out:
6078 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6079 return r;
6080 }
6081
6082 int Client::may_delete(const char *relpath, const UserPerm& perms) {
6083 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
6084
6085 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6086 if (!mref_reader.is_state_satisfied())
6087 return -CEPHFS_ENOTCONN;
6088
6089 filepath path(relpath);
6090 string name = path.last_dentry();
6091 path.pop_dentry();
6092 InodeRef dir;
6093
6094 std::scoped_lock lock(client_lock);
6095 int r = path_walk(path, &dir, perms);
6096 if (r < 0)
6097 return r;
6098 if (cct->_conf->client_permissions) {
6099 int r = may_delete(dir.get(), name.c_str(), perms);
6100 if (r < 0)
6101 return r;
6102 }
6103
6104 return 0;
6105 }
6106
6107 int Client::may_hardlink(Inode *in, const UserPerm& perms)
6108 {
6109 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
6110 int r = _getattr_for_perm(in, perms);
6111 if (r < 0)
6112 goto out;
6113
6114 if (perms.uid() == 0 || perms.uid() == in->uid) {
6115 r = 0;
6116 goto out;
6117 }
6118
6119 r = -CEPHFS_EPERM;
6120 if (!S_ISREG(in->mode))
6121 goto out;
6122
6123 if (in->mode & S_ISUID)
6124 goto out;
6125
6126 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
6127 goto out;
6128
6129 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
6130 out:
6131 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
6132 return r;
6133 }
6134
6135 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
6136 {
6137 int mask = CEPH_STAT_CAP_MODE;
6138 bool force = false;
6139 if (acl_type != NO_ACL) {
6140 mask |= CEPH_STAT_CAP_XATTR;
6141 force = in->xattr_version == 0;
6142 }
6143 return _getattr(in, mask, perms, force);
6144 }
6145
6146 vinodeno_t Client::_get_vino(Inode *in)
6147 {
6148 /* The caller must hold the client lock */
6149 return vinodeno_t(in->ino, in->snapid);
6150 }
6151
6152 /**
6153 * Resolve an MDS spec to a list of MDS daemon GIDs.
6154 *
6155 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
6156 * It may be '*' in which case it matches all GIDs.
6157 *
6158 * If no error is returned, the `targets` vector will be populated with at least
6159 * one MDS.
6160 */
// Resolve an MDS spec string (role, GID, '*', or daemon name) into a list
// of daemon GIDs.  Tries, in order: role parse ("rank"/"fs:rank"), numeric
// GID, the '*' wildcard (all daemons), and finally a name lookup.
// Returns 0 and fills *targets with >= 1 GID, or -CEPHFS_ENOENT.
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) Try to parse the spec as a role (rank or filesystem:rank).
  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  // 2) A clean integer parse is treated as a candidate GID.
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
                     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
                 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // 3) It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // 4) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == mds_gid_t{0}) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
                     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
6229
6230
6231 /**
6232 * Authenticate with mon and establish global ID
6233 */
6234 int Client::authenticate()
6235 {
6236 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6237
6238 if (monclient->is_authenticated()) {
6239 return 0;
6240 }
6241
6242 client_lock.unlock();
6243 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
6244 client_lock.lock();
6245 if (r < 0) {
6246 return r;
6247 }
6248
6249 whoami = monclient->get_global_id();
6250 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6251
6252 return 0;
6253 }
6254
// Fetch the latest FSMap (or the trimmed "fsmap.user" variant) from the
// monitors and block until our cached copy is at least that new.
// Caller must hold client_lock; the lock is dropped around blocking
// monitor calls.  Returns 0 or a negative error.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // Drop the client lock around the blocking get_version call; retry
    // while the monitor reports transient unavailability.
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // One-shot subscription to "fsmap.user"; wait_on_list blocks until the
    // map handler signals waiting_for_fsmap with a new map.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // Same dance for the full FSMap.
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
                 << fsmap_latest << dendl;
  return 0;
}
6299
6300 /**
6301 *
6302 * @mds_spec one of ID, rank, GID, "*"
6303 *
6304 */
// Send an admin command to the MDS daemon(s) matched by mds_spec.
// `onfinish` fires once every targeted daemon has replied (via the gather);
// outbl/outs receive the (last) reply payload through MDSCommandOp.
// Laggy daemons are skipped; if all targets are laggy the call fails with
// -CEPHFS_ENOENT and an explanatory *outs.
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto& gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -CEPHFS_ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto& target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // NOTE(review): client_lock is dropped before taking command_lock —
    // presumably to respect lock ordering between the two; confirm before
    // reordering anything in this loop.
    cl.unlock();
    {
      std::scoped_lock cmd_lock(command_lock);
      // Generate MDSCommandOp state
      auto &op = command_table.start_command();

      op.on_finish = gather.new_sub();
      op.cmd = cmd;
      op.outbl = outbl;
      op.outs = outs;
      op.inbl = inbl;
      op.mds_gid = target_gid;
      op.con = conn;

      ldout(cct, 4) << __func__ << ": new command op to " << target_gid
        << " tid=" << op.tid << cmd << dendl;

      // Construct and send MCommand
      MessageRef m = op.get_message(monclient->get_fsid());
      conn->send_message2(std::move(m));
    }
    cl.lock();
  }
  gather.activate();

  return 0;
}
6392
6393 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6394 {
6395 ceph_tid_t const tid = m->get_tid();
6396
6397 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6398
6399 std::scoped_lock cmd_lock(command_lock);
6400 if (!command_table.exists(tid)) {
6401 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6402 return;
6403 }
6404
6405 auto &op = command_table.get_command(tid);
6406 if (op.outbl) {
6407 *op.outbl = m->get_data();
6408 }
6409 if (op.outs) {
6410 *op.outs = m->rs;
6411 }
6412
6413 if (op.on_finish) {
6414 op.on_finish->complete(m->r);
6415 }
6416
6417 command_table.erase(tid);
6418 }
6419
6420 // -------------------
6421 // MOUNT
6422
6423 int Client::subscribe_mdsmap(const std::string &fs_name)
6424 {
6425 int r = authenticate();
6426 if (r < 0) {
6427 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6428 return r;
6429 }
6430
6431 std::string resolved_fs_name;
6432 if (fs_name.empty()) {
6433 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6434 if (resolved_fs_name.empty())
6435 // Try the backwards compatibility fs name option
6436 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6437 } else {
6438 resolved_fs_name = fs_name;
6439 }
6440
6441 std::string want = "mdsmap";
6442 if (!resolved_fs_name.empty()) {
6443 r = fetch_fsmap(true);
6444 if (r < 0)
6445 return r;
6446 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6447 if (fscid == FS_CLUSTER_ID_NONE) {
6448 return -CEPHFS_ENOENT;
6449 }
6450
6451 std::ostringstream oss;
6452 oss << want << "." << fscid;
6453 want = oss.str();
6454 }
6455 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6456
6457 monclient->sub_want(want, 0, 0);
6458 monclient->renew_subs();
6459
6460 return 0;
6461 }
6462
// Mount the filesystem: subscribe to the MDSMap, optionally wait for an
// available MDS cluster, then resolve the mount root and walk its ancestry.
// Returns 0 on success (idempotent if already mounting/mounted), a negative
// CEPHFS error, or CEPH_FUSE_NO_MDS_UP when require_mds finds the cluster
// stuck unavailable.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
                  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Block until the MDS cluster is usable; bail out hard if it is stuck.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  if(mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
    lderr(cct) << "connections cannot be made while"
                  " the flag refuse_client_session is set" << dendl;
    return -CEPHFS_EACCES;
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // Getattr the mount point, then each ancestor up to the root — presumably
  // to prime the quota/snap-realm ancestry (TODO confirm); an EACCES on a
  // parent is tolerated once the mount root itself resolved.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6565
6566 // UNMOUNT
6567
// Close all MDS sessions, waiting (bounded by client_shutdown_timeout)
// for the close acks; sessions that never respond are force-closed with
// ETIMEDOUT.  Caller holds client_lock.
void Client::_close_sessions()
{
  // Rejected sessions have nothing to close; just drop them.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
        _close_mds_session(p.second.get());
        mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Adopt the already-held client_lock so the condvar can wait on it;
    // ownership is handed back (without unlocking) via l.release() below.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // Timeout of 0 means wait indefinitely.
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
        auto session = mds_sessions.at(*mds_ranks_closing.begin());
        // this prunes entry from mds_sessions and mds_ranks_closing
        _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6606
6607 void Client::flush_mdlog_sync(Inode *in)
6608 {
6609 if (in->unsafe_ops.empty()) {
6610 return;
6611 }
6612
6613 std::set<mds_rank_t> anchor;
6614 for (auto &&p : in->unsafe_ops) {
6615 anchor.emplace(p->mds);
6616 }
6617 if (in->auth_cap) {
6618 anchor.emplace(in->auth_cap->session->mds_num);
6619 }
6620
6621 for (auto &rank : anchor) {
6622 auto session = &mds_sessions.at(rank);
6623 flush_mdlog(session->get());
6624 }
6625 }
6626
6627 void Client::flush_mdlog_sync()
6628 {
6629 if (mds_requests.empty())
6630 return;
6631 for (auto &p : mds_sessions) {
6632 flush_mdlog(p.second.get());
6633 }
6634 }
6635
6636 void Client::flush_mdlog(MetaSession *session)
6637 {
6638 // Only send this to Luminous or newer MDS daemons, older daemons
6639 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6640 const uint64_t features = session->con->get_features();
6641 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6642 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6643 session->con->send_message2(std::move(m));
6644 }
6645 }
6646
6647
// Abort every in-flight MDS request with `err`, wake their callers, and
// force-close all sessions.  Requests that already went unsafe are left to
// be cleaned up by the session close path.
void Client::_abort_mds_sessions(int err)
{
  // Advance the iterator before touching the request: abort()/notify may
  // lead to the entry being removed.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions; _closed_mds_session erases the map entry,
  // so keep taking begin() until the map drains.
  while(!mds_sessions.empty()) {
    auto session = mds_sessions.begin()->second;
    _closed_mds_session(session.get(), err);
  }
}
6675
// Tear down the mount.  With abort=false this is a graceful unmount (flush
// journal, dirty data and caps, then close sessions); with abort=true (or
// when blocklisted) in-flight state is aborted/dropped instead of flushed.
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for outstanding write requests to complete; read requests are not
  // waited on.
  mount_cond.wait(lock, [this] {
    // Only wait for write OPs
    for (auto& [tid, req] : mds_requests) {
      if (req->is_write()) {
        ldout(cct, 10) << "waiting for write request '" << tid
                       << "' to complete, currently there are "
                       << mds_requests.size()
                       << " outstanding read/write requests"
                       << dendl;
        return false;
      }
    }
    return true;
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // The MDS would not accept our flushes anyway: drop dirty caps rather
    // than flushing them.
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
        Inode *in = *p;
        ++p;
        if (in->dirty_caps) {
          ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
          in->mark_caps_clean();
          put_inode(in);
        }
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait for the cache to drain completely; dump it for debugging if we
  // stay stuck for more than 5 seconds (likely pending cap releases).
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
        r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  // release the global snapshot realm
  SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
  if (global_realm) {
    ceph_assert(global_realm->nref == 1);
    put_snap_realm(global_realm);
  }

  mref_writer.update_state(CLIENT_UNMOUNTED);

  /*
   * Stop the remount_queue before clearing the mountpoint memory
   * to avoid possible use-after-free bug.
   */
  if (remount_cb) {
    ldout(cct, 10) << "unmount stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
    remount_cb = nullptr;
  }

  ldout(cct, 2) << "unmounted." << dendl;
}
6847
// Graceful unmount: flush dirty data and caps, then close sessions cleanly.
void Client::unmount()
{
  _unmount(false);
}
6852
// Forced teardown: abort in-flight MDS requests and drop dirty state
// instead of flushing it.
void Client::abort_conn()
{
  _unmount(true);
}
6857
6858 void Client::flush_cap_releases()
6859 {
6860 uint64_t nr_caps = 0;
6861
6862 // send any cap releases
6863 for (auto &p : mds_sessions) {
6864 auto session = p.second;
6865 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6866 p.first)) {
6867 nr_caps += session->release->caps.size();
6868 if (cct->_conf->client_inject_release_failure) {
6869 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6870 } else {
6871 session->con->send_message2(std::move(session->release));
6872 }
6873 session->release.reset();
6874 }
6875 }
6876
6877 if (nr_caps > 0) {
6878 dec_pinned_icaps(nr_caps);
6879 }
6880 }
6881
6882 void Client::renew_and_flush_cap_releases()
6883 {
6884 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6885
6886 if (!mount_aborted && mdsmap->get_epoch()) {
6887 // renew caps?
6888 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6889 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
6890 renew_caps();
6891
6892 flush_cap_releases();
6893 }
6894 }
6895
// Periodic upkeep, driven by the tick thread: times out stalled mounts,
// renews caps / flushes releases, re-checks delayed caps, sends metrics,
// trims caches, and auto-reconnects after a blocklist.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  auto now = ceph::coarse_mono_clock::now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // Abort the oldest pending request once mounting has stalled past the
    // mount timeout, and wake anything waiting on it.
    MetaRequest *req = mds_requests.begin()->second;

    if (req->created + mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    // The early break assumes entries are ordered by hold_caps_until —
    // TODO(review) confirm the list ordering invariant.
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // Auto-reconnect after a blocklist: at most once every 30 minutes, and
  // only when client_reconnect_stale is enabled.
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6951
// Launch the upkeep thread, which calls tick() roughly every
// client_tick_interval seconds until tick_thread_stopped is set
// (see _unmount()).  The thread owns client_lock while awake and
// releases it while waiting on upkeep_cond.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      // client_debug_inject_tick_delay lets tests stretch the effective
      // interval beyond the configured tick interval.
      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Allow a 10% early wakeup slack; otherwise sleep the remainder.
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6982
// Entry point for periodic metric submission (called from tick()).
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6992
6993 void Client::collect_and_send_global_metrics() {
6994 ldout(cct, 20) << __func__ << dendl;
6995 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6996
6997 /* Do not send the metrics until the MDS rank is ready */
6998 if (!mdsmap->is_active((mds_rank_t)0)) {
6999 ldout(cct, 5) << __func__ << " MDS rank 0 is not ready yet -- not sending metric"
7000 << dendl;
7001 return;
7002 }
7003
7004 if (!have_open_session((mds_rank_t)0)) {
7005 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
7006 << dendl;
7007 return;
7008 }
7009 auto session = _get_or_open_mds_session((mds_rank_t)0);
7010 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
7011 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
7012 return;
7013 }
7014
7015 ClientMetricMessage metric;
7016 std::vector<ClientMetricMessage> message;
7017
7018 // read latency
7019 if (_collect_and_send_global_metrics ||
7020 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
7021 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
7022 logger->tget(l_c_rd_avg),
7023 logger->get(l_c_rd_sqsum),
7024 nr_read_request));
7025 message.push_back(metric);
7026 }
7027
7028 // write latency
7029 if (_collect_and_send_global_metrics ||
7030 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
7031 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
7032 logger->tget(l_c_wr_avg),
7033 logger->get(l_c_wr_sqsum),
7034 nr_write_request));
7035 message.push_back(metric);
7036 }
7037
7038 // metadata latency
7039 if (_collect_and_send_global_metrics ||
7040 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
7041 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
7042 logger->tget(l_c_md_avg),
7043 logger->get(l_c_md_sqsum),
7044 nr_metadata_request));
7045 message.push_back(metric);
7046 }
7047
7048 // cap hit ratio -- nr_caps is unused right now
7049 if (_collect_and_send_global_metrics ||
7050 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
7051 auto [cap_hits, cap_misses] = get_cap_hit_rates();
7052 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
7053 message.push_back(metric);
7054 }
7055
7056 // dentry lease hit ratio
7057 if (_collect_and_send_global_metrics ||
7058 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
7059 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
7060 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
7061 message.push_back(metric);
7062 }
7063
7064 // opened files
7065 if (_collect_and_send_global_metrics ||
7066 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
7067 auto [opened_files, total_inodes] = get_opened_files_rates();
7068 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
7069 message.push_back(metric);
7070 }
7071
7072 // pinned i_caps
7073 if (_collect_and_send_global_metrics ||
7074 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
7075 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
7076 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
7077 message.push_back(metric);
7078 }
7079
7080 // opened inodes
7081 if (_collect_and_send_global_metrics ||
7082 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
7083 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
7084 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
7085 message.push_back(metric);
7086 }
7087
7088 // read io sizes
7089 if (_collect_and_send_global_metrics ||
7090 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
7091 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
7092 total_read_size));
7093 message.push_back(metric);
7094 }
7095
7096 // write io sizes
7097 if (_collect_and_send_global_metrics ||
7098 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
7099 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
7100 total_write_size));
7101 message.push_back(metric);
7102 }
7103
7104 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
7105 }
7106
7107 void Client::renew_caps()
7108 {
7109 ldout(cct, 10) << "renew_caps()" << dendl;
7110 last_cap_renew = ceph::coarse_mono_clock::now();
7111
7112 for (auto &p : mds_sessions) {
7113 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
7114 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
7115 renew_caps(p.second.get());
7116 }
7117 }
7118
// Send a RENEWCAPS request to one MDS session.  The incremented
// cap_renew_seq lets the eventual reply be matched to this request.
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
7126
7127
7128 // ===============================================================
7129 // high level (POSIXy) interface
7130
7131 int Client::_do_lookup(Inode *dir, const string& name, int mask,
7132 InodeRef *target, const UserPerm& perms)
7133 {
7134 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
7135 MetaRequest *req = new MetaRequest(op);
7136 filepath path;
7137 dir->make_nosnap_relative_path(path);
7138 path.push_dentry(name);
7139 req->set_filepath(path);
7140 req->set_inode(dir);
7141 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
7142 mask |= DEBUG_GETATTR_CAPS;
7143 req->head.args.getattr.mask = mask;
7144
7145 ldout(cct, 10) << __func__ << " on " << path << dendl;
7146
7147 int r = make_request(req, perms, target);
7148 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7149 return r;
7150 }
7151
7152 bool Client::_dentry_valid(const Dentry *dn)
7153 {
7154 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7155
7156 // is dn lease valid?
7157 utime_t now = ceph_clock_now();
7158 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
7159 mds_sessions.count(dn->lease_mds)) {
7160 auto s = mds_sessions.at(dn->lease_mds);
7161 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
7162 dlease_hit();
7163 return true;
7164 }
7165
7166 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
7167 << " vs lease_gen " << dn->lease_gen << dendl;
7168 }
7169
7170 dlease_miss();
7171 return false;
7172 }
7173
/* Resolve dname within dir. Fast paths handle "..", ".", the snapdir
 * name, and cached dentries protected by a valid lease or by the
 * directory's shared cap; otherwise an MDS LOOKUP is issued and the
 * cache re-consulted. On success *target holds the inode and, if
 * requested, *alternate_name the dentry's alternate name. Returns 0 or
 * a negative CEPHFS_* error. When is_rename is false, a dentry that is
 * mid-rename makes us wait for the rename to finish before answering. */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms, std::string* alternate_name,
		    bool is_rename)
{
  int r = 0;
  Dentry *dn = NULL;
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no cached parent linkage: ask a random in MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	*target = std::move(tmptarget);
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// fall back to the directory itself on failure
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // the configured snapshot-dir name maps to the synthetic snapdir inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    // we have a cached dentry; decide whether we may trust it
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
		   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
	// touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	// make trim_caps() behave.
	dir->try_touch_cap(dn->lease_mds);
	goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// a complete, shared-cap directory lets us answer ENOENT locally
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -CEPHFS_ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }

    // In rare case during the rename if another thread tries to
    // lookup the dst dentry, it may get an inconsistent result
    // that both src dentry and dst dentry will link to the same
    // inode at the same time.
    // Will wait the rename to finish and try it again.
    if (!is_rename && dn->is_renaming) {
      ldout(cct, 1) << __func__ << " dir " << *dir
		    << " rename is on the way, will wait for dn '"
		    << dname << "'" << dendl;
      wait_on_list(waiting_for_rename);
      goto relookup;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // only one MDS lookup per call: if we already did it, accept the cache state
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

hit_dn:
  // trusted cached dentry: hand back its inode (or ENOENT for a null dentry)
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
7308
7309 Dentry *Client::get_or_create(Inode *dir, const char* name)
7310 {
7311 // lookup
7312 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7313 dir->open_dir();
7314 if (dir->dir->dentries.count(name))
7315 return dir->dir->dentries[name];
7316 else // otherwise link up a new one
7317 return link(dir->dir, name, NULL, NULL);
7318 }
7319
7320 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7321 {
7322 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7323 if (!mref_reader.is_state_satisfied())
7324 return -CEPHFS_ENOTCONN;
7325
7326 ldout(cct, 10) << __func__ << ": " << path << dendl;
7327
7328 std::scoped_lock lock(client_lock);
7329
7330 return path_walk(path, wdr, perms, followsym);
7331 }
7332
7333 int Client::path_walk(const filepath& origpath, InodeRef *end,
7334 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
7335 {
7336 walk_dentry_result wdr;
7337 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
7338 *end = std::move(wdr.in);
7339 return rc;
7340 }
7341
/* Walk origpath component by component, starting from the root (absolute
 * paths), the cwd, or dirinode if given. Symlinks in the middle of the
 * path are always resolved; a trailing symlink is resolved only when
 * followsym is true. mask requests extra caps on the final component.
 * On success fills *result (inode + alternate name) and returns 0,
 * otherwise returns a negative CEPHFS_* error. */
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
		      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  // choose the starting inode for the walk
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;  // total symlinks traversed, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// absolute symlink target restarts the walk at the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7430
7431
7432 // namespace ops
7433
7434 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7435 {
7436 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7437 if (!mref_reader.is_state_satisfied())
7438 return -CEPHFS_ENOTCONN;
7439
7440 tout(cct) << "link" << std::endl;
7441 tout(cct) << relexisting << std::endl;
7442 tout(cct) << relpath << std::endl;
7443
7444 filepath existing(relexisting);
7445
7446 InodeRef in, dir;
7447
7448 std::scoped_lock lock(client_lock);
7449 int r = path_walk(existing, &in, perm, true);
7450 if (r < 0)
7451 return r;
7452 if (std::string(relpath) == "/") {
7453 r = -CEPHFS_EEXIST;
7454 return r;
7455 }
7456 filepath path(relpath);
7457 string name = path.last_dentry();
7458 path.pop_dentry();
7459
7460 r = path_walk(path, &dir, perm, true);
7461 if (r < 0)
7462 return r;
7463 if (cct->_conf->client_permissions) {
7464 if (S_ISDIR(in->mode)) {
7465 r = -CEPHFS_EPERM;
7466 return r;
7467 }
7468 r = may_hardlink(in.get(), perm);
7469 if (r < 0)
7470 return r;
7471 r = may_create(dir.get(), perm);
7472 if (r < 0)
7473 return r;
7474 }
7475 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7476 return r;
7477 }
7478
// POSIX unlink(2): remove relpath, resolved relative to the current
// working directory; delegates to unlinkat() with no flags.
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
}
7483
7484 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7485 {
7486 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7487 if (!mref_reader.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN;
7489 }
7490
7491 tout(cct) << __func__ << std::endl;
7492 tout(cct) << dirfd << std::endl;
7493 tout(cct) << relpath << std::endl;
7494 tout(cct) << flags << std::endl;
7495
7496 if (std::string(relpath) == "/") {
7497 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7498 }
7499
7500 filepath path(relpath);
7501 string name = path.last_dentry();
7502 path.pop_dentry();
7503 InodeRef dir;
7504
7505 std::scoped_lock lock(client_lock);
7506
7507 InodeRef dirinode;
7508 int r = get_fd_inode(dirfd, &dirinode);
7509 if (r < 0) {
7510 return r;
7511 }
7512
7513 r = path_walk(path, &dir, perm, true, 0, dirinode);
7514 if (r < 0) {
7515 return r;
7516 }
7517 if (cct->_conf->client_permissions) {
7518 r = may_delete(dir.get(), name.c_str(), perm);
7519 if (r < 0) {
7520 return r;
7521 }
7522 }
7523 if (flags & AT_REMOVEDIR) {
7524 r = _rmdir(dir.get(), name.c_str(), perm);
7525 } else {
7526 r = _unlink(dir.get(), name.c_str(), perm);
7527 }
7528 return r;
7529 }
7530
7531 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7532 {
7533 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7534 if (!mref_reader.is_state_satisfied())
7535 return -CEPHFS_ENOTCONN;
7536
7537 tout(cct) << __func__ << std::endl;
7538 tout(cct) << relfrom << std::endl;
7539 tout(cct) << relto << std::endl;
7540
7541 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7542 return -CEPHFS_EBUSY;
7543
7544 filepath from(relfrom);
7545 filepath to(relto);
7546 string fromname = from.last_dentry();
7547 from.pop_dentry();
7548 string toname = to.last_dentry();
7549 to.pop_dentry();
7550
7551 InodeRef fromdir, todir;
7552
7553 std::scoped_lock lock(client_lock);
7554 int r = path_walk(from, &fromdir, perm);
7555 if (r < 0)
7556 goto out;
7557 r = path_walk(to, &todir, perm);
7558 if (r < 0)
7559 goto out;
7560
7561 if (cct->_conf->client_permissions) {
7562 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7563 if (r < 0)
7564 return r;
7565 r = may_delete(todir.get(), toname.c_str(), perm);
7566 if (r < 0 && r != -CEPHFS_ENOENT)
7567 return r;
7568 }
7569 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7570 out:
7571 return r;
7572 }
7573
7574 // dirs
7575
7576 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7577 {
7578 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7579 }
7580
7581 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7582 std::string alternate_name)
7583 {
7584 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7585 if (!mref_reader.is_state_satisfied())
7586 return -CEPHFS_ENOTCONN;
7587
7588 tout(cct) << __func__ << std::endl;
7589 tout(cct) << dirfd << std::endl;
7590 tout(cct) << relpath << std::endl;
7591 tout(cct) << mode << std::endl;
7592 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7593
7594 if (std::string(relpath) == "/") {
7595 return -CEPHFS_EEXIST;
7596 }
7597
7598 filepath path(relpath);
7599 string name = path.last_dentry();
7600 path.pop_dentry();
7601 InodeRef dir;
7602
7603 std::scoped_lock lock(client_lock);
7604
7605 InodeRef dirinode;
7606 int r = get_fd_inode(dirfd, &dirinode);
7607 if (r < 0) {
7608 return r;
7609 }
7610
7611 r = path_walk(path, &dir, perm, true, 0, dirinode);
7612 if (r < 0) {
7613 return r;
7614 }
7615 if (cct->_conf->client_permissions) {
7616 r = may_create(dir.get(), perm);
7617 if (r < 0) {
7618 return r;
7619 }
7620 }
7621 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7622 }
7623
/* mkdir -p: walk as far down relpath as already exists, then create the
 * remaining components one level at a time. Returns 0 on success or a
 * negative CEPHFS_* error. A concurrent creator racing us at an
 * intermediate level is tolerated (EEXIST there falls back to lookup). */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only ENOENT means "start creating here"; anything else is fatal
  // (and r == 0 means the whole path already exists)
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      // someone else created this intermediate level; look it up instead
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7680
// POSIX rmdir(2): remove an empty directory, resolved relative to the
// current working directory; delegates to unlinkat(AT_REMOVEDIR).
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
}
7685
7686 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7687 {
7688 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7689 if (!mref_reader.is_state_satisfied())
7690 return -CEPHFS_ENOTCONN;
7691
7692 tout(cct) << __func__ << std::endl;
7693 tout(cct) << relpath << std::endl;
7694 tout(cct) << mode << std::endl;
7695 tout(cct) << rdev << std::endl;
7696
7697 if (std::string(relpath) == "/")
7698 return -CEPHFS_EEXIST;
7699
7700 filepath path(relpath);
7701 string name = path.last_dentry();
7702 path.pop_dentry();
7703 InodeRef dir;
7704
7705 std::scoped_lock lock(client_lock);
7706 int r = path_walk(path, &dir, perms);
7707 if (r < 0)
7708 return r;
7709 if (cct->_conf->client_permissions) {
7710 int r = may_create(dir.get(), perms);
7711 if (r < 0)
7712 return r;
7713 }
7714 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7715 }
7716
7717 // symlinks
7718
7719 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7720 {
7721 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7722 }
7723
7724 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7725 std::string alternate_name)
7726 {
7727 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7728 if (!mref_reader.is_state_satisfied()) {
7729 return -CEPHFS_ENOTCONN;
7730 }
7731
7732 tout(cct) << __func__ << std::endl;
7733 tout(cct) << target << std::endl;
7734 tout(cct) << dirfd << std::endl;
7735 tout(cct) << relpath << std::endl;
7736
7737 if (std::string(relpath) == "/") {
7738 return -CEPHFS_EEXIST;
7739 }
7740
7741 filepath path(relpath);
7742 string name = path.last_dentry();
7743 path.pop_dentry();
7744 InodeRef dir;
7745
7746 std::scoped_lock lock(client_lock);
7747
7748 InodeRef dirinode;
7749 int r = get_fd_inode(dirfd, &dirinode);
7750 if (r < 0) {
7751 return r;
7752 }
7753 r = path_walk(path, &dir, perms, true, 0, dirinode);
7754 if (r < 0) {
7755 return r;
7756 }
7757 if (cct->_conf->client_permissions) {
7758 int r = may_create(dir.get(), perms);
7759 if (r < 0) {
7760 return r;
7761 }
7762 }
7763 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7764 }
7765
// POSIX readlink(2): read the target of a symlink, resolved relative to
// the current working directory; delegates to readlinkat().
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
}
7770
7771 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7772 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7773 if (!mref_reader.is_state_satisfied()) {
7774 return -CEPHFS_ENOTCONN;
7775 }
7776
7777 tout(cct) << __func__ << std::endl;
7778 tout(cct) << dirfd << std::endl;
7779 tout(cct) << relpath << std::endl;
7780
7781 InodeRef dirinode;
7782 std::scoped_lock lock(client_lock);
7783 int r = get_fd_inode(dirfd, &dirinode);
7784 if (r < 0) {
7785 return r;
7786 }
7787
7788 InodeRef in;
7789 filepath path(relpath);
7790 r = path_walk(path, &in, perms, false, 0, dirinode);
7791 if (r < 0) {
7792 return r;
7793 }
7794
7795 return _readlink(in.get(), buf, size);
7796 }
7797
7798 int Client::_readlink(Inode *in, char *buf, size_t size)
7799 {
7800 if (!in->is_symlink())
7801 return -CEPHFS_EINVAL;
7802
7803 // copy into buf (at most size bytes)
7804 int r = in->symlink.length();
7805 if (r > (int)size)
7806 r = size;
7807 memcpy(buf, in->symlink.c_str(), r);
7808 return r;
7809 }
7810
7811
7812 // inode stuff
7813
7814 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7815 {
7816 bool yes = in->caps_issued_mask(mask, true);
7817
7818 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7819 if (yes && !force)
7820 return 0;
7821
7822 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7823 filepath path;
7824 in->make_nosnap_relative_path(path);
7825 req->set_filepath(path);
7826 req->set_inode(in);
7827 req->head.args.getattr.mask = mask;
7828
7829 int res = make_request(req, perms);
7830 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7831 return res;
7832 }
7833
/* Fetch a virtual xattr value via the MDS GETVXATTR op. Mirrors
 * getxattr(2) semantics: returns the value length (size == 0 just probes
 * the length), -CEPHFS_ERANGE when the buffer is too small, and
 * -CEPHFS_ENODATA for an invalid name or an MDS without the op. */
int Client::_getvxattr(
  Inode *in,
  const UserPerm& perms,
  const char *xattr_name,
  ssize_t size,
  void *value,
  mds_rank_t rank)
{
  // names must be non-empty and at most 255 bytes
  if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
    return -CEPHFS_ENODATA;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->set_string2(xattr_name);

  bufferlist bl;
  // gate on CEPHFS_FEATURE_OP_GETVXATTR so older MDSes fail cleanly
  int res = make_request(req, perms, nullptr, nullptr, rank, &bl,
                         CEPHFS_FEATURE_OP_GETVXATTR);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;

  if (res < 0) {
    // map "operation not supported" to the getxattr-style ENODATA
    if (res == -CEPHFS_EOPNOTSUPP) {
      return -CEPHFS_ENODATA;
    }
    return res;
  }

  // decode the single string payload from the reply
  std::string buf;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  decode(buf, p);
  DECODE_FINISH(p);

  ssize_t len = buf.length();

  res = len; // refer to man getxattr(2) for output buffer size == 0

  if (size > 0) {
    if (len > size) {
      res = -CEPHFS_ERANGE; // insufficient output buffer space
    } else {
      memcpy(value, buf.c_str(), len);
    }
  }
  return res;
}
7885
/* Apply the setattr fields selected by mask from stx to in. Each
 * attribute is applied locally when the client holds the relevant
 * exclusive cap (marking caps dirty for later writeback); anything left
 * in mask afterwards is sent to the MDS as a SETATTR request. aux
 * carries the raw fscrypt_auth/fscrypt_file payloads. Returns 0 (all
 * changes applied locally), the MDS request result, or a negative
 * CEPHFS_* error from the up-front checks. */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp,
			std::vector<uint8_t>* aux)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;
  bool kill_sguid = false;
  int inode_drop = 0;    // caps to drop when the request goes to the MDS
  size_t auxsize = 0;

  if (aux)
    auxsize = aux->size();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << " aux size " << auxsize << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // reject size increases that would blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  // Can't set fscrypt_auth and file at the same time!
  if ((mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)) ==
      (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE))
    return -CEPHFS_EINVAL;

  // fscrypt changes require the aux payload
  if (!aux && (mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)))
    return -CEPHFS_EINVAL;

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // setuid/setgid stripping can only be done locally with Ax caps
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    kill_sguid = !!(mask & CEPH_SETATTR_KILL_SGUID);
  }

  // For each attribute below the pattern is: apply locally and mark the
  // exclusive cap dirty when we hold it; otherwise queue the value for
  // the MDS request (dropping shared caps) unless nothing would change.
  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->uid != stx->stx_uid) {
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; preserve the file type bits
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) && S_ISREG(in->mode)) {
    // no explicit chmod: strip setuid/setgid locally where requested
    if (kill_sguid && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      in->mode &= ~(S_ISUID|S_ISGID);
    } else {
      if (mask & CEPH_SETATTR_KILL_SUID) {
	in->mode &= ~S_ISUID;
      }
      if (mask & CEPH_SETATTR_KILL_SGID) {
	in->mode &= ~S_ISGID;
      }
    }
    mask &= ~(CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    ldout(cct,10) << "resetting cached fscrypt_auth field. size now "
		  << in->fscrypt_auth.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_auth = *aux;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->fscrypt_auth != *aux) {
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // growing (or keeping) the size with Fx caps and no sguid-kill needed
    // can be done locally; shrinking must go through the MDS (truncate)
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
	!(mask & CEPH_SETATTR_KILL_SGUID) &&
	stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
	in->size = in->reported_size = stx->stx_size;
	in->cap_dirtier_uid = perms.uid();
	in->cap_dirtier_gid = perms.gid();
	in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
	mask &= ~(CEPH_SETATTR_SIZE);
	mask |= CEPH_SETATTR_MTIME;
      } else {
	// ignore it when size doesn't change
	mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    ldout(cct,10) << "resetting cached fscrypt_file field. size now "
		  << in->fscrypt_file.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_file = *aux;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->fscrypt_file != *aux) {
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_mtime) > in->mtime) {
      // with Fw caps a forward-moving mtime may be applied locally
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_atime) > in->atime) {
      // with Fw caps a forward-moving atime may be applied locally
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  // everything handled locally: bump change_attr, refresh the cached
  // snapdir inode's attrs if present, and skip the MDS round trip
  if (!mask) {
    in->change_attr++;
    if (in->is_dir() && in->snapid == CEPH_NOSNAP) {
      vinodeno_t vino(in->ino, CEPH_SNAPDIR);
      if (inode_map.count(vino)) {
	refresh_snapdir_attrs(inode_map[vino], in);
      }
    }
    return 0;
  }

  // send the remaining attribute changes to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    req->fscrypt_auth = *aux;
  } else if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    req->fscrypt_file = *aux;
  }
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
8207
/* Note that we only care about attrs that setattr cares about */
/*
 * Copy the setattr-relevant fields (size, mode, uid, gid, mtime, atime)
 * from a struct stat into a struct ceph_statx.  Other statx fields are
 * left untouched; callers are expected to pass a mask restricting which
 * fields are actually consumed.
 */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  /* macOS spells the timespec members st_*timespec */
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#elif __WIN32
  /* NOTE(review): `__WIN32` (no trailing underscores) is unusual; the
   * conventional macro is `_WIN32` — confirm this branch is actually
   * selected on Windows builds. Windows' struct stat only carries
   * whole-second timestamps, so nanoseconds are zeroed. */
  stx->stx_mtime.tv_sec = st->st_mtime;
  stx->stx_mtime.tv_nsec = 0;
  stx->stx_atime.tv_sec = st->st_atime;
  stx->stx_atime.tv_nsec = 0;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
8228
/*
 * Inner setattr worker: applies the statx-style attribute change to *in.
 *
 * Truncation (CEPH_SETATTR_SIZE) additionally clears setuid/setgid bits
 * as required by POSIX; a mode change re-runs the POSIX ACL chmod logic
 * after the attribute update succeeds.
 *
 * @param in   inode to modify
 * @param stx  new attribute values (fields selected by mask)
 * @param mask CEPH_SETATTR_* bits to apply
 * @param inp  optional out-param for the resulting inode ref
 * @return 0 on success, negative CEPHFS error code otherwise
 */
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
		       const UserPerm& perms, InodeRef *inp)
{
  if (mask & CEPH_SETATTR_SIZE) {
    /* truncating must drop suid/sgid; clear_suid_sgid reports which
     * extra mask bits the follow-up setattr needs */
    mask |= clear_suid_sgid(in, perms, true);
  }

  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
8243
/*
 * Public-facing statx-style setattr entry point (lock already held by
 * callers).  Restricts the mask to the attribute bits supported here,
 * optionally enforces client-side permission checks, then delegates to
 * __setattrx().
 */
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  /* drop any mask bits this path does not support (e.g. fscrypt) */
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
8258
8259 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
8260 const UserPerm& perms)
8261 {
8262 struct ceph_statx stx;
8263
8264 stat_to_statx(attr, &stx);
8265 mask &= ~CEPH_SETATTR_BTIME;
8266
8267 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
8268 mask &= ~CEPH_SETATTR_UID;
8269 }
8270 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
8271 mask &= ~CEPH_SETATTR_GID;
8272 }
8273
8274 return _setattrx(in, &stx, mask, perms);
8275 }
8276
/*
 * Path-based setattr (follows symlinks).  Fails with ENOTCONN unless the
 * client is at least mounting; walks the path under client_lock and then
 * applies the change via _setattr().
 */
int Client::setattr(const char *relpath, struct stat *attr, int mask,
		    const UserPerm& perms)
{
  /* refuse if the client is not mounted/mounting */
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
8297
/*
 * Path-based statx-style setattr.  AT_SYMLINK_NOFOLLOW in flags controls
 * whether a trailing symlink is followed; otherwise mirrors setattr().
 */
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
		     const UserPerm& perms, int flags)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  /* follow the final symlink unless AT_SYMLINK_NOFOLLOW was given */
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
8318
/*
 * fd-based setattr.  Rejects O_PATH file handles (Linux only) since they
 * do not permit attribute modification.
 */
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  /* O_PATH handles cannot be used for attribute changes */
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
8339
/*
 * fd-based statx-style setattr; see fsetattr() for the struct stat
 * flavour.  O_PATH file handles are rejected on Linux.
 */
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  /* O_PATH handles cannot be used for attribute changes */
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
8360
/*
 * stat(2) equivalent: walk relpath (following symlinks), refresh the
 * inode attributes covered by mask from the MDS if needed, and fill
 * *stbuf.  Optionally returns the directory fragstat via dirstat.
 *
 * @return caps-issued bits from fill_stat() on success (>= 0), or a
 *         negative CEPHFS error code.
 */
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  /* make sure the caps covering `mask` are fresh before filling */
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8388
8389 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8390 {
8391 unsigned mask = 0;
8392
8393 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8394 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
8395 goto out;
8396
8397 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8398 mask |= CEPH_CAP_PIN;
8399 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8400 mask |= CEPH_CAP_AUTH_SHARED;
8401 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8402 mask |= CEPH_CAP_LINK_SHARED;
8403 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
8404 mask |= CEPH_CAP_FILE_SHARED;
8405 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8406 mask |= CEPH_CAP_XATTR_SHARED;
8407 out:
8408 return mask;
8409 }
8410
/* statx(2) equivalent: delegate to statxat() relative to the CWD. */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
8417
/*
 * lstat(2) equivalent: like stat() but does NOT follow a trailing
 * symlink, so the link's own attributes are returned.
 */
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8446
/*
 * Populate a struct stat from the cached inode state.
 *
 * Directory nlink is synthesized (parent dentry + "." + subdirs) rather
 * than taken from the inode; directory size is either recursive bytes,
 * the snapshot count (for the .snap dir), or the entry count, depending
 * on configuration and snapid.
 *
 * @param dirstat optional out-param receiving the directory fragstat
 * @param rstat   optional out-param receiving the recursive stats
 * @return the caps currently issued for the inode
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  /* the snapid doubles as the device number so snapshots look distinct */
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      st->st_nlink = 1 /* parent dentry */
		     + 1 /* <dir>/. */
		     + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  /* ctime is reported as the later of ctime and mtime */
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes) {
      st->st_size = in->rstat.rbytes;
    } else if (in->snapid == CEPH_SNAPDIR) {
      /* the .snap directory reports the number of snapshots as its size */
      SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
      if (realm) {
	st->st_size = realm->my_snaps.size();
	put_snap_realm(realm);
      }
    } else {
      st->st_size = in->dirstat.size();
    }
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    /* 512-byte blocks, rounded up */
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8523
/*
 * Populate a struct ceph_statx from the cached inode state.
 *
 * `mask` is the set of CEPH_CAP_* shared caps whose data is known fresh;
 * only field groups covered by those caps are filled, and stx_mask is
 * updated to reflect which statx fields were actually set.  A mask of 0
 * (AT_STATX_DONT_SYNC) means fill everything from cache.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << " change_attr " << in->change_attr << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (uint64_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      /* directory nlink is synthesized, matching fill_stat() */
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
			 + 1 /* <dir>/. */
			 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes) {
	stx->stx_size = in->rstat.rbytes;
      } else if (in->snapid == CEPH_SNAPDIR) {
	/* the .snap directory reports the number of snapshots as its size */
	SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
	if (realm) {
	  stx->stx_size = realm->my_snaps.size();
	  put_snap_realm(realm);
	}
      } else {
	stx->stx_size = in->dirstat.size();
      }
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8613
/* Mark a dentry as recently used in the LRU so it is evicted last. */
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8618
/* chmod(2) equivalent: delegate to chmodat() relative to the CWD. */
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8623
/*
 * fchmod(2) equivalent: change mode through an open file handle.
 * O_PATH handles are rejected on Linux.
 */
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  /* only st_mode is read by _setattr given this mask */
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
8646
/*
 * fchmodat(2) equivalent: change mode of a path resolved relative to
 * dirfd.  AT_SYMLINK_NOFOLLOW in flags controls symlink handling.
 */
int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
		    const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  /* only st_mode is read by _setattr given this mask */
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
8678
/* lchmod equivalent: chmodat() without following a trailing symlink. */
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8683
/* chown(2) equivalent: delegate to chownat() relative to the CWD. */
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8689
/*
 * fchown(2) equivalent: change ownership through an open file handle.
 * A uid/gid of -1 means "leave unchanged" and is excluded from the mask.
 * O_PATH handles are rejected on Linux.
 */
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  /* chown(2) convention: -1 means "don't change this id" */
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
8717
/* lchown(2) equivalent: chownat() without following a trailing symlink. */
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8723
/*
 * fchownat(2) equivalent: change ownership of a path resolved relative
 * to dirfd.  Both UID and GID mask bits are always passed; _setattr()
 * strips the bit for any id given as -1 ("leave unchanged").
 */
int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
		    int flags, const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
8757
/*
 * Helper: store atime and mtime into a struct stat using the portable
 * stat_set_* accessors (the member names differ across platforms).
 */
static void attr_set_atime_and_mtime(struct stat *attr,
				     const utime_t &atime,
				     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
8767
8768 // for [l]utime() invoke the timeval variant as the timespec
8769 // variant are not yet implemented. for futime[s](), invoke
8770 // the timespec variant.
8771 int Client::utime(const char *relpath, struct utimbuf *buf,
8772 const UserPerm& perms)
8773 {
8774 struct timeval tv[2];
8775 tv[0].tv_sec = buf->actime;
8776 tv[0].tv_usec = 0;
8777 tv[1].tv_sec = buf->modtime;
8778 tv[1].tv_usec = 0;
8779
8780 return utimes(relpath, tv, perms);
8781 }
8782
8783 int Client::lutime(const char *relpath, struct utimbuf *buf,
8784 const UserPerm& perms)
8785 {
8786 struct timeval tv[2];
8787 tv[0].tv_sec = buf->actime;
8788 tv[0].tv_usec = 0;
8789 tv[1].tv_sec = buf->modtime;
8790 tv[1].tv_usec = 0;
8791
8792 return lutimes(relpath, tv, perms);
8793 }
8794
8795 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8796 {
8797 struct timespec ts[2];
8798 ts[0].tv_sec = buf->actime;
8799 ts[0].tv_nsec = 0;
8800 ts[1].tv_sec = buf->modtime;
8801 ts[1].tv_nsec = 0;
8802
8803 return futimens(fd, ts, perms);
8804 }
8805
/*
 * utimes(2) equivalent: set atime/mtime on a path (follows symlinks).
 * times[0] is atime, times[1] is mtime.
 */
int Client::utimes(const char *relpath, struct timeval times[2],
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  /* only stx_atime/stx_mtime are read by _setattrx given this mask */
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8833
/*
 * lutimes(2) equivalent: like utimes() but does NOT follow a trailing
 * symlink, so the timestamps apply to the link itself.
 */
int Client::lutimes(const char *relpath, struct timeval times[2],
		    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  /* false => don't follow the final symlink */
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8861
8862 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8863 {
8864 struct timespec ts[2];
8865 ts[0].tv_sec = times[0].tv_sec;
8866 ts[0].tv_nsec = times[0].tv_usec * 1000;
8867 ts[1].tv_sec = times[1].tv_sec;
8868 ts[1].tv_nsec = times[1].tv_usec * 1000;
8869
8870 return futimens(fd, ts, perms);
8871 }
8872
/*
 * futimens(2) equivalent: set atime/mtime through an open file handle.
 * O_PATH handles are rejected on Linux.
 */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8900
/*
 * utimensat(2) equivalent: set atime/mtime on a path resolved relative
 * to dirfd.  AT_SYMLINK_NOFOLLOW in flags controls symlink handling.
 */
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
		      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  /* NOTE(review): this tests an open(2) flag (O_PATH) against the AT_*
   * flags argument — looks suspect, but intent appears to be rejecting
   * O_PATH-style usage; confirm against kernel utimensat semantics. */
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8943
/*
 * flock(2) equivalent: apply an advisory lock operation to fd on behalf
 * of the given lock owner token.
 */
int Client::flock(int fd, int operation, uint64_t owner)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  return _flock(f, operation, owner);
}
8962
/*
 * opendir(3) equivalent: walk relpath (following symlinks), optionally
 * check open permission, and allocate a dir_result_t in *dirpp.
 */
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR)
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  return r;
}
8990
/*
 * fdopendir(3) equivalent: open a directory stream from an already-open
 * directory file descriptor.
 */
int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;

  InodeRef dirinode;
  std::scoped_lock locker(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  if (cct->_conf->client_permissions) {
    r = may_open(dirinode.get(), O_RDONLY, perms);
    if (r < 0) {
      return r;
    }
  }
  r = _opendir(dirinode.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR) {
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  }
  return r;
}
9020
/*
 * Allocate a dir_result_t for inode `in` and register it in opened_dirs.
 * Fails with ENOTDIR (leaving *dirpp untouched) if `in` is not a
 * directory.
 */
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -CEPHFS_ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
9030
9031
/* closedir(3) equivalent: tear down a dir stream under client_lock. */
int Client::closedir(dir_result_t *dir)
{
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
9042
/*
 * Internal dir-stream teardown: release the inode ref, drop any buffered
 * readdir results, unregister from opened_dirs and free the handle.
 */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
9055
/*
 * rewinddir(3) equivalent: discard the buffered frag and reset the
 * stream position to the beginning.  Silently a no-op if not mounted.
 */
void Client::rewinddir(dir_result_t *dirp)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
9069
/* telldir(3) equivalent: report the current stream offset. */
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
9076
/*
 * seekdir(3) equivalent: reposition the stream.  Seeking invalidates the
 * shared-readdir-cache bookkeeping (forward seeks clear release_count,
 * backward seeks clear ordered_count) and may force the buffered frag to
 * be dropped and refetched if the target offset lies outside it.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    /* in hash order only a backward seek invalidates the buffer */
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    /* drop buffer when seeking to start, a different frag, or backward
     * within the current frag */
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
9111
9112
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
/*
 * Fill a struct dirent from a readdir entry.  The name is truncated to
 * 255 bytes and always NUL-terminated; d_ino/d_off/d_type are populated
 * only on platforms whose dirent carries them.
 */
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  /* cap at 255 chars and guarantee termination (strncpy does not) */
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  /* convert S_IF* file type to the DT_* dirent encoding */
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
9135
/*
 * Advance the dir stream past the currently-buffered fragment.  Marks
 * the stream complete when the rightmost frag is exhausted; otherwise
 * moves to the next frag, preserving last_name in hash order (offsets
 * are monotonic there) and resetting it in frag order.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    /* frag tree may have changed; re-map to the authoritative frag */
    _readdir_rechoose_frag(dirp);
  }
}
9161
/*
 * Re-map the stream's current fragment through the inode's (possibly
 * updated) dirfragtree, restarting at the mapped frag if it changed.
 * No-op in hash order, where offsets are frag-independent.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
9178
/* Discard the dir stream's buffered readdir entries. */
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
9184
/*
 * Fetch one directory fragment from the MDS into dirp->buffer.
 *
 * The frag is derived from the stream offset (via the dirfragtree in
 * hash order).  The request itself is built by fill_req_cb so readdir
 * and readdir-snapdiff can share this path.  On EAGAIN (frag moved) the
 * frag is re-chosen and the fetch retried recursively; on any other
 * error the stream is marked at end.
 *
 * @return 0 on success, negative error code otherwise
 */
int Client::_readdir_get_frag(int op, dir_result_t* dirp,
			      fill_readdir_args_cb_t fill_req_cb)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  fill_req_cb(dirp, req, diri, fg);

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    /* frag split/merged underneath us: re-map and retry */
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(op, dirp, fill_req_cb);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
9225
// Comparator used with std::lower_bound over Dir::readdir_cache: orders
// cached dentries by readdir offset via dir_result_t::fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
9231
// Serve readdir entries from the locally cached Dir (dirp->inode->dir).
// Valid only while the directory inode stays complete-and-ordered; the
// moment that (or the cache layout under us) changes we bail with
// -CEPHFS_EAGAIN so the caller can fall back to MDS round-trips.
//
// Returns 0 at end-of-directory, a positive value propagated from the
// callback (stop requested), -CEPHFS_EAGAIN on cache invalidation, or
// another negative error.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name
	   << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume from where the previous call left off (cache is offset-sorted)
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // if the dir lost complete+ordered state mid-scan, the cache is stale
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative (null) dentry; nothing to report
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry was cached under an older shared-cap generation; skip it
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // remember position by index: _getattr() below may reallocate the cache
    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // hand the callback a referenced inode; caller releases it
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop client_lock around the user callback to avoid re-entrancy/deadlock
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // only commit progress after the callback accepted the entry
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
9326
9327 int Client::readdir_r_cb(dir_result_t* d,
9328 add_dirent_cb_t cb,
9329 void* p,
9330 unsigned want,
9331 unsigned flags,
9332 bool getref)
9333 {
9334 auto fill_readdir_cb = [](dir_result_t* dirp,
9335 MetaRequest* req,
9336 InodeRef& diri,
9337 frag_t fg) {
9338 filepath path;
9339 diri->make_nosnap_relative_path(path);
9340 req->set_filepath(path);
9341 req->set_inode(diri.get());
9342 req->head.args.readdir.frag = fg;
9343 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
9344 if (dirp->last_name.length()) {
9345 req->path2.set_path(dirp->last_name);
9346 } else if (dirp->hash_order()) {
9347 req->head.args.readdir.offset_hash = dirp->offset_high();
9348 }
9349 req->dirp = dirp;
9350 };
9351 int op = CEPH_MDS_OP_READDIR;
9352 if (d->inode && d->inode->snapid == CEPH_SNAPDIR)
9353 op = CEPH_MDS_OP_LSSNAP;
9354 return _readdir_r_cb(op,
9355 d,
9356 cb,
9357 fill_readdir_cb,
9358 p,
9359 want,
9360 flags,
9361 getref,
9362 false);
9363 }
9364
9365 //
9366 // NB: this is used for both readdir and readdir_snapdiff results processing
9367 // hence it should be request type agnostic
9368 //
// Core readdir engine shared by readdir_r_cb() and readdir_snapdiff().
//
// Emits directory entries one at a time through `cb`:
//   1. synthesizes "." and ".." at offsets 0 and 1,
//   2. serves from the local dentry cache when allowed and valid,
//   3. otherwise pulls frags from the MDS via `fill_cb`-built requests.
//
// op           - MDS op to issue when fetching frags
// d            - directory iterator (position state lives here)
// cb           - per-entry callback; <0 aborts, >0 stops after the entry
// fill_cb      - fills the op-specific MetaRequest fields
// want/flags   - statx field selection, mapped to caps via statx_to_mask()
// getref       - if true, pass a referenced Inode* to the callback
// bypass_cache - if true, never serve from (or mark) the dentry cache
//
// Returns 0 at end-of-directory, a positive callback value, or a negative
// error.
int Client::_readdir_r_cb(int op,
			  dir_result_t *d,
			  add_dirent_cb_t cb,
			  fill_readdir_args_cb_t fill_cb,
			  void *p,
			  unsigned want,
			  unsigned flags,
			  bool getref,
			  bool bypass_cache)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize the "." entry from the directory inode itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock around the user callback
    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." from the first parent (or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << __func__
		 << " offset " << hex << dirp->offset << dec
		 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
		 << dirp->inode->is_complete_and_ordered()
		 << " issued " << ccap_string(dirp->inode->caps_issued())
		 << dendl;
  if (!bypass_cache &&
      dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // EAGAIN means the cache became unusable mid-scan: fall through to MDS
    if (err != -CEPHFS_EAGAIN)
      return err;
  }

  // MDS-backed path: fetch frags and replay buffered entries to the callback
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(op, dirp, fill_cb);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;   // reply just refreshed the inodes; skip _getattr
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << __func__
		   << " frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << __func__
		     << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " snap " << entry.inode->snapid
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // the frag didn't fit in one reply; fetch its next chunk
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // finished the rightmost frag: if nothing changed under us during the
    // scan, we can mark the directory complete (and possibly ordered) so
    // future readdirs are served from cache
    if (!bypass_cache &&
	diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9579
9580
9581 int Client::readdir_r(dir_result_t *d, struct dirent *de)
9582 {
9583 return readdirplus_r(d, de, 0, 0, 0, NULL);
9584 }
9585
9586 /*
9587 * readdirplus_r
9588 *
9589 * returns
9590 * 1 if we got a dirent
9591 * 0 for end of directory
9592 * <0 on error
9593 */
9594
// Callback state used to extract exactly one entry from the readdir engine.
struct single_readdir {
  struct dirent *de;      // caller-owned destination for the dirent
  struct ceph_statx *stx; // optional statx destination; may be NULL
  Inode *inode;           // inode pointer handed to the callback
  bool full;              // set once an entry has been stored
};
9601
9602 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9603 struct ceph_statx *stx, off_t off,
9604 Inode *in)
9605 {
9606 single_readdir *c = static_cast<single_readdir *>(p);
9607
9608 if (c->full)
9609 return -1; // already filled this dirent
9610
9611 *c->de = *de;
9612 if (c->stx)
9613 *c->stx = *stx;
9614 c->inode = in;
9615 c->full = true;
9616 return 1;
9617 }
9618
9619 struct dirent *Client::readdir(dir_result_t *d)
9620 {
9621 int ret;
9622 auto& de = d->de;
9623 single_readdir sr;
9624 sr.de = &de;
9625 sr.stx = NULL;
9626 sr.inode = NULL;
9627 sr.full = false;
9628
9629 // our callback fills the dirent and sets sr.full=true on first
9630 // call, and returns -1 the second time around.
9631 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9632 if (ret < -1) {
9633 errno = -ret; // this sucks.
9634 return (dirent *) NULL;
9635 }
9636 if (sr.full) {
9637 return &de;
9638 }
9639 return (dirent *) NULL;
9640 }
9641
9642 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9643 struct ceph_statx *stx, unsigned want,
9644 unsigned flags, Inode **out)
9645 {
9646 single_readdir sr;
9647 sr.de = de;
9648 sr.stx = stx;
9649 sr.inode = NULL;
9650 sr.full = false;
9651
9652 // our callback fills the dirent and sets sr.full=true on first
9653 // call, and returns -1 the second time around.
9654 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9655 if (r < -1)
9656 return r;
9657 if (out)
9658 *out = sr.inode;
9659 if (sr.full)
9660 return 1;
9661 return 0;
9662 }
9663
// Return one entry of the difference between the snapshot that d1 iterates
// and snapshot snap2, via the READDIR_SNAPDIFF MDS op.
//
// d1       - open iterator on one snapshot of the directory
// snap2    - the other snapshot id (must differ from d1's)
// out_de   - optional dirent output
// out_snap - optional output: the entry's snap id (conveyed in stx.stx_dev)
//
// Returns 1 if an entry was produced, 0 at end, <0 on error (errno set).
int Client::readdir_snapdiff(dir_result_t* d1, snapid_t snap2,
			     struct dirent* out_de,
			     snapid_t* out_snap)
{
  if (!d1 || !d1->inode || d1->inode->snapid == snap2) {
    lderr(cct) << __func__ << " invalid parameters: "
	       << " d1:" << d1
	       << " d1->inode:" << (d1 ? d1->inode : nullptr)
	       << " snap2 id :" << snap2
	       << dendl;
    errno = EINVAL;
    return -errno;
  }

  auto& de = d1->de;
  ceph_statx stx;
  single_readdir sr;
  sr.de = &de;
  sr.stx = &stx;
  sr.inode = NULL;
  sr.full = false;

  // builds the MDS request for one frag of the snapdiff listing; captures
  // snap2 by reference, so it must not outlive this function
  auto fill_snapdiff_cb = [&](dir_result_t* dirp,
		       MetaRequest* req,
		       InodeRef& diri,
		       frag_t fg) {
    filepath path;
    diri->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->set_inode(diri.get());
    req->head.args.snapdiff.snap_other = snap2;
    req->head.args.snapdiff.frag = fg;
    req->head.args.snapdiff.flags = CEPH_READDIR_REPLY_BITFLAGS;
    if (dirp->last_name.length()) {
      // resume after the last entry already returned
      req->path2.set_path(dirp->last_name);
    } else if (dirp->hash_order()) {
      req->head.args.snapdiff.offset_hash = dirp->offset_high();
    }
    req->dirp = dirp;
  };

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  // bypass_cache=true: snapdiff results must always come from the MDS
  int ret = _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF,
			  d1,
			  _readdir_single_dirent_cb,
			  fill_snapdiff_cb,
			  (void*)&sr,
			  0,
			  AT_STATX_DONT_SYNC,
			  false,
			  true);
  if (ret < -1) {
    lderr(cct) << __func__ << " error: "
	       << cpp_strerror(ret)
	       << dendl;
    errno = -ret; // this sucks.
    return ret;
  }

  ldout(cct, 15) << __func__ << " " << ret
		 << " " << sr.de->d_name
		 << " " << stx.stx_dev
		 << dendl;
  if (sr.full) {
    if (out_de) {
      *out_de = de;
    }
    if (out_snap) {
      // the MDS reports the entry's snap id through stx_dev
      *out_snap = stx.stx_dev;
    }
    return 1;
  }
  return 0;
}
9739
9740 /* getdents */
/* getdents */
struct getdents_result {
  char *buf;    // destination buffer supplied by the caller
  int buflen;   // total capacity of buf in bytes
  int pos;      // number of bytes packed so far
  bool fullent; // true: pack whole struct dirent; false: just the names
};
9747
9748 static int _readdir_getdent_cb(void *p, struct dirent *de,
9749 struct ceph_statx *stx, off_t off, Inode *in)
9750 {
9751 struct getdents_result *c = static_cast<getdents_result *>(p);
9752
9753 int dlen;
9754 if (c->fullent)
9755 dlen = sizeof(*de);
9756 else
9757 dlen = strlen(de->d_name) + 1;
9758
9759 if (c->pos + dlen > c->buflen)
9760 return -1; // doesn't fit
9761
9762 if (c->fullent) {
9763 memcpy(c->buf + c->pos, de, sizeof(*de));
9764 } else {
9765 memcpy(c->buf + c->pos, de->d_name, dlen);
9766 }
9767 c->pos += dlen;
9768 return 0;
9769 }
9770
9771 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9772 {
9773 getdents_result gr;
9774 gr.buf = buf;
9775 gr.buflen = buflen;
9776 gr.fullent = fullent;
9777 gr.pos = 0;
9778
9779 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9780
9781 if (r < 0) { // some error
9782 if (r == -1) { // buffer ran out of space
9783 if (gr.pos) { // but we got some entries already!
9784 return gr.pos;
9785 } // or we need a larger buffer
9786 return -CEPHFS_ERANGE;
9787 } else { // actual error, return it
9788 return r;
9789 }
9790 }
9791 return gr.pos;
9792 }
9793
9794
9795 /* getdir */
struct getdir_result {
  list<string> *contents; // caller-owned list that receives entry names
  int num;                // count of entries appended
};
9800
9801 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9802 {
9803 getdir_result *r = static_cast<getdir_result *>(p);
9804
9805 r->contents->push_back(de->d_name);
9806 r->num++;
9807 return 0;
9808 }
9809
9810 int Client::getdir(const char *relpath, list<string>& contents,
9811 const UserPerm& perms)
9812 {
9813 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9814 tout(cct) << "getdir" << std::endl;
9815 tout(cct) << relpath << std::endl;
9816
9817 dir_result_t *d;
9818 int r = opendir(relpath, &d, perms);
9819 if (r < 0)
9820 return r;
9821
9822 getdir_result gr;
9823 gr.contents = &contents;
9824 gr.num = 0;
9825 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9826
9827 closedir(d);
9828
9829 if (r < 0)
9830 return r;
9831 return gr.num;
9832 }
9833
9834
9835 /****** file i/o **********/
9836
// Common parts for open() and openat(): resolve (and, with O_CREAT,
// possibly create) the target, open it, and allocate an integer fd.
// Call with client_lock locked. Returns the new fd (>= 0) or a negative
// CEPHFS_* error.
int Client::create_and_open(int dirfd, const char *relpath, int flags,
                            const UserPerm& perms, mode_t mode, int stripe_unit,
                            int stripe_count, int object_size, const char *data_pool,
                            std::string alternate_name) {
  ceph_assert(ceph_mutex_is_locked(client_lock));
  int cflags = ceph_flags_sys2wire(flags);
  tout(cct) << cflags << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  // resolve dirfd (CEPHFS_AT_FDCWD or a real fd) to its inode
  InodeRef dirinode = nullptr;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, followsym, mask, dirinode);
  // O_CREAT|O_EXCL demands that the target did NOT already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -CEPHFS_ELOOP;

  // target missing and O_CREAT given: create it under its parent directory
  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
    if (r < 0) {
      goto out;
    }
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms,
                std::move(alternate_name));
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  // _create() may already have produced an open handle; otherwise open now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

  out:
  return r;
}
9924
// open() is openat() anchored at the client's current working directory.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool, std::string alternate_name)
{
  return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
                stripe_count, object_size, data_pool, alternate_name);
}
9932
9933 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9934 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9935 const char *data_pool, std::string alternate_name) {
9936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9937 if (!mref_reader.is_state_satisfied()) {
9938 return -CEPHFS_ENOTCONN;
9939 }
9940
9941 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9942 tout(cct) << dirfd << std::endl;
9943 tout(cct) << relpath << std::endl;
9944 tout(cct) << flags << std::endl;
9945 tout(cct) << mode << std::endl;
9946
9947 std::scoped_lock locker(client_lock);
9948 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9949 object_size, data_pool, alternate_name);
9950
9951 tout(cct) << r << std::endl;
9952 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9953 return r;
9954 }
9955
9956 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9957 const UserPerm& perms)
9958 {
9959 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9960
9961 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9962 if (!mref_reader.is_state_satisfied())
9963 return -CEPHFS_ENOTCONN;
9964
9965 std::scoped_lock lock(client_lock);
9966 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9967 filepath path(ino);
9968 req->set_filepath(path);
9969
9970 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9971 char f[30];
9972 sprintf(f, "%u", h);
9973 filepath path2(dirino);
9974 path2.push_dentry(string(f));
9975 req->set_filepath2(path2);
9976
9977 int r = make_request(req, perms, NULL, NULL,
9978 rand() % mdsmap->get_num_in_mds());
9979 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9980 return r;
9981 }
9982
9983
9984 /**
9985 * Load inode into local cache.
9986 *
9987 * If inode pointer is non-NULL, and take a reference on
9988 * the resulting Inode object in one operation, so that caller
9989 * can safely assume inode will still be there after return.
9990 */
// Resolve a vinodeno (ino + snapid) to a cached Inode via a LOOKUPINO
// request. On success with a non-NULL `inode`, a referenced Inode* is
// returned (release with _ll_put/ll_put machinery).
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // reserved/synthetic vinos never exist on the MDS
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // the reply populated inode_map; hand back a referenced pointer
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
10024
10025 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
10026 {
10027 vinodeno_t vino(ino, CEPH_NOSNAP);
10028 std::scoped_lock lock(client_lock);
10029 return _lookup_vino(vino, perms, inode);
10030 }
10031
10032 /**
10033 * Find the parent inode of `ino` and insert it into
10034 * our cache. Conditionally also set `parent` to a referenced
10035 * Inode* if caller provides non-NULL value.
10036 */
10037 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
10038 {
10039 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
10040
10041 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
10042 filepath path(ino->ino);
10043 req->set_filepath(path);
10044
10045 InodeRef target;
10046 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
10047 // Give caller a reference to the parent ino if they provided a pointer.
10048 if (parent != NULL) {
10049 if (r == 0) {
10050 *parent = target.get();
10051 _ll_get(*parent);
10052 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
10053 } else {
10054 *parent = NULL;
10055 }
10056 }
10057 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
10058 return r;
10059 }
10060
10061 /**
10062 * Populate the parent dentry for `ino`, provided it is
10063 * a child of `parent`.
10064 */
10065 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
10066 {
10067 ceph_assert(parent->is_dir());
10068 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
10069
10070 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10071 if (!mref_reader.is_state_satisfied())
10072 return -CEPHFS_ENOTCONN;
10073
10074 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10075 req->set_filepath2(filepath(parent->ino));
10076 req->set_filepath(filepath(ino->ino));
10077 req->set_inode(ino);
10078
10079 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
10080 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
10081 return r;
10082 }
10083
// Public wrapper: take client_lock, then run the LOOKUPNAME request.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::scoped_lock lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
10089
// Build a new file handle (Fh) for an already-opened inode and configure
// its readahead window from the client_readahead_* options and the file
// layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // opens of snapshot inodes are tracked via snap_cap_refs instead of
    // regular open refs (see _release_fh)
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	    << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // cap readahead by the stricter of the byte and stripe-period limits
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the layout's period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
10121
// Tear down a file handle: drop the open ref (flushing dirty data and
// re-evaluating caps for head inodes), release file locks, and report any
// asynchronous write error recorded on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // this handle can no longer hold a delegation
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref for this mode: flush dirty data and update wanted caps
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode opens are counted separately (see _create_fh)
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
10156
10157 void Client::_put_fh(Fh *f)
10158 {
10159 int left = f->put();
10160 if (!left) {
10161 delete f;
10162 }
10163 }
10164
// Open an inode: take an open ref for the cap mode, and either satisfy the
// open from already-issued caps or send a CEPH_MDS_OP_OPEN to the MDS.
// On success, *fhp (if non-NULL) receives a new Fh. Returns 0 or a
// negative CEPHFS_* error.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only; reject any write-ish open flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round-trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary Fh just to wait for the needed caps
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // open failed: undo the pending-open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
10245
// Re-assert our wanted caps on an inode. If we still hold caps (and either
// want no write caps or have an auth cap), a cap check suffices; otherwise
// replay an OPEN request with flags derived from the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map wanted RD/WR caps back onto open(2)-style flags for the OPEN op
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
10283
10284 int Client::_close(int fd)
10285 {
10286 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
10287 tout(cct) << "close" << std::endl;
10288 tout(cct) << fd << std::endl;
10289
10290 Fh *fh = get_filehandle(fd);
10291 if (!fh)
10292 return -CEPHFS_EBADF;
10293 int err = _release_fh(fh);
10294 fd_map.erase(fd);
10295 put_fd(fd);
10296 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
10297 return err;
10298 }
10299
10300 int Client::close(int fd) {
10301 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10302 if (!mref_reader.is_state_satisfied())
10303 return -CEPHFS_ENOTCONN;
10304
10305 std::scoped_lock lock(client_lock);
10306 return _close(fd);
10307 }
10308
10309 // ------------
10310 // read, write
10311
10312 loff_t Client::lseek(int fd, loff_t offset, int whence)
10313 {
10314 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10315 if (!mref_reader.is_state_satisfied())
10316 return -CEPHFS_ENOTCONN;
10317
10318 tout(cct) << "lseek" << std::endl;
10319 tout(cct) << fd << std::endl;
10320 tout(cct) << offset << std::endl;
10321 tout(cct) << whence << std::endl;
10322
10323 std::scoped_lock lock(client_lock);
10324 Fh *f = get_filehandle(fd);
10325 if (!f)
10326 return -CEPHFS_EBADF;
10327 #if defined(__linux__) && defined(O_PATH)
10328 if (f->flags & O_PATH)
10329 return -CEPHFS_EBADF;
10330 #endif
10331 return _lseek(f, offset, whence);
10332 }
10333
10334 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
10335 {
10336 Inode *in = f->inode.get();
10337 bool whence_check = false;
10338 loff_t pos = -1;
10339
10340 switch (whence) {
10341 case SEEK_END:
10342 whence_check = true;
10343 break;
10344
10345 #ifdef SEEK_DATA
10346 case SEEK_DATA:
10347 whence_check = true;
10348 break;
10349 #endif
10350
10351 #ifdef SEEK_HOLE
10352 case SEEK_HOLE:
10353 whence_check = true;
10354 break;
10355 #endif
10356 }
10357
10358 if (whence_check) {
10359 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10360 if (r < 0)
10361 return r;
10362 }
10363
10364 switch (whence) {
10365 case SEEK_SET:
10366 pos = offset;
10367 break;
10368
10369 case SEEK_CUR:
10370 pos = f->pos + offset;
10371 break;
10372
10373 case SEEK_END:
10374 pos = in->size + offset;
10375 break;
10376
10377 #ifdef SEEK_DATA
10378 case SEEK_DATA:
10379 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
10380 return -CEPHFS_ENXIO;
10381 pos = offset;
10382 break;
10383 #endif
10384
10385 #ifdef SEEK_HOLE
10386 case SEEK_HOLE:
10387 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
10388 return -CEPHFS_ENXIO;
10389 pos = in->size;
10390 break;
10391 #endif
10392
10393 default:
10394 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
10395 return -CEPHFS_EINVAL;
10396 }
10397
10398 if (pos < 0) {
10399 return -CEPHFS_EINVAL;
10400 } else {
10401 f->pos = pos;
10402 }
10403
10404 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
10405 return f->pos;
10406 }
10407
10408
// Acquire exclusive use of f->pos (the file offset) for this Fh.
// Caller must hold client_lock. If the pos lock is held, or older waiters
// are already queued, we append ourselves to the FIFO queue and sleep on a
// stack-local condvar until we are both unblocked and at the front of the
// queue. Pair every call with unlock_fh_pos().
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Adopt client_lock into a unique_lock so the condvar can release it
    // while we sleep; release() below hands ownership back to the caller
    // without unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // Wake only when the lock is free AND we are the oldest waiter,
      // preserving FIFO fairness among concurrent callers.
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
10429
10430 void Client::unlock_fh_pos(Fh *f)
10431 {
10432 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10433
10434 ldout(cct, 10) << __func__ << " " << f << dendl;
10435 f->pos_locked = false;
10436 if (!f->pos_waiters.empty()) {
10437 // only wake up the oldest waiter
10438 auto cond = f->pos_waiters.front();
10439 cond->notify_one();
10440 }
10441 }
10442
// Migrate this inode's inline data out to its first RADOS object.
// If there is no inline data, onfinish completes immediately with 0.
// Otherwise two OSD ops are issued against object "<ino>.00000000":
//   1) an (unguarded, fire-and-forget) create so the object exists, and
//   2) a guarded write that only applies if the object's stored
//      "inline_version" xattr is older than ours.
// onfinish fires when the guarded write completes; the caller is
// responsible for clearing in->inline_data / inline_version on success.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);  // create-if-absent; do not fail if it exists

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // Guard: apply the write only if our inline_version is strictly newer
  // than the version already recorded on the object.
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
10487
10488 //
10489
10490 // blocking osd interface
10491
10492 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
10493 {
10494 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10495 if (!mref_reader.is_state_satisfied())
10496 return -CEPHFS_ENOTCONN;
10497
10498 tout(cct) << "read" << std::endl;
10499 tout(cct) << fd << std::endl;
10500 tout(cct) << size << std::endl;
10501 tout(cct) << offset << std::endl;
10502
10503 std::unique_lock lock(client_lock);
10504 Fh *f = get_filehandle(fd);
10505 if (!f)
10506 return -CEPHFS_EBADF;
10507 #if defined(__linux__) && defined(O_PATH)
10508 if (f->flags & O_PATH)
10509 return -CEPHFS_EBADF;
10510 #endif
10511 bufferlist bl;
10512 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10513 size = std::min(size, (loff_t)INT_MAX);
10514 int r = _read(f, offset, size, &bl);
10515 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
10516 if (r >= 0) {
10517 lock.unlock();
10518 bl.begin().copy(bl.length(), buf);
10519 r = bl.length();
10520 }
10521 return r;
10522 }
10523
10524 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10525 {
10526 if (iovcnt < 0)
10527 return -CEPHFS_EINVAL;
10528 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10529 }
10530
// Core read path. Handles implicit-offset reads (offset < 0 means "use and
// advance f->pos" under the fh pos lock), inline data, the buffered
// (ObjectCacher) path and the synchronous path, retrying once the size has
// been re-verified if a short read suggests the file grew. Caller must hold
// client_lock. Returns bytes read into *bl, or a negative CEPHFS error.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  // Fh must have been opened readable.
  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // Implicit offset: take the pos lock and read from the current pos;
    // it stays locked until "done" below so the pos update is atomic.
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // Inline state unknown; fetch it from the MDS before deciding a path.
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    // Takes a FILE_RD cap ref on success; released at "done".
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  // O_DIRECT bypasses the cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    // Serve the read directly from the inline data blob, zero-filling the
    // region between the inline data's end and EOF.
    uint32_t len = in->inline_data.length();
    uint64_t endoff = offset + size;
    if (endoff > in->size)
      endoff = in->size;

    if (offset < len) {
      if (endoff <= len) {
        bl->substr_of(in->inline_data, offset, endoff - offset);
      } else {
        bl->substr_of(in->inline_data, offset, len - offset);
        bl->append_zero(endoff - len);
      }
      rc = endoff - offset;
    } else if ((uint64_t)offset < endoff) {
      bl->append_zero(endoff - offset);
      rc = endoff - offset;
    } else {
      rc = 0;
    }
    goto success;
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // Buffered path via the ObjectCacher.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    // Sync path; O_DIRECT must first flush any dirty cached data.
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // Short read: the file may be longer than our cached size.
      offset += rc;
      size -= rc;

      // Drop the cap ref before re-fetching attrs; "have = 0" prevents a
      // double put at "done".
      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;

  ++nr_read_request;
  update_io_stat_read(lat);

done:
  // done!
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
10664
// Completion context for an asynchronous readahead. Pins the Fh and counts
// the pending readahead; both are undone in the destructor.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();                    // hold a ref so the Fh outlives the async read
  f->readahead.inc_pending();
}
10670
// Undo the bookkeeping taken in the constructor: one pending readahead and
// one Fh reference.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10675
// Readahead completion: drop the FILE_RD|FILE_CACHE cap refs that were taken
// when the readahead was initiated (see _read_async), and account the bytes
// read (r > 0) in the read-size metrics.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10683
// Buffered (ObjectCacher-backed) read path used by _read().
// Reads up to `len` bytes at `off` into *bl, first trimming the request to
// the currently-known file size. Drops client_lock while waiting for a
// cache miss to fill. Afterwards, may kick off an asynchronous readahead.
// Returns bytes read (possibly < len) or a negative error.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: wait for the fill, releasing client_lock; hold a
    // FILE_CACHE cap ref across the wait so the cap cannot be revoked.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh; the cap refs taken below are dropped in
      // C_Readahead::finish() when the readahead completes.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        // Data already cached; the context would never fire, so delete it
        // (its destructor undoes the Fh pin / pending count).
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
10741
// Synchronous (cache-bypassing) read path used by _read().
// Loops issuing Filer reads at the current position until `len` bytes are
// accumulated or a short read is seen. On a short read within the known
// file size, the gap up to EOF is zero-filled; if the short read lands at
// our cached EOF, *checkeof is set so the caller can re-verify the size and
// retry. Returns bytes placed in *bl or a negative error.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: runs with client_lock dropped; mutates the captured read/pos/left
  // counters and appends to *bl.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
        if (left == 0)
          return 0;
      }

      // Reached our cached EOF; the caller should re-check the real size.
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    // Drop client_lock while blocking on the OSD read.
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10814
10815 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10816 {
10817 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10818 if (!mref_reader.is_state_satisfied())
10819 return -CEPHFS_ENOTCONN;
10820
10821 tout(cct) << "write" << std::endl;
10822 tout(cct) << fd << std::endl;
10823 tout(cct) << size << std::endl;
10824 tout(cct) << offset << std::endl;
10825
10826 std::scoped_lock lock(client_lock);
10827 Fh *fh = get_filehandle(fd);
10828 if (!fh)
10829 return -CEPHFS_EBADF;
10830 #if defined(__linux__) && defined(O_PATH)
10831 if (fh->flags & O_PATH)
10832 return -CEPHFS_EBADF;
10833 #endif
10834 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10835 size = std::min(size, (loff_t)INT_MAX);
10836 int r = _write(fh, offset, size, buf, NULL, false);
10837 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10838 return r;
10839 }
10840
10841 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10842 {
10843 if (iovcnt < 0)
10844 return -CEPHFS_EINVAL;
10845 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10846 }
10847
// Common implementation behind preadv/pwritev. Sums the iovec lengths,
// optionally clamps the total to INT_MAX (for APIs that return a 32-bit
// count), and dispatches to _write() or _read(). On the read side the data
// is scattered back into the caller's iovecs with client_lock dropped
// during the copy. Caller must hold client_lock.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset,
                                       bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Copy out without holding client_lock; bl is local so this is safe.
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
10898
10899 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10900 {
10901 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10902 if (!mref_reader.is_state_satisfied())
10903 return -CEPHFS_ENOTCONN;
10904
10905 tout(cct) << fd << std::endl;
10906 tout(cct) << offset << std::endl;
10907
10908 std::scoped_lock cl(client_lock);
10909 Fh *fh = get_filehandle(fd);
10910 if (!fh)
10911 return -CEPHFS_EBADF;
10912 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10913 }
10914
// Core write path. Exactly one of `buf` or `iov` supplies the data.
// Handles implicit-offset/O_APPEND positioning, quota and max-filesize
// checks, inline data (updating it in place or kicking off uninlining),
// the buffered (ObjectCacher) path and the synchronous path, then updates
// size/mtime/caps metadata. Caller must hold client_lock. Returns bytes
// written or a negative CEPHFS error.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;
  Inode *in = f->inode.get();

  // Refuse writes that would grow the file past the configured maximum.
  if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
       (uint64_t)(offset+size) > in->size ) { //exceeds filesize
    return -CEPHFS_EFBIG;
  }
  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    // fpos != 0 later signals that f->pos must be advanced on success.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // Inline state unknown; fetch it before choosing a write path.
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // Takes FILE_WR and AUTH_SHARED cap refs on success; AUTH_SHARED is
  // dropped immediately below, FILE_WR at the end of the function.
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  if (size > 0) {
    // Writing must clear setuid/setgid bits (POSIX semantics).
    r = clear_suid_sgid(in, f->actor_perms);
    if (r < 0) {
      put_cap_ref(in, CEPH_CAP_FILE_WR);
      return r;
    }
  }

  // O_DIRECT bypasses the buffer cache even if we hold buffer caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Result too big (or no buffer cap) to stay inline: start migrating
      // the inline data to RADOS; we wait for it at "done".
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Apply the write directly to the inline data blob.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // Drop client_lock while blocking on the OSD write.
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  update_write_io_size(size);
  // time
  lat = ceph_clock_now();
  lat -= start;

  ++nr_write_request;
  update_io_stat_write(lat);

  if (fpos) {
    // Implicit-offset write: advance the fd position under the pos lock.
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait for the uninline operation started above to complete.
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means another writer already uninlined the data.
    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
11149
11150 int Client::_flush(Fh *f)
11151 {
11152 Inode *in = f->inode.get();
11153 int err = f->take_async_err();
11154 if (err != 0) {
11155 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
11156 << cpp_strerror(err) << dendl;
11157 } else {
11158 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
11159 }
11160
11161 return err;
11162 }
11163
11164 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
11165 {
11166 struct ceph_statx stx;
11167 stx.stx_size = length;
11168 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
11169 }
11170
11171 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
11172 {
11173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11174 if (!mref_reader.is_state_satisfied())
11175 return -CEPHFS_ENOTCONN;
11176
11177 tout(cct) << __func__ << std::endl;
11178 tout(cct) << fd << std::endl;
11179 tout(cct) << length << std::endl;
11180
11181 std::scoped_lock lock(client_lock);
11182 Fh *f = get_filehandle(fd);
11183 if (!f)
11184 return -CEPHFS_EBADF;
11185 #if defined(__linux__) && defined(O_PATH)
11186 if (f->flags & O_PATH)
11187 return -CEPHFS_EBADF;
11188 #endif
11189 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
11190 return -CEPHFS_EBADF;
11191 struct stat attr;
11192 attr.st_size = length;
11193 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
11194 }
11195
11196 int Client::fsync(int fd, bool syncdataonly)
11197 {
11198 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11199 if (!mref_reader.is_state_satisfied())
11200 return -CEPHFS_ENOTCONN;
11201
11202 tout(cct) << "fsync" << std::endl;
11203 tout(cct) << fd << std::endl;
11204 tout(cct) << syncdataonly << std::endl;
11205
11206 std::scoped_lock lock(client_lock);
11207 Fh *f = get_filehandle(fd);
11208 if (!f)
11209 return -CEPHFS_EBADF;
11210 #if defined(__linux__) && defined(O_PATH)
11211 if (f->flags & O_PATH)
11212 return -CEPHFS_EBADF;
11213 #endif
11214 int r = _fsync(f, syncdataonly);
11215 if (r == 0) {
11216 // The IOs in this fsync were okay, but maybe something happened
11217 // in the background that we shoudl be reporting?
11218 r = f->take_async_err();
11219 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
11220 << ") = 0, async_err = " << r << dendl;
11221 } else {
11222 // Assume that an error we encountered during fsync, even reported
11223 // synchronously, would also have applied the error to the Fh, and we
11224 // should clear it here to avoid returning the same error again on next
11225 // call.
11226 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
11227 << r << dendl;
11228 f->take_async_err();
11229 }
11230 return r;
11231 }
11232
// Flush this inode's dirty data (and, unless syncdataonly, its dirty
// metadata/caps and unsafe MDS requests) and wait for everything to be
// stable. Caller must hold client_lock; it is dropped while waiting for
// the ObjectCacher flush. Returns 0 or a negative error from the flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off an async flush of all cached dirty data for this inode.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Force dirty caps to the MDS and remember the flush tid to wait on.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Push the MDS log and wait for our last unsafe request to become safe
    // (which implies all earlier ones are safe too).
    flush_mdlog_sync(in);

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    // client_lock is dropped while blocking on the flush completion.
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
11301
11302 int Client::_fsync(Fh *f, bool syncdataonly)
11303 {
11304 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
11305 return _fsync(f->inode.get(), syncdataonly);
11306 }
11307
11308 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
11309 {
11310 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11311 if (!mref_reader.is_state_satisfied())
11312 return -CEPHFS_ENOTCONN;
11313
11314 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
11315 tout(cct) << fd << std::endl;
11316
11317 std::scoped_lock lock(client_lock);
11318 Fh *f = get_filehandle(fd);
11319 if (!f)
11320 return -CEPHFS_EBADF;
11321 int r = _getattr(f->inode, mask, perms);
11322 if (r < 0)
11323 return r;
11324 fill_stat(f->inode, stbuf, NULL);
11325 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
11326 return r;
11327 }
11328
11329 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
11330 unsigned int want, unsigned int flags)
11331 {
11332 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11333 if (!mref_reader.is_state_satisfied())
11334 return -CEPHFS_ENOTCONN;
11335
11336 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
11337 tout(cct) << fd << std::endl;
11338
11339 std::scoped_lock lock(client_lock);
11340 Fh *f = get_filehandle(fd);
11341 if (!f)
11342 return -CEPHFS_EBADF;
11343
11344 unsigned mask = statx_to_mask(flags, want);
11345
11346 int r = 0;
11347 if (mask) {
11348 r = _getattr(f->inode, mask, perms);
11349 if (r < 0) {
11350 ldout(cct, 3) << "fstatx exit on error!" << dendl;
11351 return r;
11352 }
11353 }
11354
11355 fill_statx(f->inode, mask, stx);
11356 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
11357 return r;
11358 }
11359
11360 int Client::statxat(int dirfd, const char *relpath,
11361 struct ceph_statx *stx, const UserPerm& perms,
11362 unsigned int want, unsigned int flags) {
11363 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11364 if (!mref_reader.is_state_satisfied()) {
11365 return -CEPHFS_ENOTCONN;
11366 }
11367
11368 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
11369 tout(cct) << dirfd << std::endl;
11370 tout(cct) << relpath << std::endl;
11371
11372 unsigned mask = statx_to_mask(flags, want);
11373
11374 InodeRef dirinode;
11375 std::scoped_lock lock(client_lock);
11376 int r = get_fd_inode(dirfd, &dirinode);
11377 if (r < 0) {
11378 return r;
11379 }
11380
11381 InodeRef in;
11382 filepath path(relpath);
11383 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
11384 if (r < 0) {
11385 return r;
11386 }
11387 r = _getattr(in, mask, perms);
11388 if (r < 0) {
11389 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
11390 return r;
11391 }
11392
11393 fill_statx(in, mask, stx);
11394 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
11395 return r;
11396 }
11397
11398 // not written yet, but i want to link!
11399
11400 int Client::chdir(const char *relpath, std::string &new_cwd,
11401 const UserPerm& perms)
11402 {
11403 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11404 if (!mref_reader.is_state_satisfied())
11405 return -CEPHFS_ENOTCONN;
11406
11407 tout(cct) << "chdir" << std::endl;
11408 tout(cct) << relpath << std::endl;
11409
11410 filepath path(relpath);
11411 InodeRef in;
11412
11413 std::scoped_lock lock(client_lock);
11414 int r = path_walk(path, &in, perms);
11415 if (r < 0)
11416 return r;
11417
11418 if (!(in.get()->is_dir()))
11419 return -CEPHFS_ENOTDIR;
11420
11421 if (cwd != in)
11422 cwd.swap(in);
11423 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11424
11425 _getcwd(new_cwd, perms);
11426 return 0;
11427 }
11428
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root.  If a parent link is not
// cached, issue a LOOKUPNAME to the MDS and restart the walk from scratch.
// On an unlinked cwd/ancestor, 'dir' is left untouched.
// Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
11468
11469 void Client::getcwd(string& dir, const UserPerm& perms)
11470 {
11471 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11472 if (!mref_reader.is_state_satisfied())
11473 return;
11474
11475 std::scoped_lock l(client_lock);
11476
11477 _getcwd(dir, perms);
11478 }
11479
// statvfs-style filesystem statistics.  NOTE: 'path' is ignored — the
// result always describes the whole fs (or the governing quota root).
// client_lock is deliberately dropped while waiting for the objecter's
// statfs reply, then reacquired.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool we can report that pool's usage; otherwise fall
  // back to cluster-wide stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  // Drop the client lock while blocking on the OSD round-trip.
  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enabled(QUOTA_MAX_BYTES) ? root : get_quota_root(root.get(), perms, QUOTA_MAX_BYTES);

  // get_quota_root should always give us something if client quotas are
  // enabled
  ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);

  /* If bytes quota is set on a directory and conf option "client quota df"
   * is also set, available space = quota limit - used space. Else,
   * available space = total space - used space. */
  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
11586
// Core file-locking helper shared by fcntl (POSIX) and flock(2) paths.
// Translates the struct flock into a CEPH_LOCK_* request, sends it to the
// MDS, and on success mirrors the result into local lock state.
//  lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//  op:        CEPH_MDS_OP_{GET,SET}FILELOCK
//  sleep:     non-zero to let the MDS block until the lock is available
//  removing:  true when called from _release_filelocks (skip Fh-side state)
// Returns 0 on success or a negative CEPHFS_* error.
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Locking on this inode previously hit an unrecoverable error.
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only a blocking SETFILELOCK that actually acquires may wait at the MDS.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: take an extra ref so the interrupt path can still
    // reference the request while we wait.
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra ref taken above
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or unlocked) lock returned by the MDS back
      // into the caller's struct flock.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the granted change into the inode-wide lock state, creating
      // it lazily on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // Also track per-Fh state so _release_filelocks can clean up on close
      // (skipped when we are the cleanup path itself).
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11700
// Interrupt a blocking SETFILELOCK request (e.g. on signal delivery from
// the FUSE interrupt callback).  Marks the original request aborted and, if
// it already reached an MDS, sends a companion *_INTR unlock so the MDS
// stops waiting on our behalf.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // The interrupt rule mirrors the original rule (FLOCK vs FCNTL).
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // Clone the original lock arguments but turn the op into an unlock under
  // the *_INTR rule.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11733
11734 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11735 {
11736 if (!in->fcntl_locks && !in->flock_locks)
11737 return;
11738
11739 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11740 encode(nr_fcntl_locks, bl);
11741 if (nr_fcntl_locks) {
11742 auto &lock_state = in->fcntl_locks;
11743 for(auto p = lock_state->held_locks.begin();
11744 p != lock_state->held_locks.end();
11745 ++p)
11746 encode(p->second, bl);
11747 }
11748
11749 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11750 encode(nr_flock_locks, bl);
11751 if (nr_flock_locks) {
11752 auto &lock_state = in->flock_locks;
11753 for(auto p = lock_state->held_locks.begin();
11754 p != lock_state->held_locks.end();
11755 ++p)
11756 encode(p->second, bl);
11757 }
11758
11759 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11760 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11761 }
11762
// Release all file locks held through this Fh (called on close).  If the
// inode is in lock-error state the locks are only dropped locally;
// otherwise an explicit unlock is sent to the MDS for each one.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  // (lock-type, lock) pairs to unlock at the MDS after local cleanup.
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // Post-increment before potential removal: remove_lock may invalidate
    // the current iterator.
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Clear the error flag once no locks remain anywhere on the inode.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  // removing=true: skip re-recording state on the Fh we are tearing down.
  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
11821
11822 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11823 ceph_lock_state_t *lock_state)
11824 {
11825 int lock_cmd;
11826 if (F_RDLCK == fl->l_type)
11827 lock_cmd = CEPH_LOCK_SHARED;
11828 else if (F_WRLCK == fl->l_type)
11829 lock_cmd = CEPH_LOCK_EXCL;
11830 else
11831 lock_cmd = CEPH_LOCK_UNLOCK;;
11832
11833 ceph_filelock filelock;
11834 filelock.start = fl->l_start;
11835 filelock.length = fl->l_len;
11836 filelock.client = 0;
11837 // see comment in _do_filelock()
11838 filelock.owner = owner | (1ULL << 63);
11839 filelock.pid = fl->l_pid;
11840 filelock.type = lock_cmd;
11841
11842 if (filelock.type == CEPH_LOCK_UNLOCK) {
11843 list<ceph_filelock> activated_locks;
11844 lock_state->remove_lock(filelock, activated_locks);
11845 } else {
11846 bool r = lock_state->add_lock(filelock, false, false, NULL);
11847 ceph_assert(r);
11848 }
11849 }
11850
11851 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11852 {
11853 Inode *in = fh->inode.get();
11854 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11855 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11856 return ret;
11857 }
11858
11859 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11860 {
11861 Inode *in = fh->inode.get();
11862 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11863 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11864 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11865 return ret;
11866 }
11867
11868 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11869 {
11870 Inode *in = fh->inode.get();
11871 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11872
11873 int sleep = !(cmd & LOCK_NB);
11874 cmd &= ~LOCK_NB;
11875
11876 int type;
11877 switch (cmd) {
11878 case LOCK_SH:
11879 type = F_RDLCK;
11880 break;
11881 case LOCK_EX:
11882 type = F_WRLCK;
11883 break;
11884 case LOCK_UN:
11885 type = F_UNLCK;
11886 break;
11887 default:
11888 return -CEPHFS_EINVAL;
11889 }
11890
11891 struct flock fl;
11892 memset(&fl, 0, sizeof(fl));
11893 fl.l_type = type;
11894 fl.l_whence = SEEK_SET;
11895
11896 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11897 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11898 return ret;
11899 }
11900
11901 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11902 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11903 if (!mref_reader.is_state_satisfied()) {
11904 return -CEPHFS_ENOTCONN;
11905 }
11906
11907 std::scoped_lock lock(client_lock);
11908 InodeRef in;
11909 int r = Client::path_walk(path, &in, perms, true);
11910 if (r < 0) {
11911 return r;
11912 }
11913
11914 if (in->snapid == CEPH_NOSNAP) {
11915 return -CEPHFS_EINVAL;
11916 }
11917
11918 snap_info->id = in->snapid;
11919 snap_info->metadata = in->snap_metadata;
11920 return 0;
11921 }
11922
11923 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11924 {
11925 /* Since the only thing this does is wrap a call to statfs, and
11926 statfs takes a lock, it doesn't seem we have a need to split it
11927 out. */
11928 return statfs(0, stbuf, perms);
11929 }
11930
// Install the caller-supplied callbacks and start the finisher thread that
// services each one.  Callbacks left null in 'args' are simply not enabled.
// Must only run before the client is mounted (enforced by the public
// wrappers below).
void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;

  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  // umask_cb is invoked synchronously, so it needs no finisher.
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
11966
// This is deprecated, use ll_register_callbacks2() instead.
// Unlike the v2 entry point it asserts (rather than returning an error)
// when called on a client that is already mounting/mounted/unmounting.
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());

  _ll_register_callbacks(args);
}
11974
// Preferred callback-registration entry point: returns -CEPHFS_EBUSY
// instead of asserting when the client is already in use.
int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
{
  if (is_mounting() || is_mounted() || is_unmounting())
    return -CEPHFS_EBUSY;

  _ll_register_callbacks(args);
  return 0;
}
11983
// Decide (and verify) how cached dentries will be invalidated: via the
// registered dentry-invalidate callback, or by remounting.  Returns
// {error, flag} — the pair comes from _do_remount() on the remount path;
// on the callback path it stays {0, false}.
std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
{
  std::pair <int, bool> r(0, false);

  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return std::make_pair(-CEPHFS_ENOTCONN, false);

  can_invalidate_dentries = can_invalidate;

  /*
   * Force to use the old and slow method to invalidate the dcache
   * if the euid is non-root, or the remount may fail with return
   * code 1 or 32.
   */
  uid_t euid = geteuid();
  ldout(cct, 10) << "euid: " << euid << dendl;
  if (euid != 0) {
    can_invalidate_dentries = true;
  }

  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else {
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }

  return r;
}
12016
// Flush all dirty file data and caps to the cluster.  Caller must hold
// client_lock; the lock is temporarily dropped at the end while waiting
// for the object cacher flush to complete.
int Client::_sync_fs()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // Snapshot the tid now: only flushes issued up to this point must land.
  ceph_tid_t flush_tid = last_flush_tid;

  // flush the mdlog before waiting for unsafe requests.
  flush_mdlog_sync();

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // Drop the client lock while blocking on the data flush.
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
12052
12053 int Client::sync_fs()
12054 {
12055 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12056 if (!mref_reader.is_state_satisfied())
12057 return -CEPHFS_ENOTCONN;
12058
12059 std::scoped_lock l(client_lock);
12060
12061 return _sync_fs();
12062 }
12063
// Ask the object cacher to release everything it can.
// NOTE(review): the return value is passed straight through from
// release_all(); presumably the amount that could not (or could) be
// released — confirm against ObjectCacher::release_all().
int64_t Client::drop_caches()
{
  std::scoped_lock l(client_lock);
  return objectcacher->release_all();
}
12069
12070 int Client::_lazyio(Fh *fh, int enable)
12071 {
12072 Inode *in = fh->inode.get();
12073 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
12074
12075 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
12076 return 0;
12077
12078 int orig_mode = fh->mode;
12079 if (enable) {
12080 fh->mode |= CEPH_FILE_MODE_LAZY;
12081 in->get_open_ref(fh->mode);
12082 in->put_open_ref(orig_mode);
12083 check_caps(in, CHECK_CAPS_NODELAY);
12084 } else {
12085 fh->mode &= ~CEPH_FILE_MODE_LAZY;
12086 in->get_open_ref(fh->mode);
12087 in->put_open_ref(orig_mode);
12088 check_caps(in, 0);
12089 }
12090
12091 return 0;
12092 }
12093
12094 int Client::lazyio(int fd, int enable)
12095 {
12096 std::scoped_lock l(client_lock);
12097 Fh *f = get_filehandle(fd);
12098 if (!f)
12099 return -CEPHFS_EBADF;
12100
12101 return _lazyio(f, enable);
12102 }
12103
// Low-level (Fh-based) wrapper around _lazyio().
int Client::ll_lazyio(Fh *fh, int enable)
{
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;

  std::scoped_lock lock(client_lock);
  return _lazyio(fh, enable);
}
12112
// Propagate locally buffered lazy-IO writes to the cluster.
// NOTE(review): offset/count are currently unused — the "for now"
// implementation flushes the whole file via _fsync and deliberately
// ignores its result; always returns 0 (or EBADF).
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
          << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // for now
  _fsync(f, true);

  return 0;
}
12128
// Synchronize this client's view of a lazy-IO file: flush local dirty data,
// then drop cached data/caps and re-fetch the size so subsequent reads see
// other clients' writes.  offset/count are currently unused (whole file).
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
          << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  // Only re-stat if _release actually dropped cached state.
  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }
  return 0;
}
12148
12149
12150 // =============================
12151 // snaps
12152
// Create a snapshot named 'name' of the directory at 'relpath' by making a
// directory inside its virtual ".snap" dir.  'metadata' is stored with the
// snapshot.  Permission to create is checked on the snapshotted directory
// when client_permissions is enabled.
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
		   mode_t mode, const std::map<std::string, std::string> &metadata)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
}
12175
// Remove snapshot 'name' of the directory at 'relpath' (rmdir inside its
// ".snap" dir).  check_perms=false skips the per-name deletion check while
// still checking the snapdir itself when client_permissions is enabled.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  Inode *snapdir = open_snapdir(in.get());
  if (cct->_conf->client_permissions) {
    r = may_delete(snapdir, check_perms ? name : NULL, perms);
    if (r < 0)
      return r;
  }
  return _rmdir(snapdir, name, perms);
}
12197
12198 // =============================
12199 // expose caps
12200
12201 int Client::get_caps_issued(int fd)
12202 {
12203 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12204 if (!mref_reader.is_state_satisfied())
12205 return -CEPHFS_ENOTCONN;
12206
12207 std::scoped_lock lock(client_lock);
12208
12209 Fh *f = get_filehandle(fd);
12210 if (!f)
12211 return -CEPHFS_EBADF;
12212
12213 return f->inode->caps_issued();
12214 }
12215
12216 int Client::get_caps_issued(const char *path, const UserPerm& perms)
12217 {
12218 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12219 if (!mref_reader.is_state_satisfied())
12220 return -CEPHFS_ENOTCONN;
12221
12222 std::scoped_lock lock(client_lock);
12223
12224 filepath p(path);
12225 InodeRef in;
12226 int r = path_walk(p, &in, perms, true);
12227 if (r < 0)
12228 return r;
12229 return in->caps_issued();
12230 }
12231
12232 // =========================================
12233 // low level
12234
// Populate/refresh the attributes of a virtual ".snap" directory inode
// 'in' from its backing directory 'diri'.  Times/change_attr come from the
// directory's snaprealm so they move when snapshots are created/deleted.
void Client::refresh_snapdir_attrs(Inode *in, Inode *diri) {
  ldout(cct, 10) << __func__ << ": snapdir inode=" << *in
                 << ", inode=" << *diri << dendl;
  in->ino = diri->ino;
  in->snapid = CEPH_SNAPDIR;
  in->mode = diri->mode;
  in->uid = diri->uid;
  in->gid = diri->gid;
  in->nlink = 1;
  in->mtime = diri->snaprealm->last_modified;
  in->ctime = in->mtime;
  in->change_attr = diri->snaprealm->change_attr;
  in->btime = diri->btime;
  in->atime = diri->atime;
  in->size = diri->size;

  in->dirfragtree.clear();
  in->snapdir_parent = diri;
  // copy posix acls to snapshotted inode
  in->xattrs.clear();
  // Only "system." xattrs (ACLs etc.) are mirrored; user xattrs are not.
  for (auto &[xattr_key, xattr_value] : diri->xattrs) {
    if (xattr_key.rfind("system.", 0) == 0) {
      in->xattrs[xattr_key] = xattr_value;
    }
  }
}
12261
12262 Inode *Client::open_snapdir(Inode *diri)
12263 {
12264 Inode *in;
12265 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
12266 if (!inode_map.count(vino)) {
12267 in = new Inode(this, vino, &diri->layout);
12268 refresh_snapdir_attrs(in, diri);
12269 diri->flags |= I_SNAPDIR_OPEN;
12270 inode_map[vino] = in;
12271 if (use_faked_inos())
12272 _assign_faked_ino(in);
12273 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
12274 } else {
12275 in = inode_map[vino];
12276 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
12277 }
12278 return in;
12279 }
12280
// Low-level lookup of 'name' under 'parent'.  On success fills *attr,
// takes an ll reference on the result (caller must ll_forget/ll_put), and
// stores it in *out.  On failure *out is set from the empty InodeRef
// (i.e. NULL) and attr->st_ino is zeroed.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are always permitted; otherwise require search perm on
    // the parent.
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller; released via ll_forget

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
12324
// Look up an inode by (ino, snapid).  Checks the local inode_map first,
// then asks the MDS.  CEPH_SNAPDIR is handled specially: the head inode is
// looked up and its virtual snapdir returned.  On success an ll reference
// is taken on *inode.
int Client::ll_lookup_vino(
    vinodeno_t vino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Reserved vinos (e.g. fuse internal ranges) can never resolve.
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  std::scoped_lock lock(client_lock);
  ldout(cct, 3) << __func__ << " " << vino << dendl;

  // Check the cache first
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p != inode_map.end()) {
    *inode = p->second;
    _ll_get(*inode);
    return 0;
  }

  uint64_t snapid = vino.snapid;

  // for snapdir, find the non-snapped dir inode
  if (snapid == CEPH_SNAPDIR)
    vino.snapid = CEPH_NOSNAP;

  int r = _lookup_vino(vino, perms, inode);
  if (r)
    return r;
  ceph_assert(*inode != NULL);

  if (snapid == CEPH_SNAPDIR) {
    Inode *tmp = *inode;

    // open the snapdir and put the inode ref
    *inode = open_snapdir(tmp);
    _ll_forget(tmp, 1);  // drop the ref _lookup_vino took on the head inode
    _ll_get(*inode);
  }
  return 0;
}
12370
12371 int Client::ll_lookup_inode(
12372 struct inodeno_t ino,
12373 const UserPerm& perms,
12374 Inode **inode)
12375 {
12376 vinodeno_t vino(ino, CEPH_NOSNAP);
12377 return ll_lookup_vino(vino, perms, inode);
12378 }
12379
// statx-flavoured ll_lookup: like ll_lookup but fills a ceph_statx using
// the mask derived from want/flags, and (unlike ll_lookup) also checks
// may_lookup for "." / "..".  On success an ll ref is taken on *out.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
12422
// Resolve a full path to an inode, filling `stx` and returning a new ll
// reference via *out on success.  AT_SYMLINK_NOFOLLOW in `flags` stops a
// trailing symlink from being followed.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << __func__ << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get()); // caller receives an ll reference
    *out = in.get();
    return 0;
  }
}
12455
// Take one "ll" (low-level/FUSE) reference on an inode.  The first ll
// reference also pins the inode in the cache (iget), pins the parent
// dentry for directories, and bumps the per-snapshot refcount used by
// ll_get_snap_ref().  Caller must hold client_lock.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->iget();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
12470
// Drop `num` ll references from an inode.  Returns the remaining ll_ref
// count; when it reaches zero the dentry pin, per-snapshot refcount and
// cache reference taken by _ll_get() are all released.  Caller must hold
// client_lock.
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    // releases the reference taken by in->iget() in _ll_get()
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
12493
// Drop every outstanding ll reference (e.g. on unmount).  `next` is saved
// before each _ll_put() because dropping the last reference can remove the
// inode from inode_map and invalidate the current iterator.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      // hold an InodeRef so the Inode outlives the _ll_put() call
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
12511
// Drop `count` ll references from an inode (FUSE "forget").  Returns true
// when this brought ll_ref to zero (or when the forget is ignored because
// we are unmounted or the target is the root).  Caller must hold
// client_lock.
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // More forgets than references held: warn and clamp to what we have.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
12541
12542 bool Client::ll_forget(Inode *in, uint64_t count)
12543 {
12544 std::scoped_lock lock(client_lock);
12545 return _ll_forget(in, count);
12546 }
12547
12548 bool Client::ll_put(Inode *in)
12549 {
12550 /* ll_forget already takes the lock */
12551 return ll_forget(in, 1);
12552 }
12553
12554 int Client::ll_get_snap_ref(snapid_t snap)
12555 {
12556 std::scoped_lock lock(client_lock);
12557 auto p = ll_snap_ref.find(snap);
12558 if (p != ll_snap_ref.end())
12559 return p->second;
12560 return 0;
12561 }
12562
12563 snapid_t Client::ll_get_snapid(Inode *in)
12564 {
12565 std::scoped_lock lock(client_lock);
12566 return in->snapid;
12567 }
12568
12569 Inode *Client::ll_get_inode(ino_t ino)
12570 {
12571 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12572 if (!mref_reader.is_state_satisfied())
12573 return NULL;
12574
12575 std::scoped_lock lock(client_lock);
12576
12577 vinodeno_t vino = _map_faked_ino(ino);
12578 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12579 if (p == inode_map.end())
12580 return NULL;
12581 Inode *in = p->second;
12582 _ll_get(in);
12583 return in;
12584 }
12585
12586 Inode *Client::ll_get_inode(vinodeno_t vino)
12587 {
12588 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12589 if (!mref_reader.is_state_satisfied())
12590 return NULL;
12591
12592 if (is_reserved_vino(vino))
12593 return NULL;
12594
12595 std::scoped_lock lock(client_lock);
12596
12597 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12598 if (p == inode_map.end())
12599 return NULL;
12600 Inode *in = p->second;
12601 _ll_get(in);
12602 return in;
12603 }
12604
12605 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12606 {
12607 vinodeno_t vino = _get_vino(in);
12608
12609 ldout(cct, 8) << __func__ << " " << vino << dendl;
12610 tout(cct) << __func__ << std::endl;
12611 tout(cct) << vino.ino.val << std::endl;
12612
12613 if (vino.snapid < CEPH_NOSNAP)
12614 return 0;
12615 else
12616 return _getattr(in, caps, perms);
12617 }
12618
12619 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12620 {
12621 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12622 if (!mref_reader.is_state_satisfied())
12623 return -CEPHFS_ENOTCONN;
12624
12625 std::scoped_lock lock(client_lock);
12626
12627 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12628
12629 if (res == 0)
12630 fill_stat(in, attr);
12631 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12632 return res;
12633 }
12634
12635 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12636 unsigned int flags, const UserPerm& perms)
12637 {
12638 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12639 if (!mref_reader.is_state_satisfied())
12640 return -CEPHFS_ENOTCONN;
12641
12642 std::scoped_lock lock(client_lock);
12643
12644 int res = 0;
12645 unsigned mask = statx_to_mask(flags, want);
12646
12647 if (mask && !in->caps_issued_mask(mask, true))
12648 res = _ll_getattr(in, mask, perms);
12649
12650 if (res == 0)
12651 fill_statx(in, mask, stx);
12652 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12653 return res;
12654 }
12655
// Permission-checked setattr front-end for the low-level API.  Runs
// may_setattr() unless FUSE handles permissions, strips the
// MTIME_NOW/ATIME_NOW flags (low-level callers supply explicit
// timestamps), and forwards to __setattrx().
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW flags are not honored on this path; drop them before __setattrx.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
12684
12685 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12686 const UserPerm& perms)
12687 {
12688 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12689 if (!mref_reader.is_state_satisfied())
12690 return -CEPHFS_ENOTCONN;
12691
12692 std::scoped_lock lock(client_lock);
12693
12694 InodeRef target(in);
12695 int res = _ll_setattrx(in, stx, mask, perms, &target);
12696 if (res == 0) {
12697 ceph_assert(in == target.get());
12698 fill_statx(in, in->caps_issued(), stx);
12699 }
12700
12701 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12702 return res;
12703 }
12704
12705 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12706 const UserPerm& perms)
12707 {
12708 struct ceph_statx stx;
12709 stat_to_statx(attr, &stx);
12710
12711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12712 if (!mref_reader.is_state_satisfied())
12713 return -CEPHFS_ENOTCONN;
12714
12715 std::scoped_lock lock(client_lock);
12716
12717 InodeRef target(in);
12718 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12719 if (res == 0) {
12720 ceph_assert(in == target.get());
12721 fill_stat(in, attr);
12722 }
12723
12724 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12725 return res;
12726 }
12727
12728
12729 // ----------
12730 // xattrs
12731
12732 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12733 const UserPerm& perms)
12734 {
12735 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12736 if (!mref_reader.is_state_satisfied())
12737 return -CEPHFS_ENOTCONN;
12738
12739 std::scoped_lock lock(client_lock);
12740
12741 InodeRef in;
12742 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12743 if (r < 0)
12744 return r;
12745 return _getxattr(in, name, value, size, perms);
12746 }
12747
12748 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12749 const UserPerm& perms)
12750 {
12751 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12752 if (!mref_reader.is_state_satisfied())
12753 return -CEPHFS_ENOTCONN;
12754
12755 std::scoped_lock lock(client_lock);
12756
12757 InodeRef in;
12758 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12759 if (r < 0)
12760 return r;
12761 return _getxattr(in, name, value, size, perms);
12762 }
12763
12764 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12765 const UserPerm& perms)
12766 {
12767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12768 if (!mref_reader.is_state_satisfied())
12769 return -CEPHFS_ENOTCONN;
12770
12771 std::scoped_lock lock(client_lock);
12772
12773 Fh *f = get_filehandle(fd);
12774 if (!f)
12775 return -CEPHFS_EBADF;
12776 return _getxattr(f->inode, name, value, size, perms);
12777 }
12778
12779 int Client::listxattr(const char *path, char *list, size_t size,
12780 const UserPerm& perms)
12781 {
12782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12783 if (!mref_reader.is_state_satisfied())
12784 return -CEPHFS_ENOTCONN;
12785
12786 std::scoped_lock lock(client_lock);
12787
12788 InodeRef in;
12789 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12790 if (r < 0)
12791 return r;
12792 return Client::_listxattr(in.get(), list, size, perms);
12793 }
12794
12795 int Client::llistxattr(const char *path, char *list, size_t size,
12796 const UserPerm& perms)
12797 {
12798 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12799 if (!mref_reader.is_state_satisfied())
12800 return -CEPHFS_ENOTCONN;
12801
12802 std::scoped_lock lock(client_lock);
12803
12804 InodeRef in;
12805 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12806 if (r < 0)
12807 return r;
12808 return Client::_listxattr(in.get(), list, size, perms);
12809 }
12810
12811 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12812 {
12813 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12814 if (!mref_reader.is_state_satisfied())
12815 return -CEPHFS_ENOTCONN;
12816
12817 std::scoped_lock lock(client_lock);
12818
12819 Fh *f = get_filehandle(fd);
12820 if (!f)
12821 return -CEPHFS_EBADF;
12822 return Client::_listxattr(f->inode.get(), list, size, perms);
12823 }
12824
12825 int Client::removexattr(const char *path, const char *name,
12826 const UserPerm& perms)
12827 {
12828 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12829 if (!mref_reader.is_state_satisfied())
12830 return -CEPHFS_ENOTCONN;
12831
12832 std::scoped_lock lock(client_lock);
12833
12834 InodeRef in;
12835 int r = Client::path_walk(path, &in, perms, true);
12836 if (r < 0)
12837 return r;
12838 return _removexattr(in, name, perms);
12839 }
12840
12841 int Client::lremovexattr(const char *path, const char *name,
12842 const UserPerm& perms)
12843 {
12844 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12845 if (!mref_reader.is_state_satisfied())
12846 return -CEPHFS_ENOTCONN;
12847
12848 std::scoped_lock lock(client_lock);
12849
12850 InodeRef in;
12851 int r = Client::path_walk(path, &in, perms, false);
12852 if (r < 0)
12853 return r;
12854 return _removexattr(in, name, perms);
12855 }
12856
12857 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12858 {
12859 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12860 if (!mref_reader.is_state_satisfied())
12861 return -CEPHFS_ENOTCONN;
12862
12863 std::scoped_lock lock(client_lock);
12864
12865 Fh *f = get_filehandle(fd);
12866 if (!f)
12867 return -CEPHFS_EBADF;
12868 return _removexattr(f->inode, name, perms);
12869 }
12870
12871 int Client::setxattr(const char *path, const char *name, const void *value,
12872 size_t size, int flags, const UserPerm& perms)
12873 {
12874 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12875 if (!mref_reader.is_state_satisfied())
12876 return -CEPHFS_ENOTCONN;
12877
12878 _setxattr_maybe_wait_for_osdmap(name, value, size);
12879
12880 std::scoped_lock lock(client_lock);
12881
12882 InodeRef in;
12883 int r = Client::path_walk(path, &in, perms, true);
12884 if (r < 0)
12885 return r;
12886 return _setxattr(in, name, value, size, flags, perms);
12887 }
12888
12889 int Client::lsetxattr(const char *path, const char *name, const void *value,
12890 size_t size, int flags, const UserPerm& perms)
12891 {
12892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12893 if (!mref_reader.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN;
12895
12896 _setxattr_maybe_wait_for_osdmap(name, value, size);
12897
12898 std::scoped_lock lock(client_lock);
12899
12900 InodeRef in;
12901 int r = Client::path_walk(path, &in, perms, false);
12902 if (r < 0)
12903 return r;
12904 return _setxattr(in, name, value, size, flags, perms);
12905 }
12906
12907 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12908 int flags, const UserPerm& perms)
12909 {
12910 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12911 if (!mref_reader.is_state_satisfied())
12912 return -CEPHFS_ENOTCONN;
12913
12914 _setxattr_maybe_wait_for_osdmap(name, value, size);
12915
12916 std::scoped_lock lock(client_lock);
12917
12918 Fh *f = get_filehandle(fd);
12919 if (!f)
12920 return -CEPHFS_EBADF;
12921 return _setxattr(f->inode, name, value, size, flags, perms);
12922 }
12923
12924 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12925 const UserPerm& perms)
12926 {
12927 int r;
12928 const VXattr *vxattr = nullptr;
12929
12930 vxattr = _match_vxattr(in, name);
12931 if (vxattr) {
12932 r = -CEPHFS_ENODATA;
12933
12934 // Do a force getattr to get the latest quota before returning
12935 // a value to userspace.
12936 int flags = 0;
12937 if (vxattr->flags & VXATTR_RSTAT) {
12938 flags |= CEPH_STAT_RSTAT;
12939 }
12940 if (vxattr->flags & VXATTR_DIRSTAT) {
12941 flags |= CEPH_CAP_FILE_SHARED;
12942 }
12943 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12944 if (r != 0) {
12945 // Error from getattr!
12946 return r;
12947 }
12948
12949 // call pointer-to-member function
12950 char buf[256];
12951 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12952 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12953 } else {
12954 r = -CEPHFS_ENODATA;
12955 }
12956
12957 if (size != 0) {
12958 if (r > (int)size) {
12959 r = -CEPHFS_ERANGE;
12960 } else if (r > 0) {
12961 memcpy(value, buf, r);
12962 }
12963 }
12964 goto out;
12965 }
12966
12967 if (!strncmp(name, "ceph.", 5)) {
12968 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12969 goto out;
12970 }
12971
12972 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12973 r = -CEPHFS_EOPNOTSUPP;
12974 goto out;
12975 }
12976
12977 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12978 if (r == 0) {
12979 string n(name);
12980 r = -CEPHFS_ENODATA;
12981 if (in->xattrs.count(n)) {
12982 r = in->xattrs[n].length();
12983 if (r > 0 && size != 0) {
12984 if (size >= (unsigned)r)
12985 memcpy(value, in->xattrs[n].c_str(), r);
12986 else
12987 r = -CEPHFS_ERANGE;
12988 }
12989 }
12990 }
12991 out:
12992 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12993 return r;
12994 }
12995
12996 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12997 const UserPerm& perms)
12998 {
12999 if (cct->_conf->client_permissions) {
13000 int r = xattr_permission(in.get(), name, MAY_READ, perms);
13001 if (r < 0)
13002 return r;
13003 }
13004 return _getxattr(in.get(), name, value, size, perms);
13005 }
13006
13007 int Client::ll_getxattr(Inode *in, const char *name, void *value,
13008 size_t size, const UserPerm& perms)
13009 {
13010 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13011 if (!mref_reader.is_state_satisfied())
13012 return -CEPHFS_ENOTCONN;
13013
13014 vinodeno_t vino = _get_vino(in);
13015
13016 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
13017 tout(cct) << __func__ << std::endl;
13018 tout(cct) << vino.ino.val << std::endl;
13019 tout(cct) << name << std::endl;
13020
13021 std::scoped_lock lock(client_lock);
13022 if (!fuse_default_permissions) {
13023 int r = xattr_permission(in, name, MAY_READ, perms);
13024 if (r < 0)
13025 return r;
13026 }
13027
13028 return _getxattr(in, name, value, size, perms);
13029 }
13030
// Write the inode's non-"ceph." xattr names into `name` as consecutive
// NUL-terminated strings.  With size == 0 only the total length needed is
// computed.  Returns the total byte count, -CEPHFS_ERANGE if the buffer is
// too small, or a negative error from the cache refresh.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Only hit the MDS if we have never fetched this inode's xattrs.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue; // virtual "ceph.*" names are never listed
    }

    size_t this_len = xattr_name.length() + 1; // include the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
13064
13065 int Client::ll_listxattr(Inode *in, char *names, size_t size,
13066 const UserPerm& perms)
13067 {
13068 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13069 if (!mref_reader.is_state_satisfied())
13070 return -CEPHFS_ENOTCONN;
13071
13072 vinodeno_t vino = _get_vino(in);
13073
13074 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
13075 tout(cct) << __func__ << std::endl;
13076 tout(cct) << vino.ino.val << std::endl;
13077 tout(cct) << size << std::endl;
13078
13079 std::scoped_lock lock(client_lock);
13080 return _listxattr(in, names, size, perms);
13081 }
13082
// Issue a CEPH_MDS_OP_SETXATTR request to the MDS.  A NULL `value` is
// translated into CEPH_XATTR_REMOVE; the userland XATTR_CREATE /
// XATTR_REPLACE flags map onto their CEPH_XATTR_* equivalents.
// Returns the MDS reply code.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name); // xattr name travels in string2
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The xattr value is carried as the request data payload.
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
13115
// Set an extended attribute on an inode.
//
// Rejects snapshot inodes (read-only).  POSIX ACL xattrs are validated
// and may be converted: an access ACL equivalent to a plain mode becomes
// a chmod and the xattr itself is dropped (value = NULL -> remove).
// Virtual "ceph.*" xattrs are dispatched to their setxattr callbacks or
// rejected when read-only.  For "ceph.quota*" writes we verify afterwards
// that the MDS created a snaprealm for the quota inode.
//
// Only user./security./trusted./ceph. names (plus system.* when POSIX
// ACLs are enabled) are accepted, matching the kernel client.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: empty value is allowed, NULL with a size is not.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is mode-equivalent: drop the xattr, keep only the chmod.
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, nullptr);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      if (vxattr->setxattr_cb)
	return (this->*(vxattr->setxattr_cb))(in, value, size, perms);
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enabled() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
13198
13199 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
13200 size_t size, int flags, const UserPerm& perms)
13201 {
13202 if (cct->_conf->client_permissions) {
13203 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
13204 if (r < 0)
13205 return r;
13206 }
13207 return _setxattr(in.get(), name, value, size, flags, perms);
13208 }
13209
// Validate the data pool referenced by a layout xattr against `osdmap`.
//
// For name == "layout" the value is a key=value list; the "pool" entry is
// extracted with the keys_and_values spirit parser.  For "layout.pool" the
// value is the pool itself.  The pool may be given numerically (checked
// with have_pg_pool) or by name (resolved via lookup_pg_pool_name).
// Returns 0 when valid or no pool was specified, -CEPHFS_EINVAL on parse
// failure, -CEPHFS_ENOENT when the pool is unknown to this osdmap.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric pool id first; fall back to a pool-name lookup.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
13249
// Called before a setxattr that may name a data pool.  If the pool is not
// in our current osdmap, block until we have the latest osdmap so the MDS
// can validate the request against a sufficiently new epoch.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix -> "layout"/"layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
13272
13273 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
13274 size_t size, int flags, const UserPerm& perms)
13275 {
13276 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13277 if (!mref_reader.is_state_satisfied())
13278 return -CEPHFS_ENOTCONN;
13279
13280 _setxattr_maybe_wait_for_osdmap(name, value, size);
13281
13282 vinodeno_t vino = _get_vino(in);
13283
13284 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
13285 tout(cct) << __func__ << std::endl;
13286 tout(cct) << vino.ino.val << std::endl;
13287 tout(cct) << name << std::endl;
13288
13289 std::scoped_lock lock(client_lock);
13290 if (!fuse_default_permissions) {
13291 int r = xattr_permission(in, name, MAY_WRITE, perms);
13292 if (r < 0)
13293 return r;
13294 }
13295 return _setxattr(in, name, value, size, flags, perms);
13296 }
13297
// Remove an extended attribute via a CEPH_MDS_OP_RMXATTR request.
// Snapshot inodes are read-only; only the kernel-client-compatible
// namespaces are accepted, and read-only virtual xattrs are rejected.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name); // xattr name travels in filepath2
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
13329
13330 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
13331 {
13332 if (cct->_conf->client_permissions) {
13333 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
13334 if (r < 0)
13335 return r;
13336 }
13337 return _removexattr(in.get(), name, perms);
13338 }
13339
13340 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
13341 {
13342 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13343 if (!mref_reader.is_state_satisfied())
13344 return -CEPHFS_ENOTCONN;
13345
13346 vinodeno_t vino = _get_vino(in);
13347
13348 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
13349 tout(cct) << "ll_removexattr" << std::endl;
13350 tout(cct) << vino.ino.val << std::endl;
13351 tout(cct) << name << std::endl;
13352
13353 std::scoped_lock lock(client_lock);
13354 if (!fuse_default_permissions) {
13355 int r = xattr_permission(in, name, MAY_WRITE, perms);
13356 if (r < 0)
13357 return r;
13358 }
13359
13360 return _removexattr(in, name, perms);
13361 }
13362
13363 bool Client::_vxattrcb_fscrypt_auth_exists(Inode *in)
13364 {
13365 bool exists = !in->fscrypt_auth.empty();
13366
13367 ldout(cct, 10) << "fscrypt_auth exists " << exists << dendl;
13368 return exists;
13369 }
13370
13371 size_t Client::_vxattrcb_fscrypt_auth(Inode *in, char *val, size_t size)
13372 {
13373 size_t count = in->fscrypt_auth.size();
13374
13375 if (count <= size)
13376 memcpy(val, in->fscrypt_auth.data(), count);
13377 return count;
13378 }
13379
13380 int Client::_vxattrcb_fscrypt_auth_set(Inode *in, const void *val, size_t size,
13381 const UserPerm& perms)
13382 {
13383 struct ceph_statx stx = { 0 };
13384 std::vector<uint8_t> aux;
13385
13386 aux.resize(size);
13387 memcpy(aux.data(), val, size);
13388
13389 return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_AUTH, perms, nullptr, &aux);
13390 }
13391
13392 bool Client::_vxattrcb_fscrypt_file_exists(Inode *in)
13393 {
13394 return !in->fscrypt_file.empty();
13395 }
13396
13397 size_t Client::_vxattrcb_fscrypt_file(Inode *in, char *val, size_t size)
13398 {
13399 size_t count = in->fscrypt_file.size();
13400
13401 if (count <= size)
13402 memcpy(val, in->fscrypt_file.data(), count);
13403 return count;
13404 }
13405
13406 int Client::_vxattrcb_fscrypt_file_set(Inode *in, const void *val, size_t size,
13407 const UserPerm& perms)
13408 {
13409 struct ceph_statx stx = { 0 };
13410 std::vector<uint8_t> aux;
13411
13412 aux.resize(size);
13413 memcpy(aux.data(), val, size);
13414
13415 return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_FILE, perms, nullptr, &aux);
13416 }
13417
13418 bool Client::_vxattrcb_quota_exists(Inode *in)
13419 {
13420 return in->quota.is_enabled() &&
13421 (in->snapid != CEPH_NOSNAP ||
13422 (in->snaprealm && in->snaprealm->ino == in->ino));
13423 }
13424 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
13425 {
13426 return snprintf(val, size,
13427 "max_bytes=%lld max_files=%lld",
13428 (long long int)in->quota.max_bytes,
13429 (long long int)in->quota.max_files);
13430 }
13431 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
13432 {
13433 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
13434 }
13435 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
13436 {
13437 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
13438 }
13439
13440 bool Client::_vxattrcb_layout_exists(Inode *in)
13441 {
13442 return in->layout != file_layout_t();
13443 }
// vxattr getter: render the full layout as
// "stripe_unit=N stripe_count=N object_size=N pool=<name-or-id>
//  [ pool_namespace=NS]".  Pool id is translated to a name via the osdmap
// when possible.  Returns the snprintf-style would-have-written length.
// NOTE(review): if the first snprintf truncates (r >= size), the later
// `val + r` / `size - r` arithmetic would step past the buffer; the
// in-tree caller (_getxattr) passes a 256-byte buffer — confirm before
// reusing with smaller buffers.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
13464 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
13465 {
13466 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
13467 }
13468 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
13469 {
13470 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
13471 }
13472 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
13473 {
13474 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
13475 }
13476 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
13477 {
13478 size_t r;
13479 objecter->with_osdmap([&](const OSDMap& o) {
13480 if (o.have_pg_pool(in->layout.pool_id))
13481 r = snprintf(val, size, "%s", o.get_pool_name(
13482 in->layout.pool_id).c_str());
13483 else
13484 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
13485 });
13486 return r;
13487 }
13488 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
13489 {
13490 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
13491 }
13492 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
13493 {
13494 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
13495 }
13496 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
13497 {
13498 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
13499 }
13500 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
13501 {
13502 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
13503 }
13504 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
13505 {
13506 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
13507 }
13508 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
13509 {
13510 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
13511 }
13512 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
13513 {
13514 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
13515 }
13516 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
13517 {
13518 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
13519 }
13520 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
13521 {
13522 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
13523 }
13524 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
13525 {
13526 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
13527 (long)in->rstat.rctime.nsec());
13528 }
13529 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13530 {
13531 return in->dir_pin != -CEPHFS_ENODATA;
13532 }
13533 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13534 {
13535 return snprintf(val, size, "%ld", (long)in->dir_pin);
13536 }
13537
13538 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13539 {
13540 return !in->snap_btime.is_zero();
13541 }
13542
13543 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13544 {
13545 return snprintf(val, size, "%llu.%09lu",
13546 (long long unsigned)in->snap_btime.sec(),
13547 (long unsigned)in->snap_btime.nsec());
13548 }
13549
13550 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13551 {
13552 int issued;
13553
13554 in->caps_issued(&issued);
13555 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13556 }
13557
13558 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13559 {
13560 // checking one of the xattrs would suffice
13561 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13562 }
13563
13564 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
13565 {
13566 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
13567 in->xattrs["ceph.mirror.info.cluster_id"].length(),
13568 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
13569 in->xattrs["ceph.mirror.info.fs_id"].length(),
13570 in->xattrs["ceph.mirror.info.fs_id"].c_str());
13571 }
13572
13573 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13574 {
13575 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13576 }
13577
13578 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13579 {
13580 auto name = messenger->get_myname();
13581 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
13582 }
13583
/* Build vxattr name strings "ceph.<type>.<name>" / "ceph.<type>.<name>.<name2>". */
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

/* Read-only vxattr entry backed by _vxattrcb_<type>_<name>; always "exists". */
#define XATTR_NAME_CEPH(_type, _name, _flags) \
{ \
name: CEPH_XATTR_NAME(_type, _name), \
getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
readonly: true, \
exists_cb: NULL, \
flags: _flags, \
}
/* Writable per-field layout vxattr; shown only when a layout is set. */
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
name: CEPH_XATTR_NAME2(_type, _name, _field), \
getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
readonly: false, \
exists_cb: &Client::_vxattrcb_layout_exists, \
flags: 0, \
}
/* Writable per-field quota vxattr; shown only when a quota is in effect. */
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
name: CEPH_XATTR_NAME(_type, _name), \
getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
readonly: false, \
exists_cb: &Client::_vxattrcb_quota_exists, \
flags: 0, \
}
13611
/* Virtual xattrs exposed on directory inodes; table is scanned linearly
 * and terminated by an entry with an empty name. */
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // FIXME
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // FIXME
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13677
/* Virtual xattrs exposed on regular-file inodes; terminated by an entry
 * with an empty name. */
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13707
/* Virtual xattrs available on every inode type, consulted after the
 * type-specific table in _match_vxattr(); empty-name terminated. */
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.fscrypt.auth",
    getxattr_cb: &Client::_vxattrcb_fscrypt_auth,
    setxattr_cb: &Client::_vxattrcb_fscrypt_auth_set,
    readonly: false,
    exists_cb: &Client::_vxattrcb_fscrypt_auth_exists,
    flags: 0,
  },
  {
    name: "ceph.fscrypt.file",
    getxattr_cb: &Client::_vxattrcb_fscrypt_file,
    setxattr_cb: &Client::_vxattrcb_fscrypt_file_set,
    readonly: false,
    exists_cb: &Client::_vxattrcb_fscrypt_file_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13741
13742 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13743 {
13744 if (in->is_dir())
13745 return _dir_vxattrs;
13746 else if (in->is_file())
13747 return _file_vxattrs;
13748 return NULL;
13749 }
13750
13751 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13752 {
13753 if (strncmp(name, "ceph.", 5) == 0) {
13754 const VXattr *vxattr = _get_vxattrs(in);
13755 if (vxattr) {
13756 while (!vxattr->name.empty()) {
13757 if (vxattr->name == name)
13758 return vxattr;
13759 vxattr++;
13760 }
13761 }
13762
13763 // for common vxattrs
13764 vxattr = _common_vxattrs;
13765 while (!vxattr->name.empty()) {
13766 if (vxattr->name == name)
13767 return vxattr;
13768 vxattr++;
13769 }
13770 }
13771
13772 return NULL;
13773 }
13774
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  // Low-level readlink: copy the symlink target of `in` into buf/buflen.
  // Requires the client to be at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  // Touch every dentry referring to this inode to keep them warm in the
  // LRU cache.
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
13796
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  // Create a special/device file `name` in `dir` via a MKNOD MDS request.
  // On success *inp references the newly created inode. Caller must hold
  // client_lock (all ll_/fuse paths do).
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Ask the MDS to drop the shared dentry cap (unless we hold EXCL) so our
  // cached view of the parent directory stays coherent.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;
}
13847
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  // Low-level mknod: create `name` under `parent`; on success fill *attr
  // and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure `in` is empty, so *out is set to nullptr;
  // callers must check r before dereferencing *out.
  *out = in.get();
  return r;
}
13884
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // statx variant of ll_mknod: fills *stx (per want/flags mask) instead of
  // struct stat.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Translate the statx want/flags into a cap mask for fill_statx().
  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
13925
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  // Create (and optionally open) a regular file `name` in `dir` via a
  // CREATE MDS request. On success *inp references the inode; if fhp is
  // non-null an open file handle is also returned. `created` (may be null)
  // reports whether the MDS actually created the file.
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the optional data pool name to its id before issuing the
  // request; the wire format only carries a 32-bit pool id.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  bufferlist xattrs_bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;
}
14018
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp, const std::map<std::string, std::string> &metadata,
		   std::string alternate_name)
{
  // Create directory `name` in `dir`. When `dir` is the special snapdir
  // this becomes a MKSNAP (snapshot creation) request instead, and
  // `metadata` is sent as the snapshot metadata payload.
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Writable only in the live tree or in the snapdir (for mksnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
  MetaRequest *req = new MetaRequest(is_snap_op ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  // Snapshots do not carry their own ownership.
  if (!is_snap_op)
    req->set_inode_owner_uid_gid(perm.uid(), perm.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->set_alternate_name(std::move(alternate_name));

  mode |= S_IFDIR;
  bufferlist bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, bl, perm);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.mkdir.mode = mode;
  if (is_snap_op) {
    SnapPayload payload;
    // clear the bufferlist that may have been populated by the call
    // to _posix_acl_create(). MDS mksnap does not make use of it.
    // So, reuse it to pass metadata payload.
    bl.clear();
    payload.metadata = metadata;
    encode(payload, bl);
  }
  if (bl.length() > 0) {
    req->set_data(bl);
  }

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;
}
14086
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  // Low-level mkdir: create directory `name` under `parent`; on success
  // fill *attr and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14122
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // statx variant of ll_mkdir: fills *stx (per want/flags mask). Unlike
  // ll_mkdir, this explicitly zeroes stx_ino/stx_mask on failure.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14162
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  // Create symlink `name` -> `target` in `dir` via a SYMLINK MDS request.
  // On success *inp references the new symlink inode.
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  // The link target travels in the request's string2 field.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  int res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;
}
14204
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  // Low-level symlink: create `name` -> `value` under `parent`; on success
  // fill *attr and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14241
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  // statx variant of ll_symlink: fills *stx (per want/flags mask).
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14279
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  // Remove the non-directory entry `name` from `dir` via an UNLINK MDS
  // request.
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the target so we can attach it to the request and break any
  // delegations before the MDS unlinks it.
  int res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0) {
    put_request(req);
    return res;
  }

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;
}
14323
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  // Low-level unlink of `name` from directory inode `in`.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
14346
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  // Remove directory `name` from `dir`. When `dir` is the snapdir this
  // becomes an RMSNAP (snapshot removal) request instead.
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Writable only in the live tree or in the snapdir (for rmsnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de = get_or_create(dir, name);
  // RMDIR attaches the dentry to the request (which then owns a ref);
  // RMSNAP instead takes a manual ref that is dropped after the explicit
  // unlink below.
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  // NOTE(review): on the RMSNAP path, if this lookup fails the manual
  // de->get() above appears to have no matching de->put() — possible
  // dentry ref leak; confirm against Dentry ref accounting.
  int res = _lookup(dir, name, 0, &in, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate ours by hand.
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;
}
14394
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  // Low-level rmdir of `name` from directory inode `in`.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
14418
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  // Rename fromdir/fromname -> todir/toname. Renaming within the snapdir
  // becomes RENAMESNAP; any other snapshot rename is rejected, as are
  // renames that would cross a quota-root boundary.
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Source and destination must live in the same snapshot context.
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }

  // don't allow cross-quota renames
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enabled() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enabled() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde = get_or_create(fromdir, fromname);
  Dentry *de = get_or_create(todir, toname);

  int res;
  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    // Mark the destination dentry busy so concurrent lookups wait for the
    // rename to settle (cleared below after make_request).
    de->is_renaming = true;
    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm, nullptr, true);
    if (res < 0)
      goto fail;

    // Break delegations on the source inode before the MDS moves it.
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; ENOENT is fine, anything else
    // aborts the rename.
    res = _lookup(todir, toname, 0, &otherin, perm, nullptr, true);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // if rename fails it will miss waking up the waiters
  if (op == CEPH_MDS_OP_RENAME && de->is_renaming) {
    de->is_renaming = false;
    signal_cond_list(waiting_for_rename);
  }

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

fail:
  // NOTE(review): failure via these gotos leaves de->is_renaming set when
  // it was flipped on before the failing lookup — waiters are woken only
  // on the success path above; confirm waiting_for_rename handling.
  put_request(req);
  return res;
}
14530
14531 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14532 const char *newname, const UserPerm& perm)
14533 {
14534 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14535 if (!mref_reader.is_state_satisfied())
14536 return -CEPHFS_ENOTCONN;
14537
14538 vinodeno_t vparent = _get_vino(parent);
14539 vinodeno_t vnewparent = _get_vino(newparent);
14540
14541 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14542 << vnewparent << " " << newname << dendl;
14543 tout(cct) << "ll_rename" << std::endl;
14544 tout(cct) << vparent.ino.val << std::endl;
14545 tout(cct) << name << std::endl;
14546 tout(cct) << vnewparent.ino.val << std::endl;
14547 tout(cct) << newname << std::endl;
14548
14549 std::scoped_lock lock(client_lock);
14550
14551 if (!fuse_default_permissions) {
14552 int r = may_delete(parent, name, perm);
14553 if (r < 0)
14554 return r;
14555 r = may_delete(newparent, newname, perm);
14556 if (r < 0 && r != -CEPHFS_ENOENT)
14557 return r;
14558 }
14559
14560 return _rename(parent, name, newparent, newname, perm, "");
14561 }
14562
// Create a hard link dir/newname -> in by sending a CEPH_MDS_OP_LINK
// request to the MDS.  On success, *inp (if non-null) receives the inode
// returned by make_request.  Returns 0 or a negative CEPHFS_* errno.
// Caller must hold client_lock.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // hard links cannot be created in (or to) snapshots: read-only
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // honor the file-count quota of the target directory
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // a new link invalidates any delegations handed out on this inode
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);
  req->set_filepath2(existing);

  // drop the dir's FILE_SHARED cap (unless we hold FILE_EXCL) so the
  // directory contents get refreshed after the MDS applies the link
  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de = get_or_create(dir, newname);
  req->set_dentry(de);

  int res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;
}
14601
14602 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14603 const UserPerm& perm)
14604 {
14605 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14606 if (!mref_reader.is_state_satisfied())
14607 return -CEPHFS_ENOTCONN;
14608
14609 vinodeno_t vino = _get_vino(in);
14610 vinodeno_t vnewparent = _get_vino(newparent);
14611
14612 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
14613 newname << dendl;
14614 tout(cct) << "ll_link" << std::endl;
14615 tout(cct) << vino.ino.val << std::endl;
14616 tout(cct) << vnewparent << std::endl;
14617 tout(cct) << newname << std::endl;
14618
14619 InodeRef target;
14620
14621 std::scoped_lock lock(client_lock);
14622
14623 if (!fuse_default_permissions) {
14624 if (S_ISDIR(in->mode))
14625 return -CEPHFS_EPERM;
14626
14627 int r = may_hardlink(in, perm);
14628 if (r < 0)
14629 return r;
14630
14631 r = may_create(newparent, perm);
14632 if (r < 0)
14633 return r;
14634 }
14635
14636 return _link(in, newparent, newname, perm, "", &target);
14637 }
14638
14639 int Client::ll_num_osds(void)
14640 {
14641 std::scoped_lock lock(client_lock);
14642 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14643 }
14644
14645 int Client::ll_osdaddr(int osd, uint32_t *addr)
14646 {
14647 std::scoped_lock lock(client_lock);
14648
14649 entity_addr_t g;
14650 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14651 if (!o.exists(osd))
14652 return false;
14653 g = o.get_addrs(osd).front();
14654 return true;
14655 });
14656 if (!exists)
14657 return -1;
14658 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14659 *addr = ntohl(nb_addr);
14660 return 0;
14661 }
14662
14663 uint32_t Client::ll_stripe_unit(Inode *in)
14664 {
14665 std::scoped_lock lock(client_lock);
14666 return in->layout.stripe_unit;
14667 }
14668
14669 uint64_t Client::ll_snap_seq(Inode *in)
14670 {
14671 std::scoped_lock lock(client_lock);
14672 return in->snaprealm->seq;
14673 }
14674
14675 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14676 {
14677 std::scoped_lock lock(client_lock);
14678 *layout = in->layout;
14679 return 0;
14680 }
14681
14682 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14683 {
14684 return ll_file_layout(fh->inode.get(), layout);
14685 }
14686
14687 /* Currently we cannot take advantage of redundancy in reads, since we
14688 would have to go through all possible placement groups (a
14689 potentially quite large number determined by a hash), and use CRUSH
14690 to calculate the appropriate set of OSDs for each placement group,
14691 then index into that. An array with one entry per OSD is much more
14692 tractable and works for demonstration purposes. */
14693
// Map a file block number to the primary OSD currently serving it:
// compute the RADOS object the block lands in from the striping layout,
// then ask the OSDMap for that object's acting primary.
// NOTE(review): divides by layout->stripe_unit below; a layout with
// stripe_unit == 0 would be undefined behavior — confirm layouts are
// validated before reaching this call (stripe_count *is* guarded).
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count; // which horizontal stripe (Y)
    stripepos = blockno % stripe_count; // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object; // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
    ceph_object_layout olayout =
      o.file_to_object_layout(oid, *layout);
    pg_t pg = (pg_t)olayout.ol_pgid;
    vector<int> osds;
    int primary;
    o.pg_to_acting_osds(pg, &osds, &primary);
    return primary;
  });
}
14724
14725 /* Return the offset of the block, internal to the object */
14726
// Map a file block number to its byte offset within the RADOS object it
// belongs to, based purely on the inode's striping layout.
// NOTE(review): divides by the layout's stripe_unit; stripe_unit == 0
// would be undefined behavior — confirm layouts are validated upstream.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::scoped_lock lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
14737
14738 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14739 const UserPerm& perms)
14740 {
14741 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14742 if (!mref_reader.is_state_satisfied())
14743 return -CEPHFS_ENOTCONN;
14744
14745 vinodeno_t vino = _get_vino(in);
14746
14747 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14748 tout(cct) << "ll_opendir" << std::endl;
14749 tout(cct) << vino.ino.val << std::endl;
14750
14751 std::scoped_lock lock(client_lock);
14752
14753 if (!fuse_default_permissions) {
14754 int r = may_open(in, flags, perms);
14755 if (r < 0)
14756 return r;
14757 }
14758
14759 int r = _opendir(in, dirpp, perms);
14760 tout(cct) << (uintptr_t)*dirpp << std::endl;
14761
14762 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14763 << dendl;
14764 return r;
14765 }
14766
14767 int Client::ll_releasedir(dir_result_t *dirp)
14768 {
14769 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14770 if (!mref_reader.is_state_satisfied())
14771 return -CEPHFS_ENOTCONN;
14772
14773 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14774 tout(cct) << "ll_releasedir" << std::endl;
14775 tout(cct) << (uintptr_t)dirp << std::endl;
14776
14777 std::scoped_lock lock(client_lock);
14778
14779 _closedir(dirp);
14780 return 0;
14781 }
14782
14783 int Client::ll_fsyncdir(dir_result_t *dirp)
14784 {
14785 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14786 if (!mref_reader.is_state_satisfied())
14787 return -CEPHFS_ENOTCONN;
14788
14789 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14790 tout(cct) << "ll_fsyncdir" << std::endl;
14791 tout(cct) << (uintptr_t)dirp << std::endl;
14792
14793 std::scoped_lock lock(client_lock);
14794 return _fsync(dirp->inode.get(), false);
14795 }
14796
14797 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14798 {
14799 ceph_assert(!(flags & O_CREAT));
14800
14801 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14802 if (!mref_reader.is_state_satisfied())
14803 return -CEPHFS_ENOTCONN;
14804
14805 vinodeno_t vino = _get_vino(in);
14806
14807 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14808 tout(cct) << "ll_open" << std::endl;
14809 tout(cct) << vino.ino.val << std::endl;
14810 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14811
14812 std::scoped_lock lock(client_lock);
14813
14814 int r;
14815 if (!fuse_default_permissions) {
14816 r = may_open(in, flags, perms);
14817 if (r < 0)
14818 goto out;
14819 }
14820
14821 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14822
14823 out:
14824 Fh *fhptr = fhp ? *fhp : NULL;
14825 if (fhptr) {
14826 ll_unclosed_fh_set.insert(fhptr);
14827 }
14828 tout(cct) << (uintptr_t)fhptr << std::endl;
14829 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14830 " = " << r << " (" << fhptr << ")" << dendl;
14831 return r;
14832 }
14833
// Shared implementation of ll_create/ll_createx: look the name up, create
// it if absent and O_CREAT is set, then open it.  On success *in holds the
// resulting inode and *fhp the open handle (opened here if _create did not
// supply one).  Returns 0 or a negative CEPHFS_* errno.  Caller must hold
// client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // existing name + O_CREAT|O_EXCL is an error by definition
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    // name is absent and the caller asked us to create it
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission and open it ourselves
    // if _create/_lookup did not leave us a handle
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // track every handle we hand out until the ll consumer closes it
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

#ifdef _WIN32
  uint64_t ino = 0;
#else
  ino_t ino = 0;
#endif
  if (r >= 0) {
    Inode *inode = in->get();
    // report the faked ino if that mode is enabled (e.g. 32-bit FUSE)
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14919
14920 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14921 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14922 const UserPerm& perms)
14923 {
14924 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14925 if (!mref_reader.is_state_satisfied())
14926 return -CEPHFS_ENOTCONN;
14927
14928 std::scoped_lock lock(client_lock);
14929 InodeRef in;
14930
14931 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14932 fhp, perms);
14933 if (r >= 0) {
14934 ceph_assert(in);
14935
14936 // passing an Inode in outp requires an additional ref
14937 if (outp) {
14938 _ll_get(in.get());
14939 *outp = in.get();
14940 }
14941 fill_stat(in, attr);
14942 } else {
14943 attr->st_ino = 0;
14944 }
14945
14946 return r;
14947 }
14948
14949 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14950 int oflags, Inode **outp, Fh **fhp,
14951 struct ceph_statx *stx, unsigned want, unsigned lflags,
14952 const UserPerm& perms)
14953 {
14954 unsigned caps = statx_to_mask(lflags, want);
14955 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14956 if (!mref_reader.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN;
14958
14959 std::scoped_lock lock(client_lock);
14960 InodeRef in;
14961
14962 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14963 if (r >= 0) {
14964 ceph_assert(in);
14965
14966 // passing an Inode in outp requires an additional ref
14967 if (outp) {
14968 _ll_get(in.get());
14969 *outp = in.get();
14970 }
14971 fill_statx(in, caps, stx);
14972 } else {
14973 stx->stx_ino = 0;
14974 stx->stx_mask = 0;
14975 }
14976
14977 return r;
14978 }
14979
14980 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14981 {
14982 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14983 if (!mref_reader.is_state_satisfied())
14984 return -CEPHFS_ENOTCONN;
14985
14986 tout(cct) << "ll_lseek" << std::endl;
14987 tout(cct) << offset << std::endl;
14988 tout(cct) << whence << std::endl;
14989
14990 std::scoped_lock lock(client_lock);
14991 return _lseek(fh, offset, whence);
14992 }
14993
14994 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14995 {
14996 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14997 if (!mref_reader.is_state_satisfied())
14998 return -CEPHFS_ENOTCONN;
14999
15000 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
15001 tout(cct) << "ll_read" << std::endl;
15002 tout(cct) << (uintptr_t)fh << std::endl;
15003 tout(cct) << off << std::endl;
15004 tout(cct) << len << std::endl;
15005
15006 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15007 len = std::min(len, (loff_t)INT_MAX);
15008 std::scoped_lock lock(client_lock);
15009
15010 int r = _read(fh, off, len, bl);
15011 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
15012 << dendl;
15013 return r;
15014 }
15015
15016 int Client::ll_read_block(Inode *in, uint64_t blockid,
15017 char *buf,
15018 uint64_t offset,
15019 uint64_t length,
15020 file_layout_t* layout)
15021 {
15022 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15023 if (!mref_reader.is_state_satisfied())
15024 return -CEPHFS_ENOTCONN;
15025
15026 vinodeno_t vino = _get_vino(in);
15027 object_t oid = file_object_t(vino.ino, blockid);
15028 C_SaferCond onfinish;
15029 bufferlist bl;
15030
15031 objecter->read(oid,
15032 object_locator_t(layout->pool_id),
15033 offset,
15034 length,
15035 vino.snapid,
15036 &bl,
15037 CEPH_OSD_FLAG_READ,
15038 &onfinish);
15039
15040 int r = onfinish.wait();
15041 if (r >= 0) {
15042 bl.begin().copy(bl.length(), buf);
15043 r = bl.length();
15044 }
15045
15046 return r;
15047 }
15048
15049 /* It appears that the OSD doesn't return success unless the entire
15050 buffer was written, return the write length on success. */
15051
15052 int Client::ll_write_block(Inode *in, uint64_t blockid,
15053 char* buf, uint64_t offset,
15054 uint64_t length, file_layout_t* layout,
15055 uint64_t snapseq, uint32_t sync)
15056 {
15057 vinodeno_t vino = ll_get_vino(in);
15058 int r = 0;
15059 std::unique_ptr<C_SaferCond> onsafe = nullptr;
15060
15061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15062 if (!mref_reader.is_state_satisfied())
15063 return -CEPHFS_ENOTCONN;
15064
15065 if (length == 0) {
15066 return -CEPHFS_EINVAL;
15067 }
15068 if (true || sync) {
15069 /* if write is stable, the epilogue is waiting on
15070 * flock */
15071 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
15072 }
15073 object_t oid = file_object_t(vino.ino, blockid);
15074 SnapContext fakesnap;
15075 ceph::bufferlist bl;
15076 if (length > 0) {
15077 bl.push_back(buffer::copy(buf, length));
15078 }
15079
15080 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
15081 << dendl;
15082
15083 fakesnap.seq = snapseq;
15084
15085 /* lock just in time */
15086 objecter->write(oid,
15087 object_locator_t(layout->pool_id),
15088 offset,
15089 length,
15090 fakesnap,
15091 bl,
15092 ceph::real_clock::now(),
15093 0,
15094 onsafe.get());
15095
15096 if (nullptr != onsafe) {
15097 r = onsafe->wait();
15098 }
15099
15100 if (r < 0) {
15101 return r;
15102 } else {
15103 return length;
15104 }
15105 }
15106
// Currently a no-op stub: the barrier/commit machinery below is compiled
// out, so the call always reports success.  The commented code is kept as
// a sketch of the intended implementation.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
15132
15133 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
15134 {
15135 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
15136 "~" << len << dendl;
15137 tout(cct) << "ll_write" << std::endl;
15138 tout(cct) << (uintptr_t)fh << std::endl;
15139 tout(cct) << off << std::endl;
15140 tout(cct) << len << std::endl;
15141
15142 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15143 if (!mref_reader.is_state_satisfied())
15144 return -CEPHFS_ENOTCONN;
15145
15146 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15147 len = std::min(len, (loff_t)INT_MAX);
15148 std::scoped_lock lock(client_lock);
15149
15150 int r = _write(fh, off, len, data, NULL, 0);
15151 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
15152 << dendl;
15153 return r;
15154 }
15155
15156 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
15157 {
15158 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15159 if (!mref_reader.is_state_satisfied())
15160 return -CEPHFS_ENOTCONN;
15161
15162 std::scoped_lock cl(client_lock);
15163 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
15164 }
15165
15166 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
15167 {
15168 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15169 if (!mref_reader.is_state_satisfied())
15170 return -CEPHFS_ENOTCONN;
15171
15172 std::scoped_lock cl(client_lock);
15173 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
15174 }
15175
15176 int Client::ll_flush(Fh *fh)
15177 {
15178 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15179 if (!mref_reader.is_state_satisfied())
15180 return -CEPHFS_ENOTCONN;
15181
15182 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
15183 tout(cct) << "ll_flush" << std::endl;
15184 tout(cct) << (uintptr_t)fh << std::endl;
15185
15186 std::scoped_lock lock(client_lock);
15187 return _flush(fh);
15188 }
15189
15190 int Client::ll_fsync(Fh *fh, bool syncdataonly)
15191 {
15192 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15193 if (!mref_reader.is_state_satisfied())
15194 return -CEPHFS_ENOTCONN;
15195
15196 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
15197 tout(cct) << "ll_fsync" << std::endl;
15198 tout(cct) << (uintptr_t)fh << std::endl;
15199
15200 std::scoped_lock lock(client_lock);
15201 int r = _fsync(fh, syncdataonly);
15202 if (r) {
15203 // If we're returning an error, clear it from the FH
15204 fh->take_async_err();
15205 }
15206 return r;
15207 }
15208
15209 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
15210 {
15211 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15212 if (!mref_reader.is_state_satisfied())
15213 return -CEPHFS_ENOTCONN;
15214
15215 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
15216 tout(cct) << "ll_sync_inode" << std::endl;
15217 tout(cct) << (uintptr_t)in << std::endl;
15218
15219 std::scoped_lock lock(client_lock);
15220 return _fsync(in, syncdataonly);
15221 }
15222
15223 int Client::clear_suid_sgid(Inode *in, const UserPerm& perms, bool defer)
15224 {
15225 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " defer "
15226 << defer << dendl;
15227
15228 if (!in->is_file()) {
15229 return 0;
15230 }
15231
15232 if (likely(!(in->mode & (S_ISUID|S_ISGID)))) {
15233 return 0;
15234 }
15235
15236 if (perms.uid() == 0 || perms.uid() == in->uid) {
15237 return 0;
15238 }
15239
15240 int mask = 0;
15241
15242 // always drop the suid
15243 if (unlikely(in->mode & S_ISUID)) {
15244 mask = CEPH_SETATTR_KILL_SUID;
15245 }
15246
15247 // remove the sgid if S_IXUGO is set or the inode is
15248 // is not in the caller's group list.
15249 if ((in->mode & S_ISGID) &&
15250 ((in->mode & S_IXUGO) || !perms.gid_in_groups(in->gid))) {
15251 mask |= CEPH_SETATTR_KILL_SGID;
15252 }
15253
15254 ldout(cct, 20) << __func__ << " mask " << mask << dendl;
15255 if (defer) {
15256 return mask;
15257 }
15258
15259 struct ceph_statx stx = { 0 };
15260 return __setattrx(in, &stx, mask, perms);
15261 }
15262
// Preallocate space or punch a hole in the file backing fh (fallocate(2)
// semantics; only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are
// supported).  Returns 0 or a negative CEPHFS_* errno.  Caller must hold
// client_lock; it is dropped while waiting on OSD operations.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  // only KEEP_SIZE and PUNCH_HOLE are understood ...
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  // ... and PUNCH_HOLE is only accepted combined with KEEP_SIZE
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // refuse to add data to a full pool; punching a hole frees space so it
  // is still allowed
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  // the handle must have been opened for writing
  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // growing the file must respect byte quotas
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  // take a FILE_WR cap ref; released at the bottom of this function
  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // a write by a non-owner clears setuid/setgid
  r = clear_suid_sgid(in, fh->actor_perms);
  if (r < 0) {
    put_cap_ref(in, CEPH_CAP_FILE_WR);
    return r;
  }

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // data is inline and we hold the buffer cap: rewrite the inline
      // blob locally with the punched range zero-filled
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);   // bytes before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;            // clamp hole to the inline length
        if (size > 0)
          bl.append_zero(size);           // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // bytes after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // data lives in RADOS (after uninlining, if needed): zero the range
      // through the Filer and wait for the OSDs to acknowledge
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop client_lock while the OSD zero completes
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocation past EOF just extends the file size locally
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // wait for the uninline started above (lock dropped meanwhile)
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
15390
15391 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
15392 {
15393 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15394 if (!mref_reader.is_state_satisfied())
15395 return -CEPHFS_ENOTCONN;
15396
15397 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
15398 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
15399 tout(cct) << (uintptr_t)fh << std::endl;
15400
15401 std::scoped_lock lock(client_lock);
15402 return _fallocate(fh, mode, offset, length);
15403 }
15404
15405 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
15406 {
15407 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15408 if (!mref_reader.is_state_satisfied())
15409 return -CEPHFS_ENOTCONN;
15410
15411 tout(cct) << __func__ << " " << fd << mode << " " << offset << " " << length << std::endl;
15412
15413 std::scoped_lock lock(client_lock);
15414 Fh *fh = get_filehandle(fd);
15415 if (!fh)
15416 return -CEPHFS_EBADF;
15417 #if defined(__linux__) && defined(O_PATH)
15418 if (fh->flags & O_PATH)
15419 return -CEPHFS_EBADF;
15420 #endif
15421 return _fallocate(fh, mode, offset, length);
15422 }
15423
15424 int Client::ll_release(Fh *fh)
15425 {
15426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15427 if (!mref_reader.is_state_satisfied())
15428 return -CEPHFS_ENOTCONN;
15429
15430 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
15431 dendl;
15432 tout(cct) << __func__ << " (fh)" << std::endl;
15433 tout(cct) << (uintptr_t)fh << std::endl;
15434
15435 std::scoped_lock lock(client_lock);
15436
15437 if (ll_unclosed_fh_set.count(fh))
15438 ll_unclosed_fh_set.erase(fh);
15439 return _release_fh(fh);
15440 }
15441
15442 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
15443 {
15444 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15445 if (!mref_reader.is_state_satisfied())
15446 return -CEPHFS_ENOTCONN;
15447
15448 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
15449 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
15450
15451 std::scoped_lock lock(client_lock);
15452 return _getlk(fh, fl, owner);
15453 }
15454
15455 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
15456 {
15457 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15458 if (!mref_reader.is_state_satisfied())
15459 return -CEPHFS_ENOTCONN;
15460
15461 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
15462 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
15463
15464 std::scoped_lock lock(client_lock);
15465 return _setlk(fh, fl, owner, sleep);
15466 }
15467
15468 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
15469 {
15470 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15471 if (!mref_reader.is_state_satisfied())
15472 return -CEPHFS_ENOTCONN;
15473
15474 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
15475 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
15476
15477 std::scoped_lock lock(client_lock);
15478 return _flock(fh, cmd, owner);
15479 }
15480
15481 int Client::set_deleg_timeout(uint32_t timeout)
15482 {
15483 std::scoped_lock lock(client_lock);
15484
15485 /*
15486 * The whole point is to prevent blocklisting so we must time out the
15487 * delegation before the session autoclose timeout kicks in.
15488 */
15489 if (timeout >= mdsmap->get_session_autoclose())
15490 return -CEPHFS_EINVAL;
15491
15492 deleg_timeout = timeout;
15493 return 0;
15494 }
15495
15496 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
15497 {
15498 int ret = -CEPHFS_EINVAL;
15499
15500 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15501 if (!mref_reader.is_state_satisfied())
15502 return -CEPHFS_ENOTCONN;
15503
15504 std::scoped_lock lock(client_lock);
15505
15506 Inode *inode = fh->inode.get();
15507
15508 switch(cmd) {
15509 case CEPH_DELEGATION_NONE:
15510 inode->unset_deleg(fh);
15511 ret = 0;
15512 break;
15513 default:
15514 try {
15515 ret = inode->set_deleg(fh, cmd, cb, priv);
15516 } catch (std::bad_alloc&) {
15517 ret = -CEPHFS_ENOMEM;
15518 }
15519 break;
15520 }
15521 return ret;
15522 }
15523
/**
 * Completion context queued on the interrupt finisher by Client::ll_interrupt()
 * to abort an in-flight SETFILELOCK MDS request.
 *
 * The request is reference-counted: the constructor takes a ref so the
 * MetaRequest stays alive until finish() runs on the finisher thread,
 * where the ref is dropped via put_request().
 */
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // hold a ref until finish() releases it
  }
  void finish(int r) override {
    std::scoped_lock l(client->client_lock);
    // only filelock requests are interruptible through this path
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
15539
15540 void Client::ll_interrupt(void *d)
15541 {
15542 MetaRequest *req = static_cast<MetaRequest*>(d);
15543 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15544 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
15545 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15546 }
15547
15548 // =========================================
15549 // layout
15550
15551 // expose file layouts
15552
15553 int Client::describe_layout(const char *relpath, file_layout_t *lp,
15554 const UserPerm& perms)
15555 {
15556 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15557 if (!mref_reader.is_state_satisfied())
15558 return -CEPHFS_ENOTCONN;
15559
15560 std::scoped_lock lock(client_lock);
15561
15562 filepath path(relpath);
15563 InodeRef in;
15564 int r = path_walk(path, &in, perms);
15565 if (r < 0)
15566 return r;
15567
15568 *lp = in->layout;
15569
15570 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
15571 return 0;
15572 }
15573
15574 int Client::fdescribe_layout(int fd, file_layout_t *lp)
15575 {
15576 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15577 if (!mref_reader.is_state_satisfied())
15578 return -CEPHFS_ENOTCONN;
15579
15580 std::scoped_lock lock(client_lock);
15581
15582 Fh *f = get_filehandle(fd);
15583 if (!f)
15584 return -CEPHFS_EBADF;
15585 Inode *in = f->inode.get();
15586
15587 *lp = in->layout;
15588
15589 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
15590 return 0;
15591 }
15592
15593 int64_t Client::get_default_pool_id()
15594 {
15595 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15596 if (!mref_reader.is_state_satisfied())
15597 return -CEPHFS_ENOTCONN;
15598
15599 std::scoped_lock lock(client_lock);
15600
15601 /* first data pool is the default */
15602 return mdsmap->get_first_data_pool();
15603 }
15604
15605 // expose osdmap
15606
15607 int64_t Client::get_pool_id(const char *pool_name)
15608 {
15609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15610 if (!mref_reader.is_state_satisfied())
15611 return -CEPHFS_ENOTCONN;
15612
15613 std::scoped_lock lock(client_lock);
15614
15615 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15616 pool_name);
15617 }
15618
15619 string Client::get_pool_name(int64_t pool)
15620 {
15621 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15622 if (!mref_reader.is_state_satisfied())
15623 return string();
15624
15625 std::scoped_lock lock(client_lock);
15626
15627 return objecter->with_osdmap([pool](const OSDMap& o) {
15628 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15629 });
15630 }
15631
15632 int Client::get_pool_replication(int64_t pool)
15633 {
15634 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15635 if (!mref_reader.is_state_satisfied())
15636 return -CEPHFS_ENOTCONN;
15637
15638 std::scoped_lock lock(client_lock);
15639
15640 return objecter->with_osdmap([pool](const OSDMap& o) {
15641 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
15642 });
15643 }
15644
/**
 * Find the acting OSD set for the object backing a file offset.
 *
 * @param fd   open cephfs file descriptor
 * @param off  file offset of interest
 * @param len  out (optional): bytes remaining in the stripe unit at off
 * @param osds out: acting OSDs for the PG holding that object
 * @return 0 on success, -CEPHFS_EBADF for an unknown fd, -CEPHFS_EINVAL if
 *         no acting OSDs were found, -CEPHFS_ENOTCONN when not mounted
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte span at 'off' to its single backing object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // Ask the osdmap which OSDs currently serve that object's PG.
  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -CEPHFS_EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
15691
15692 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15693 {
15694 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15695 if (!mref_reader.is_state_satisfied())
15696 return -CEPHFS_ENOTCONN;
15697
15698 std::scoped_lock lock(client_lock);
15699
15700 if (id < 0)
15701 return -CEPHFS_EINVAL;
15702 return objecter->with_osdmap([&](const OSDMap& o) {
15703 return o.crush->get_full_location_ordered(id, path);
15704 });
15705 }
15706
/**
 * Find the network addresses of the OSDs serving the object at a file offset.
 *
 * @param fd      open cephfs file descriptor
 * @param offset  file offset of interest
 * @param address out: one address per acting OSD (front address of each)
 * @return 0 on success, -CEPHFS_EBADF for an unknown fd, -CEPHFS_EINVAL if
 *         the acting set is empty, -CEPHFS_ENOTCONN when not mounted
 */
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -CEPHFS_EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
15741
15742 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15743 {
15744 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15745 if (!mref_reader.is_state_satisfied())
15746 return -CEPHFS_ENOTCONN;
15747
15748 std::scoped_lock lock(client_lock);
15749
15750 return objecter->with_osdmap([&](const OSDMap& o) {
15751 if (!o.exists(osd))
15752 return -CEPHFS_ENOENT;
15753
15754 addr = o.get_addrs(osd).front();
15755 return 0;
15756 });
15757 }
15758
15759 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15760 loff_t length, loff_t offset)
15761 {
15762 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15763 if (!mref_reader.is_state_satisfied())
15764 return -CEPHFS_ENOTCONN;
15765
15766 std::scoped_lock lock(client_lock);
15767
15768 Fh *f = get_filehandle(fd);
15769 if (!f)
15770 return -CEPHFS_EBADF;
15771 Inode *in = f->inode.get();
15772
15773 // map to a list of extents
15774 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15775
15776 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15777 return 0;
15778 }
15779
15780
15781 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
int Client::get_local_osd()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  // The lookup is cached per osdmap epoch: only re-scan the map for an OSD
  // on our own IP when the epoch has changed since the last call.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
15798
15799
15800
15801
15802
15803
15804 // ===============================
15805
// Messenger callback: a connection was established. Log-only; no state change.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
15810
// Messenger callback: our side of the connection was reset. Returning false
// tells the messenger we did not handle/claim the reset.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15816
/**
 * Messenger callback: the remote peer reset the connection.
 *
 * For MDS peers, find the matching session by address and react according to
 * its state: a CLOSING session is treated as closed, an OPENING session is
 * retried (preserving its open-waiters), and an OPEN session is either closed
 * for reconnect or marked STALE depending on client_reconnect_stale.
 */
void Client::ms_handle_remote_reset(Connection *con)
{
  std::scoped_lock lock(client_lock);
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSessionRef s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = p.second;
	}
      }
      if (mds >= 0) {
	ceph_assert(s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s.get());
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // carry the open-waiters over to the retried session so nobody
	    // blocks forever on the dead one
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s.get());
	    auto news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blocklisted */
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s.get());
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
15875
// Messenger callback: the peer refused the connection. Returning false tells
// the messenger we did not handle/claim the event.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15881
/**
 * Find the nearest ancestor inode (via the snaprealm chain) that has a quota
 * of the given type enabled.
 *
 * Walks up the inode's snaprealm parents; the first realm inode (other than
 * the inode itself) with the quota type enabled wins. Falls back to
 * root_ancestor if none is found, and returns NULL when quotas are disabled
 * by config.  perms is currently unused in this implementation.
 */
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms, quota_max_t type)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;

  if (!cct->_conf.get_val<bool>("client_quota"))
    return NULL;

  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      // realm inode must be in cache (head version) for us to inspect quota
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enabled(type)) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
15907
15908 /**
15909 * Traverse quota ancestors of the Inode, return true
15910 * if any of them passes the passed function
15911 */
15912 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15913 std::function<bool (const Inode &in)> test)
15914 {
15915 if (!cct->_conf.get_val<bool>("client_quota"))
15916 return false;
15917
15918 while (true) {
15919 ceph_assert(in != NULL);
15920 if (test(*in)) {
15921 return true;
15922 }
15923
15924 if (in == root_ancestor) {
15925 // We're done traversing, drop out
15926 return false;
15927 } else {
15928 // Continue up the tree
15929 in = get_quota_root(in, perms);
15930 }
15931 }
15932
15933 return false;
15934 }
15935
15936 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15937 {
15938 return check_quota_condition(in, perms,
15939 [](const Inode &in) {
15940 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15941 });
15942 }
15943
15944 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15945 const UserPerm& perms)
15946 {
15947 return check_quota_condition(in, perms,
15948 [&new_bytes](const Inode &in) {
15949 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15950 > in.quota.max_bytes;
15951 });
15952 }
15953
15954 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15955 {
15956 ceph_assert(in->size >= in->reported_size);
15957 const uint64_t size = in->size - in->reported_size;
15958 return check_quota_condition(in, perms,
15959 [&size](const Inode &in) {
15960 if (in.quota.max_bytes) {
15961 if (in.rstat.rbytes >= in.quota.max_bytes) {
15962 return true;
15963 }
15964
15965 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15966 return (space >> 4) < size;
15967 } else {
15968 return false;
15969 }
15970 });
15971 }
15972
// OR-able flags cached per (pool id, namespace) by check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers wait on it
  POOL_READ = 4,      // probe showed the client can read from the pool
  POOL_WRITE = 8,     // probe showed the client can write to the pool
};
15979
/**
 * Verify that this client has the OSD capabilities needed to access the data
 * pool of a regular file, probing the pool with a read and a write op on the
 * file's first object and caching the result per (pool, namespace).
 *
 * Must be called with client_lock held; the lock is dropped while waiting for
 * the probe ops to complete. Concurrent callers for the same pool wait on
 * waiting_for_pool_perm instead of issuing duplicate probes.
 *
 * @param in   inode whose layout selects the pool/namespace to check
 * @param need CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if permitted (or checking disabled / not a regular file /
 *         snapshot inode), -CEPHFS_EPERM if the needed access is denied,
 *         -CEPHFS_EIO if the probe failed with an unexpected error
 */
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is probing this pool, wait for it.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the probe so concurrent callers block instead of duplicating it.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read access with a stat on the first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create (EEXIST still means we
    // were allowed to write).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop client_lock while the probe ops are in flight.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Cache the result and wake any waiters blocked on our probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
16088
16089 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
16090 {
16091 if (acl_type == POSIX_ACL) {
16092 if (in->xattrs.count(ACL_EA_ACCESS)) {
16093 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
16094
16095 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
16096 }
16097 }
16098 return -CEPHFS_EAGAIN;
16099 }
16100
/**
 * Keep the inode's POSIX access ACL consistent with a chmod: rewrite the
 * cached ACL_EA_ACCESS xattr so its group/mask entries reflect the new mode.
 *
 * @param in    inode being chmod'ed
 * @param mode  new mode bits
 * @param perms caller credentials for the getattr/setxattr
 * @return 0 on success or when ACLs are disabled / no access ACL exists;
 *         negative error from getattr, ACL rewrite, or setxattr otherwise
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Make sure the xattr cache is populated before consulting it.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy; posix_acl_access_chmod edits it in place
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
16126
/**
 * Compute the POSIX ACL xattrs a new inode should be created with, based on
 * the parent directory's default ACL, and adjust *mode accordingly.
 *
 * If the parent has a default ACL, it is inherited (possibly producing an
 * access ACL and, for directories, a copied default ACL) and the resulting
 * xattr map is encoded into xattrs_bl. If there is no default ACL, the
 * umask callback (if registered) is applied to *mode instead.
 *
 * @param dir       parent directory
 * @param mode      in/out: requested mode; updated by ACL inheritance/umask
 * @param xattrs_bl out: encoded xattr map to attach to the create request
 * @param perms     caller credentials for the getattr
 * @return number of xattrs encoded (>= 0) on success, negative error code
 *         from getattr or the ACL helpers on failure; 0 and no-op for
 *         symlinks or when ACLs are disabled
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Make sure the parent's xattr cache is populated before consulting it.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      // private copy; posix_acl_inherit_mode edits the ACL and *mode in place
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0: the inherited ACL is non-trivial
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;  // not representable by mode bits alone
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: fall back to the registered umask callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
16174
// Enable a global objecter op flag. Only CEPH_OSD_FLAG_LOCALIZE_READS
// (or 0, a no-op) is accepted.
void Client::set_filer_flags(int flags)
{
  std::scoped_lock l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
16182
// Disable a global objecter op flag previously set by set_filer_flags().
// Only CEPH_OSD_FLAG_LOCALIZE_READS is accepted.
void Client::clear_filer_flags(int flags)
{
  std::scoped_lock l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
16189
16190 // called before mount
/**
 * Set the client's session uuid metadata (must be called before mount).
 *
 * Note: this closes any existing MDS sessions via _close_sessions(),
 * presumably so the new uuid takes effect when sessions are re-opened —
 * callers should be aware of that side effect.
 */
void Client::set_uuid(const std::string& uuid)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);
  ceph_assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}
16202
16203 // called before mount. 0 means infinite
/**
 * Record the desired session timeout in the session metadata sent to the MDS.
 * Must be called before mount; a value of 0 means infinite.
 */
void Client::set_session_timeout(unsigned timeout)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);

  metadata["timeout"] = stringify(timeout);
}
16213
16214 // called before mount
/**
 * Reclaim the MDS session state of a dead client instance identified by uuid
 * (must be called before mount).
 *
 * Sends MClientReclaim to every in/up MDS, waiting for sessions to open as
 * needed, then verifies via the OSD map that the old instance was blocklisted
 * (i.e. really killed) before committing the uuid for reuse.
 *
 * @param uuid    session uuid of the instance to reclaim; must differ from ours
 * @param flags   reclaim flags; CEPH_RECLAIM_RESET skips the blocklist check
 * @param fs_name filesystem to subscribe to
 * @return 0 on success; -CEPHFS_EINVAL / -CEPHFS_EPERM / -CEPHFS_EOPNOTSUPP /
 *         -CEPHFS_ENOENT / -CEPHFS_ENOTRECOVERABLE (or the MDS-reported
 *         reclaim error) on failure
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // refusing to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a real mdsmap before we can iterate ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // walk every in-mds rank; only advance 'mds' once its reclaim succeeded
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // (re)send the reclaim request and wait for the reply to flip the state
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
16314
16315 void Client::finish_reclaim()
16316 {
16317 auto it = metadata.find("reclaiming_uuid");
16318 if (it == metadata.end()) {
16319 for (auto &p : mds_sessions)
16320 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
16321 return;
16322 }
16323
16324 for (auto &p : mds_sessions) {
16325 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
16326 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
16327 p.second->con->send_message2(std::move(m));
16328 }
16329
16330 metadata["uuid"] = it->second;
16331 metadata.erase(it);
16332 }
16333
/**
 * Handle an MClientReclaimReply from an MDS during start_reclaim().
 *
 * On success, records the session's reclaim state as OK and accumulates the
 * highest OSD epoch and the target client's addresses reported by any MDS;
 * on failure, records RECLAIM_FAIL and the error. Either way, wakes the
 * start_reclaim() waiter.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // keep the max epoch across all MDS replies for the blocklist check
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
16359
16360 /**
16361 * This is included in cap release messages, to cause
16362 * the MDS to wait until this OSD map epoch. It is necessary
16363 * in corner cases where we cancel RADOS ops, so that
16364 * nobody else tries to do IO to the same objects in
16365 * the same epoch as the cancelled ops.
16366 */
// Record the OSD epoch barrier to attach to future cap release messages
// (see the comment block above for why the MDS needs it).
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
16372
16373 const char** Client::get_tracked_conf_keys() const
16374 {
16375 static const char* keys[] = {
16376 "client_cache_size",
16377 "client_cache_mid",
16378 "client_acl_type",
16379 "client_deleg_timeout",
16380 "client_deleg_break_on_open",
16381 "client_oc_size",
16382 "client_oc_max_objects",
16383 "client_oc_max_dirty",
16384 "client_oc_target_dirty",
16385 "client_oc_max_dirty_age",
16386 "client_caps_release_delay",
16387 "client_mount_timeout",
16388 NULL
16389 };
16390 return keys;
16391 }
16392
/**
 * Config-observer interface: react to runtime changes of tracked config keys
 * (see get_tracked_conf_keys()), pushing new values into the LRU, ACL mode,
 * object cacher limits, and cached timeout members.
 */
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::scoped_lock lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // anything other than "posix_acl" disables ACL handling
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  if (changed.count("client_oc_size")) {
    objectcacher->set_max_size(cct->_conf->client_oc_size);
  }
  if (changed.count("client_oc_max_objects")) {
    objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
  }
  if (changed.count("client_oc_max_dirty")) {
    objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
  }
  if (changed.count("client_oc_target_dirty")) {
    objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
  }
  if (changed.count("client_oc_max_dirty_age")) {
    objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
  }
  if (changed.count("client_collect_and_send_global_metrics")) {
    _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
      "client_collect_and_send_global_metrics");
  }
  if (changed.count("client_caps_release_delay")) {
    caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
      "client_caps_release_delay");
  }
  if (changed.count("client_mount_timeout")) {
    mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
      "client_mount_timeout");
  }
}
16434
// boost::intrusive_ptr hook: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
16439
// boost::intrusive_ptr hook: drop a reference on an Inode via its owning
// Client, which handles cleanup when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
16444
16445 mds_rank_t Client::_get_random_up_mds() const
16446 {
16447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
16448
16449 std::set<mds_rank_t> up;
16450 mdsmap->get_up_mds_set(up);
16451
16452 if (up.empty())
16453 return MDS_RANK_NONE;
16454 std::set<mds_rank_t>::const_iterator p = up.begin();
16455 for (int n = rand() % up.size(); n; n--)
16456 ++p;
16457 return *p;
16458 }
16459
16460
// Standalone variant of Client that owns its own Objecter (the base Client
// is handed one). Wires the monclient to the messenger and marks this as
// incarnation 0.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
16468
// Destroy the Objecter we created in the constructor (the base Client does
// not own it).
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
16474
/**
 * Initialize the standalone client: start the objecter, register dispatchers,
 * and bring up the monclient.
 *
 * On monclient failure, carefully unwinds the partial initialization (timer,
 * objecter, objectcacher, monclient) before returning the error.
 *
 * @return 0 on success, negative error from MonClient::init() on failure
 */
int StandaloneClient::init()
{
  // we must be the first (and only) writer of the initialize state
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
16511
// Shut down in reverse order of init(): base Client first, then the
// objecter we own, then the monclient.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}