]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import ceph 15.2.13
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
77
78 #define dout_subsys ceph_subsys_client
79
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
83
84 #include "Client.h"
85 #include "Inode.h"
86 #include "Dentry.h"
87 #include "Delegation.h"
88 #include "Dir.h"
89 #include "ClientSnapRealm.h"
90 #include "Fh.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
95
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
98
99 #include "include/cephfs/ceph_ll_client.h"
100
101 #if HAVE_GETGROUPLIST
102 #include <grp.h>
103 #include <pwd.h>
104 #include <unistd.h>
105 #endif
106
107 #undef dout_prefix
108 #define dout_prefix *_dout << "client." << whoami << " "
109
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112 // FreeBSD fails to define this
113 #ifndef O_DSYNC
114 #define O_DSYNC 0x0
115 #endif
116 // Darwin fails to define this
117 #ifndef O_RSYNC
118 #define O_RSYNC 0x0
119 #endif
120
121 #ifndef O_DIRECT
122 #define O_DIRECT 0x0
123 #endif
124
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127 using namespace TOPNSPC::common;
128
129 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
130 {
131 Client *client = static_cast<Client*>(p);
132 client->flush_set_callback(oset);
133 }
134
135
136 // -------------
137
138 Client::CommandHook::CommandHook(Client *client) :
139 m_client(client)
140 {
141 }
142
143 int Client::CommandHook::call(
144 std::string_view command,
145 const cmdmap_t& cmdmap,
146 Formatter *f,
147 std::ostream& errss,
148 bufferlist& out)
149 {
150 f->open_object_section("result");
151 {
152 std::lock_guard l{m_client->client_lock};
153 if (command == "mds_requests")
154 m_client->dump_mds_requests(f);
155 else if (command == "mds_sessions") {
156 bool cap_dump = false;
157 cmd_getval(cmdmap, "cap_dump", cap_dump);
158 m_client->dump_mds_sessions(f, cap_dump);
159 } else if (command == "dump_cache")
160 m_client->dump_cache(f);
161 else if (command == "kick_stale_sessions")
162 m_client->_kick_stale_sessions();
163 else if (command == "status")
164 m_client->dump_status(f);
165 else
166 ceph_abort_msg("bad command registered");
167 }
168 f->close_section();
169 return 0;
170 }
171
172
173 // -------------
174
175 dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
176 : inode(in), offset(0), next_offset(2),
177 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
178 perms(perms)
179 { }
180
181 void Client::_reset_faked_inos()
182 {
183 ino_t start = 1024;
184 free_faked_inos.clear();
185 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
186 last_used_faked_ino = 0;
187 last_used_faked_root = 0;
188 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
189 }
190
// Allocate the next free faked inode number for 'in'.
// The window [1024, 2048) is reserved for _assign_faked_root(); ordinary
// allocation scans upward from the last number handed out and wraps back
// to 2048 when the top of the free set is reached.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran off the top of the free set: wrap around, skipping the
    // reserved root window
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the free set can never be fully exhausted here
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // jump forward to the start of the next free extent
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the current free extent: take the next number
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
212
213 /*
214 * In the faked mode, if you export multiple subdirectories,
215 * you will see that the inode numbers of the exported subdirectories
216 * are the same. so we distinguish the mount point by reserving
217 * the "fake ids" between "1024~2048" and combining the last
218 * 10bits(0x3ff) of the "root inodes".
219 */
220 void Client::_assign_faked_root(Inode *in)
221 {
222 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
223 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
224 last_used_faked_root = 0;
225 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
226 }
227 assert(it != free_faked_inos.end());
228 vinodeno_t inode_info = in->vino();
229 uint64_t inode_num = (uint64_t)inode_info.ino;
230 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
231 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
232 assert(it.get_start() + it.get_len() > last_used_faked_root);
233
234 in->faked_ino = last_used_faked_root;
235 free_faked_inos.erase(in->faked_ino);
236 faked_ino_map[in->faked_ino] = in->vino();
237 }
238
239 void Client::_release_faked_ino(Inode *in)
240 {
241 free_faked_inos.insert(in->faked_ino);
242 faked_ino_map.erase(in->faked_ino);
243 }
244
245 vinodeno_t Client::_map_faked_ino(ino_t ino)
246 {
247 vinodeno_t vino;
248 if (ino == 1)
249 vino = root->vino();
250 else if (faked_ino_map.count(ino))
251 vino = faked_ino_map[ino];
252 else
253 vino = vinodeno_t(0, CEPH_NOSNAP);
254 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
255 return vino;
256 }
257
258 vinodeno_t Client::map_faked_ino(ino_t ino)
259 {
260 std::lock_guard lock(client_lock);
261 return _map_faked_ino(ino);
262 }
263
264 // cons/des
265
// Construct a client bound to the given messenger / monitor client /
// objecter. Only state wiring happens here; thread-starting work
// (timer, finishers, object cacher) is deferred to _pre_init()/init().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  // fds below 10 are left out of the free set — presumably to keep clear
  // of the stdio range; confirm against allocation sites.
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  // ObjectCacher shares client_lock so inode destruction can call into
  // it while the lock is held (see ~Client()).
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback, // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
312
313
// Destructor: must not be entered with client_lock held (we take it
// ourselves below), and shutdown() should already have been called.
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}
324
325 void Client::tear_down_cache()
326 {
327 // fd's
328 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
329 it != fd_map.end();
330 ++it) {
331 Fh *fh = it->second;
332 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
333 _release_fh(fh);
334 }
335 fd_map.clear();
336
337 while (!opened_dirs.empty()) {
338 dir_result_t *dirp = *opened_dirs.begin();
339 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
340 _closedir(dirp);
341 }
342
343 // caps!
344 // *** FIXME ***
345
346 // empty lru
347 trim_cache();
348 ceph_assert(lru.lru_get_size() == 0);
349
350 // close root ino
351 ceph_assert(inode_map.size() <= 1 + root_parents.size());
352 if (root && inode_map.size() == 1 + root_parents.size()) {
353 delete root;
354 root = 0;
355 root_ancestor = 0;
356 while (!root_parents.empty())
357 root_parents.erase(root_parents.begin());
358 inode_map.clear();
359 _reset_faked_inos();
360 }
361
362 ceph_assert(inode_map.empty());
363 }
364
365 inodeno_t Client::get_root_ino()
366 {
367 std::lock_guard l(client_lock);
368 if (use_faked_inos())
369 return root->faked_ino;
370 else
371 return root->ino;
372 }
373
374 Inode *Client::get_root()
375 {
376 std::lock_guard l(client_lock);
377 root->ll_get();
378 return root;
379 }
380
381
382 // debug crapola
383
384 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
385 {
386 filepath path;
387 in->make_long_path(path);
388 ldout(cct, 1) << "dump_inode: "
389 << (disconnected ? "DISCONNECTED ":"")
390 << "inode " << in->ino
391 << " " << path
392 << " ref " << in->get_num_ref()
393 << *in << dendl;
394
395 if (f) {
396 f->open_object_section("inode");
397 f->dump_stream("path") << path;
398 if (disconnected)
399 f->dump_int("disconnected", 1);
400 in->dump(f);
401 f->close_section();
402 }
403
404 did.insert(in);
405 if (in->dir) {
406 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
407 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
408 it != in->dir->dentries.end();
409 ++it) {
410 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
411 if (f) {
412 f->open_object_section("dentry");
413 it->second->dump(f);
414 f->close_section();
415 }
416 if (it->second->inode)
417 dump_inode(f, it->second->inode.get(), did, false);
418 }
419 }
420 }
421
422 void Client::dump_cache(Formatter *f)
423 {
424 set<Inode*> did;
425
426 ldout(cct, 1) << __func__ << dendl;
427
428 if (f)
429 f->open_array_section("cache");
430
431 if (root)
432 dump_inode(f, root, did, true);
433
434 // make a second pass to catch anything disconnected
435 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
436 it != inode_map.end();
437 ++it) {
438 if (did.count(it->second))
439 continue;
440 dump_inode(f, it->second, did, true);
441 }
442
443 if (f)
444 f->close_section();
445 }
446
// Dump overall client status (session metadata, cache counters, map
// epochs) to the formatter. Caller must already hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
478
// Start the background machinery that must exist before any dispatch:
// the tick timer, the objecter finisher (and the Filer built on top of
// it), blacklist event tracking, and the object cacher threads.
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}
489
490 int Client::init()
491 {
492 _pre_init();
493 {
494 std::lock_guard l{client_lock};
495 ceph_assert(!initialized);
496 messenger->add_dispatcher_tail(this);
497 }
498 _finish_init();
499 return 0;
500 }
501
502 void Client::_finish_init()
503 {
504 {
505 std::lock_guard l{client_lock};
506 // logger
507 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
508 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
509 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
510 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
511 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
512 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
513 logger.reset(plb.create_perf_counters());
514 cct->get_perfcounters_collection()->add(logger.get());
515 }
516
517 cct->_conf.add_observer(this);
518
519 AdminSocket* admin_socket = cct->get_admin_socket();
520 int ret = admin_socket->register_command("mds_requests",
521 &m_command_hook,
522 "show in-progress mds requests");
523 if (ret < 0) {
524 lderr(cct) << "error registering admin socket command: "
525 << cpp_strerror(-ret) << dendl;
526 }
527 ret = admin_socket->register_command("mds_sessions "
528 "name=cap_dump,type=CephBool,req=false",
529 &m_command_hook,
530 "show mds session state");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("dump_cache",
536 &m_command_hook,
537 "show in-memory metadata cache contents");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542 ret = admin_socket->register_command("kick_stale_sessions",
543 &m_command_hook,
544 "kick sessions that were remote reset");
545 if (ret < 0) {
546 lderr(cct) << "error registering admin socket command: "
547 << cpp_strerror(-ret) << dendl;
548 }
549 ret = admin_socket->register_command("status",
550 &m_command_hook,
551 "show overall client status");
552 if (ret < 0) {
553 lderr(cct) << "error registering admin socket command: "
554 << cpp_strerror(-ret) << dendl;
555 }
556
557 std::lock_guard l{client_lock};
558 initialized = true;
559 }
560
// Orderly teardown, mirroring init()/_pre_init(): close MDS sessions,
// detach config/admin-socket hooks, drain and stop each callback
// finisher that was started, stop the object cacher and timer, drain
// the objecter finisher, and finally unregister the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher below is only running if its callback was registered;
  // wait_for_empty() drains queued work before stop().
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
620
621
622 // ===================
623 // metadata cache stuff
624
// Trim the dentry LRU down to client_cache_size (all the way to zero
// while unmounting), optionally invalidating the kernel dcache, then
// drop the root inode if nothing references it anymore.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Iterate to a fixed point: stop when a pass no longer shrinks the LRU
  // (trim_dentry() can unlink additional entries as a side effect).
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  // Root (plus any root_parents) is the last thing in the inode map;
  // once it is unreferenced we can drop the whole map and reset faked inos.
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
658
659 void Client::trim_cache_for_reconnect(MetaSession *s)
660 {
661 mds_rank_t mds = s->mds_num;
662 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
663
664 int trimmed = 0;
665 list<Dentry*> skipped;
666 while (lru.lru_get_size() > 0) {
667 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
668 if (!dn)
669 break;
670
671 if ((dn->inode && dn->inode->caps.count(mds)) ||
672 dn->dir->parent_inode->caps.count(mds)) {
673 trim_dentry(dn);
674 trimmed++;
675 } else
676 skipped.push_back(dn);
677 }
678
679 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
680 lru.lru_insert_mid(*p);
681
682 ldout(cct, 20) << __func__ << " mds." << mds
683 << " trimmed " << trimmed << " dentries" << dendl;
684
685 if (s->caps.size() > 0)
686 _invalidate_kernel_dcache();
687 }
688
689 void Client::trim_dentry(Dentry *dn)
690 {
691 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
692 << " in dir "
693 << std::hex << dn->dir->parent_inode->ino << std::dec
694 << dendl;
695 if (dn->inode) {
696 Inode *diri = dn->dir->parent_inode;
697 clear_dir_complete_and_ordered(diri, true);
698 }
699 unlink(dn, false, false); // drop dir, drop dentry
700 }
701
702
// Apply an MDS-reported size / truncation state to the local inode.
// Size only moves forward unless the MDS's truncate_seq is strictly
// newer; cached (and inline) file data past the new size is invalidated.
// 'issued' is currently unread here — kept for signature symmetry with
// update_inode_file_time().
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // accept the size if the MDS truncation state is strictly newer, or
  // equally new but the file grew
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change even when the size itself did not
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
744
745 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
746 utime_t ctime, utime_t mtime, utime_t atime)
747 {
748 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
749 << " ctime " << ctime << " mtime " << mtime << dendl;
750
751 if (time_warp_seq > in->time_warp_seq)
752 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
753 << " is higher than local time_warp_seq "
754 << in->time_warp_seq << dendl;
755
756 int warn = false;
757 // be careful with size, mtime, atime
758 if (issued & (CEPH_CAP_FILE_EXCL|
759 CEPH_CAP_FILE_WR|
760 CEPH_CAP_FILE_BUFFER|
761 CEPH_CAP_AUTH_EXCL|
762 CEPH_CAP_XATTR_EXCL)) {
763 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
764 if (ctime > in->ctime)
765 in->ctime = ctime;
766 if (time_warp_seq > in->time_warp_seq) {
767 //the mds updated times, so take those!
768 in->mtime = mtime;
769 in->atime = atime;
770 in->time_warp_seq = time_warp_seq;
771 } else if (time_warp_seq == in->time_warp_seq) {
772 //take max times
773 if (mtime > in->mtime)
774 in->mtime = mtime;
775 if (atime > in->atime)
776 in->atime = atime;
777 } else if (issued & CEPH_CAP_FILE_EXCL) {
778 //ignore mds values as we have a higher seq
779 } else warn = true;
780 } else {
781 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
782 if (time_warp_seq >= in->time_warp_seq) {
783 in->ctime = ctime;
784 in->mtime = mtime;
785 in->atime = atime;
786 in->time_warp_seq = time_warp_seq;
787 } else warn = true;
788 }
789 if (warn) {
790 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
791 << time_warp_seq << " is lower than local time_warp_seq "
792 << in->time_warp_seq
793 << dendl;
794 }
795 }
796
797 void Client::_fragmap_remove_non_leaves(Inode *in)
798 {
799 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
800 if (!in->dirfragtree.is_leaf(p->first))
801 in->fragmap.erase(p++);
802 else
803 ++p;
804 }
805
806 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
807 {
808 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
809 if (p->second == mds)
810 in->fragmap.erase(p++);
811 else
812 ++p;
813 }
814
// Create or refresh a cached Inode from an MDS-supplied InodeStat, then
// apply the cap grant carried with it. Which fields may be overwritten
// is gated on (a) whether the stat is strictly newer than what we have
// and (b) which caps we already hold (EXCL caps mean our local copy may
// be dirtier than the MDS's view). Returns the cached inode.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // the first inode we ever see becomes the root
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path walking: chain ancestors above the current root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold or have dirtied — fields covered by an EXCL
  // cap we hold must not be clobbered by the MDS's (possibly stale) copy
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes don't get real caps; just record what was granted
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
970
971
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry of that name already exists with the same vino it is just
 * touched; if it points at a different inode it is unlinked first. When
 * (re)linking, the parent directory's I_COMPLETE/I_DIR_ORDERED state is
 * cleared, and the dentry's lease is refreshed from dlease.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale link: detach it so we can relink below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link below so 'in' can't be freed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1018
// Refresh a dentry's lease from an MDS LeaseStat. The lease TTL is
// computed from 'from' (when the request was sent), not from now, and
// an existing longer lease is never shortened.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    // only extend, never shorten
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  // remember the parent dir's shared gen so staleness can be detected later
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
}
1040
1041
/*
 * update MDS location cache for a single inode: record (or erase) the
 * authoritative MDS for one dirfrag, and refresh the replicated flag.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth: forget any stale mapping for this frag
    in->fragmap.erase(dst->frag);
  }
  // force the local fragtree to treat this frag as a leaf and prune any
  // fragmap entries that stop being leaves as a result
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1062
1063 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1064 {
1065 if (complete)
1066 diri->dir_release_count++;
1067 else
1068 diri->dir_ordered_count++;
1069 if (diri->flags & I_COMPLETE) {
1070 if (complete) {
1071 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1072 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1073 } else {
1074 if (diri->flags & I_DIR_ORDERED) {
1075 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1076 diri->flags &= ~I_DIR_ORDERED;
1077 }
1078 }
1079 if (diri->dir)
1080 diri->dir->readdir_cache.clear();
1081 }
1082 }
1083
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra buffer (dirstat, then numdn dentry
 * entries) into diri's Dir, creating/replacing dentries and inodes as
 * needed, and fills request->dirp's buffer and (when still valid) the
 * shared readdir cache.  Updates dirp's frag/offset bookkeeping so
 * the next readdir request continues where this one ended.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // New-style MDS encodes with all feature bits assumed; otherwise
  // fall back to the connection's negotiated feature set.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0/1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may reply for a different (e.g. re-split) frag than the
    // one we asked about; adopt its frag
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // only prime the shared readdir cache when starting a fresh
    // listing from the very beginning of the directory
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // per-entry wire format: name, lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// offsets restart within each hash bucket
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume on the next request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1245
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * The trace may describe the target inode, its parent dentry and
 * directory, or nothing at all (a "traceless" reply, e.g. after MDS
 * replay).  Updates/creates the corresponding cached Inode and Dentry
 * objects, processes any attached snap trace, and for readdir/lssnap
 * replies delegates to insert_readdir_results().
 *
 * Returns the target inode (also stored in request->target), or NULL
 * when there is no trace / the request already got an unsafe reply.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already carried the trace; the safe
    // reply must not carry another one
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot tell what the MDS did, so
    // conservatively invalidate cached state the op may have changed
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // see insert_readdir_results(): REPLY_ENCODING implies all features
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // decode order on the wire: [dir inode, dir stat, name, lease] if
  // is_dentry, then [target inode] if is_target
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs, the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1419
1420 // -------
1421
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the MDS
 * owning the dir fragment hashed from the request's dentry name; the
 * auth cap (or any cap) on the relevant inode; finally a random
 * active MDS.  If the choice came from a dirfrag hash, *phash_diri is
 * set to the directory inode so the caller can drop a stale fragmap
 * entry if that MDS turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit target (e.g. after a forward) wins; consume it
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to locate the dir fragment
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to the first
    // non-snap ancestor and use its caps instead
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
             << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to caps: auth cap when the op needs the auth MDS,
    // otherwise any cap-issuing MDS
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1524
1525
1526 void Client::connect_mds_targets(mds_rank_t mds)
1527 {
1528 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1529 ceph_assert(mds_sessions.count(mds));
1530 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1531 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1532 q != info.export_targets.end();
1533 ++q) {
1534 if (mds_sessions.count(*q) == 0 &&
1535 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1536 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1537 << " export target mds." << *q << dendl;
1538 _open_mds_session(*q);
1539 }
1540 }
1541 }
1542
1543 void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
1544 {
1545 f->dump_int("id", get_nodeid().v);
1546 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1547 f->dump_object("inst", inst);
1548 f->dump_stream("inst_str") << inst;
1549 f->dump_stream("addr_str") << inst.addr;
1550 f->open_array_section("sessions");
1551 for (const auto &p : mds_sessions) {
1552 f->open_object_section("session");
1553 p.second.dump(f, cap_dump);
1554 f->close_section();
1555 }
1556 f->close_section();
1557 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1558 }
1559 void Client::dump_mds_requests(Formatter *f)
1560 {
1561 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1562 p != mds_requests.end();
1563 ++p) {
1564 f->open_object_section("request");
1565 p->second->dump(f);
1566 f->close_section();
1567 }
1568 }
1569
/*
 * Post-process a successful MDS reply for callers that want the
 * target inode (ptarget) and/or a created flag (pcreated).
 *
 * If the reply carried no trace (request->target unset), fall back to
 * a lookup/getattr to find the inode we just operated on; if the ino
 * found that way disagrees with the created ino reported by the MDS,
 * return -EINTR (a concurrent op raced us).  Returns r, possibly
 * replaced by the fallback lookup's error.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: the trace told us the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry either: re-stat the request's inode directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1654
1655
1656 /**
1657 * make a request
1658 *
1659 * Blocking helper to make an MDS request.
1660 *
1661 * If the ptarget flag is set, behavior changes slightly: the caller
1662 * expects to get a pointer to the inode we are creating or operating
1663 * on. As a result, we will follow up any traceless mutation reply
1664 * with a getattr or lookup to transparently handle a traceless reply
1665 * from the MDS (as when the MDS restarts and the client has to replay
1666 * a request).
1667 *
1668 * @param request the MetaRequest to execute
1669 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1670 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1671 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1672 * @param use_mds [optional] prefer a specific mds (-1 for default)
1673 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1674 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK ops can block indefinitely, so they are excluded from
  // the oldest-tid tracking used for MDS completed-request trimming
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait; loop
  // again on forward/kick until a reply arrives or we abort
  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists (cluster shrank)
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // adopt the already-held client_lock so the condvar can drop it
    // while waiting; release() afterwards keeps it held by us
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply: clean up our registration and bail
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1820
1821 void Client::unregister_request(MetaRequest *req)
1822 {
1823 mds_requests.erase(req->tid);
1824 if (req->tid == oldest_tid) {
1825 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1826 while (true) {
1827 if (p == mds_requests.end()) {
1828 oldest_tid = 0;
1829 break;
1830 }
1831 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1832 oldest_tid = p->first;
1833 break;
1834 }
1835 ++p;
1836 }
1837 }
1838 put_request(req);
1839 }
1840
1841 void Client::put_request(MetaRequest *request)
1842 {
1843 if (request->_put()) {
1844 int op = -1;
1845 if (request->success)
1846 op = request->get_op();
1847 InodeRef other_in;
1848 request->take_other_inode(&other_in);
1849 delete request;
1850
1851 if (other_in &&
1852 (op == CEPH_MDS_OP_RMDIR ||
1853 op == CEPH_MDS_OP_RENAME ||
1854 op == CEPH_MDS_OP_RMSNAP)) {
1855 _try_to_trim_inode(other_in.get(), false);
1856 }
1857 }
1858 }
1859
/*
 * Possibly append a cap release for `in` to req->cap_releases.
 *
 * Caps in `drop` (excluding anything dirty or in use) are revoked
 * locally and released to the MDS, unless the MDS also issued a cap
 * in `unless`.  A non-zero `force` appends a release record even when
 * nothing was dropped (used by encode_dentry_release).
 *
 * Returns non-zero iff a release record was appended.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop caps that are dirty or currently in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
	  !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
	in->requested_max_size = 0;
	ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // build the wire-format release record from the (updated) cap
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1905
1906 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1907 mds_rank_t mds, int drop, int unless)
1908 {
1909 ldout(cct, 20) << __func__ << " enter(dn:"
1910 << dn << ")" << dendl;
1911 int released = 0;
1912 if (dn->dir)
1913 released = encode_inode_release(dn->dir->parent_inode, req,
1914 mds, drop, unless, 1);
1915 if (released && dn->lease_mds == mds) {
1916 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1917 auto& rel = req->cap_releases.back();
1918 rel.item.dname_len = dn->name.length();
1919 rel.item.dname_seq = dn->lease_seq;
1920 rel.dname = dn->name;
1921 dn->lease_mds = -1;
1922 }
1923 ldout(cct, 25) << __func__ << " exit(dn:"
1924 << dn << ")" << dendl;
1925 }
1926
1927
1928 /*
1929 * This requires the MClientRequest *request member to be set.
1930 * It will error out horribly without one.
1931 * Additionally, if you set any *drop member, you'd better have
1932 * set the corresponding dentry!
1933 */
1934 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1935 {
1936 ldout(cct, 20) << __func__ << " enter (req: "
1937 << req << ", mds: " << mds << ")" << dendl;
1938 if (req->inode_drop && req->inode())
1939 encode_inode_release(req->inode(), req,
1940 mds, req->inode_drop,
1941 req->inode_unless);
1942
1943 if (req->old_inode_drop && req->old_inode())
1944 encode_inode_release(req->old_inode(), req,
1945 mds, req->old_inode_drop,
1946 req->old_inode_unless);
1947 if (req->other_inode_drop && req->other_inode())
1948 encode_inode_release(req->other_inode(), req,
1949 mds, req->other_inode_drop,
1950 req->other_inode_unless);
1951
1952 if (req->dentry_drop && req->dentry())
1953 encode_dentry_release(req->dentry(), req,
1954 mds, req->dentry_drop,
1955 req->dentry_unless);
1956
1957 if (req->old_dentry_drop && req->old_dentry())
1958 encode_dentry_release(req->old_dentry(), req,
1959 mds, req->old_dentry_drop,
1960 req->old_dentry_unless);
1961 ldout(cct, 25) << __func__ << " exit (req: "
1962 << req << ", mds " << mds <<dendl;
1963 }
1964
1965 bool Client::have_open_session(mds_rank_t mds)
1966 {
1967 const auto &it = mds_sessions.find(mds);
1968 return it != mds_sessions.end() &&
1969 (it->second.state == MetaSession::STATE_OPEN ||
1970 it->second.state == MetaSession::STATE_STALE);
1971 }
1972
1973 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1974 {
1975 const auto &it = mds_sessions.find(mds);
1976 if (it == mds_sessions.end() || it->second.con != con) {
1977 return NULL;
1978 } else {
1979 return &it->second;
1980 }
1981 }
1982
1983 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1984 {
1985 auto it = mds_sessions.find(mds);
1986 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1987 }
1988
1989 /**
1990 * Populate a map of strings with client-identifying metadata,
1991 * such as the hostname. Call this once at initialization.
1992 */
1993 void Client::populate_metadata(const std::string &mount_root)
1994 {
1995 // Hostname
1996 struct utsname u;
1997 int r = uname(&u);
1998 if (r >= 0) {
1999 metadata["hostname"] = u.nodename;
2000 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2001 } else {
2002 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2003 }
2004
2005 metadata["pid"] = stringify(getpid());
2006
2007 // Ceph entity id (the '0' in "client.0")
2008 metadata["entity_id"] = cct->_conf->name.get_id();
2009
2010 // Our mount position
2011 if (!mount_root.empty()) {
2012 metadata["root"] = mount_root;
2013 }
2014
2015 // Ceph version
2016 metadata["ceph_version"] = pretty_version_to_str();
2017 metadata["ceph_sha1"] = git_version_to_str();
2018
2019 // Apply any metadata from the user's configured overrides
2020 std::vector<std::string> tokens;
2021 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2022 for (const auto &i : tokens) {
2023 auto eqpos = i.find("=");
2024 // Throw out anything that isn't of the form "<str>=<str>"
2025 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2026 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2027 continue;
2028 }
2029 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2030 }
2031 }
2032
2033 /**
2034 * Optionally add or override client metadata fields.
2035 */
2036 void Client::update_metadata(std::string const &k, std::string const &v)
2037 {
2038 std::lock_guard l(client_lock);
2039 ceph_assert(initialized);
2040
2041 auto it = metadata.find(k);
2042 if (it != metadata.end()) {
2043 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2044 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2045 }
2046
2047 metadata[k] = v;
2048 }
2049
/*
 * Create a session object for `mds` and send it a
 * CEPH_SESSION_REQUEST_OPEN carrying our metadata and supported
 * feature bits.  The caller must not already have a session to this
 * rank (asserted).  Returns the newly inserted session; it is usable
 * once the MDS replies with CEPH_SESSION_OPEN.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // construct the MetaSession in place, bound to a fresh connection
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2066
2067 void Client::_close_mds_session(MetaSession *s)
2068 {
2069 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2070 s->state = MetaSession::STATE_CLOSING;
2071 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2072 }
2073
/*
 * Finalize teardown of session `s`: set its terminal state, drop the
 * connection, wake waiters, release the session's caps (failing cap
 * waiters with `err`) and kick its pending requests.
 *
 * When `rejected` is set (and we weren't already CLOSING), the
 * session is marked REJECTED and retained in mds_sessions; a CLOSED
 * session is erased from the table.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anything blocked on this session before tearing down its state
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2090
// Handle a session-control message from an MDS: open/close acks, cap
// renewal, staleness, recall, flush, read-only and rejection notices.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  // ignore messages for sessions we don't (or no longer) have
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // refuse to use an MDS that lacks features we require
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	_close_mds_session(session);
	_closed_mds_session(session, -EPERM, true);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.notify_all();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our most recent renewal request
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      // surface the MDS-provided reason, if any, before dropping the session
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session, -EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2182
2183 bool Client::_any_stale_sessions() const
2184 {
2185 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2186
2187 for (const auto &p : mds_sessions) {
2188 if (p.second.state == MetaSession::STATE_STALE) {
2189 return true;
2190 }
2191 }
2192
2193 return false;
2194 }
2195
// Walk all sessions: drop REJECTED ones outright and force-close STALE ones.
// Iteration is done with a manual iterator because entries may be erased
// while walking (both directly and inside _closed_mds_session).
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    if (s.state == MetaSession::STATE_REJECTED) {
      // post-increment so `it` is advanced before the element is erased
      mds_sessions.erase(it++);
      continue;
    }
    ++it;
    // `it` has already moved past `s`, so it stays valid even if
    // _closed_mds_session erases `s` from mds_sessions
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2211
// (Re)build the wire message for a MetaRequest and send it on the given
// session.  `drop_cap_releases` is used during reconnect, before cap state
// has been re-established with the MDS.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already got an unsafe reply: this is a replay of an applied op
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr can reference pools/layouts; tell the MDS which osdmap we saw
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // stamp only on first send (mds == -1), not on resend/forward
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap mseq at send time so later replies can detect
  // cap migration
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2260
// Translate an in-memory MetaRequest into an MClientRequest wire message,
// filling in the filepath(s) from the inode/dentry if not already set.
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry with no inode yet (e.g. create): path is parent dir + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // bump the attempt counter every time the message is (re)built
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2298
2299
2300
2301 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2302 {
2303 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2304 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2305 if (!session) {
2306 return;
2307 }
2308 ceph_tid_t tid = fwd->get_tid();
2309
2310 if (mds_requests.count(tid) == 0) {
2311 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2312 return;
2313 }
2314
2315 MetaRequest *request = mds_requests[tid];
2316 ceph_assert(request);
2317
2318 // reset retry counter
2319 request->retry_attempt = 0;
2320
2321 // request not forwarded, or dest mds has no session.
2322 // resend.
2323 ldout(cct, 10) << __func__ << " tid " << tid
2324 << " fwd " << fwd->get_num_fwd()
2325 << " to mds." << fwd->get_dest_mds()
2326 << ", resending to " << fwd->get_dest_mds()
2327 << dendl;
2328
2329 request->mds = -1;
2330 request->item.remove_myself();
2331 request->num_fwd = fwd->get_num_fwd();
2332 request->resend_mds = fwd->get_dest_mds();
2333 request->caller_cond->notify_all();
2334 }
2335
2336 bool Client::is_dir_operation(MetaRequest *req)
2337 {
2338 int op = req->get_op();
2339 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2340 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2341 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2342 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2343 return true;
2344 return false;
2345 }
2346
// Handle an MDS reply (unsafe = applied in memory, safe = committed).
// Wakes the requesting thread on the first reply and waits for it to
// consume the trace before returning; cleans up on the safe reply.
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    std::map<mds_rank_t, Cap>::const_iterator it;
    // if we'd resend to the same mds we already asked (and hold a cap from
    // it), retrying is pointless: return ESTALE to the caller
    // NOTE(review): with short-circuit ||, the sent_on_mseq clause is only
    // evaluated when find() returned end(), so it->second dereferences the
    // end iterator on that path — confirm the intended condition here.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
	 request->sent_on_mseq == it->second.mseq)) {
      ldout(cct, 20) << "have to return ESTALE" << dendl;
    } else {
      request->caller_cond->notify_all();
      return;
    }
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    // (the caller clears request->dispatch_cond once it has taken the reply)
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    l.release();  // keep client_lock held; we only adopted it for the wait
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.notify_all();
}
2450
2451 void Client::_handle_full_flag(int64_t pool)
2452 {
2453 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2454 << "on " << pool << dendl;
2455 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2456 // to do this rather than blocking, because otherwise when we fill up we
2457 // potentially lock caps forever on files with dirty pages, and we need
2458 // to be able to release those caps to the MDS so that it can delete files
2459 // and free up space.
2460 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2461
2462 // For all inodes with layouts in this pool and a pending flush write op
2463 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2464 // from ObjectCacher so that it doesn't re-issue the write in response to
2465 // the ENOSPC error.
2466 // Fortunately since we're cancelling everything in a given pool, we don't
2467 // need to know which ops belong to which ObjectSet, we can just blow all
2468 // the un-flushed cached data away and mark any dirty inodes' async_err
2469 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2470 // affecting this pool, and all the objectsets we're purging were also
2471 // in this pool.
2472 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2473 i != inode_map.end(); ++i)
2474 {
2475 Inode *inode = i->second;
2476 if (inode->oset.dirty_or_tx
2477 && (pool == -1 || inode->layout.pool_id == pool)) {
2478 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2479 << " has dirty objects, purging and setting ENOSPC" << dendl;
2480 objectcacher->purge_set(&inode->oset);
2481 inode->set_async_err(-ENOSPC);
2482 }
2483 }
2484
2485 if (cancelled_epoch != (epoch_t)-1) {
2486 set_cap_epoch_barrier(cancelled_epoch);
2487 }
2488 }
2489
// Process a new OSDMap: detect (un)blacklisting of our own addrs, and
// react to cluster-wide or per-pool FULL flags.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blacklisted) {
    // check whether any of our own addresses appear in the new
    // blacklist events
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
        return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full: cancel writes in every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2573
2574
2575 // ------------------------
2576 // incoming messages
2577
2578
// Main messenger dispatch entry point.  Routes incoming messages to the
// appropriate handler under client_lock; returns false for message types
// we don't own so other dispatchers can claim them.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // command replies can come from non-MDS daemons too; only claim ours
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // opportunistically trim the cache; if it shrank, poke unmount() which
  // may be waiting for the cache to empty
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2660
2661 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2662 {
2663 fsmap.reset(new FSMap(m->get_fsmap()));
2664
2665 signal_cond_list(waiting_for_fsmap);
2666
2667 monclient->sub_got("fsmap", fsmap->get_epoch());
2668 }
2669
2670 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2671 {
2672 fsmap_user.reset(new FSMapUser);
2673 *fsmap_user = m->get_fsmap();
2674
2675 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2676 signal_cond_list(waiting_for_fsmap);
2677 }
2678
// Process a new MDSMap: cancel commands to vanished/laggy MDSs, and walk
// our sessions reacting to each MDS's address/state change (reconnect,
// close, kick requests, etc.).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  // only assigned when an MDS's address set changed; see below
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // keep the previous map around so we can compare per-MDS state
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase cancelled commands after the walk, to avoid invalidating the
  // command-table iteration above
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // advance before any path that may erase the current session
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // the rank moved to a new daemon instance
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2780
// Send our cap/snaprealm state to an MDS entering RECONNECT, so it can
// rebuild its view of our session after a failover.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // newer MDSs accept the reconnect payload split across several messages
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // flush the current message and start a new one if it grew too big
      if (allow_multi &&
	  m->get_approx_size() >=
	  static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(cap.issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	cap.gen = session->cap_gen;
	// our caps were invalidated while stale; keep only the pin
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2874
2875
2876 void Client::kick_requests(MetaSession *session)
2877 {
2878 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2879 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2880 p != mds_requests.end();
2881 ++p) {
2882 MetaRequest *req = p->second;
2883 if (req->got_unsafe)
2884 continue;
2885 if (req->aborted()) {
2886 if (req->caller_cond) {
2887 req->kick = true;
2888 req->caller_cond->notify_all();
2889 }
2890 continue;
2891 }
2892 if (req->retry_attempt > 0)
2893 continue; // new requests only
2894 if (req->mds == session->mds_num) {
2895 send_request(p->second, session);
2896 }
2897 }
2898 }
2899
2900 void Client::resend_unsafe_requests(MetaSession *session)
2901 {
2902 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2903 !iter.end();
2904 ++iter)
2905 send_request(*iter, session);
2906
2907 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2908 // process completed requests in clientreplay stage.
2909 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2910 p != mds_requests.end();
2911 ++p) {
2912 MetaRequest *req = p->second;
2913 if (req->got_unsafe)
2914 continue;
2915 if (req->aborted())
2916 continue;
2917 if (req->retry_attempt == 0)
2918 continue; // old requests only
2919 if (req->mds == session->mds_num)
2920 send_request(req, session, true);
2921 }
2922 }
2923
2924 void Client::wait_unsafe_requests()
2925 {
2926 list<MetaRequest*> last_unsafe_reqs;
2927 for (const auto &p : mds_sessions) {
2928 const MetaSession &s = p.second;
2929 if (!s.unsafe_requests.empty()) {
2930 MetaRequest *req = s.unsafe_requests.back();
2931 req->get();
2932 last_unsafe_reqs.push_back(req);
2933 }
2934 }
2935
2936 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2937 p != last_unsafe_reqs.end();
2938 ++p) {
2939 MetaRequest *req = *p;
2940 if (req->unsafe_item.is_on_list())
2941 wait_on_list(req->waitfor_safe);
2942 put_request(req);
2943 }
2944 }
2945
2946 void Client::kick_requests_closed(MetaSession *session)
2947 {
2948 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2949 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2950 p != mds_requests.end(); ) {
2951 MetaRequest *req = p->second;
2952 ++p;
2953 if (req->mds == session->mds_num) {
2954 if (req->caller_cond) {
2955 req->kick = true;
2956 req->caller_cond->notify_all();
2957 }
2958 req->item.remove_myself();
2959 if (req->got_unsafe) {
2960 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2961 req->unsafe_item.remove_myself();
2962 if (is_dir_operation(req)) {
2963 Inode *dir = req->inode();
2964 assert(dir);
2965 dir->set_async_err(-EIO);
2966 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2967 << dir->ino << " " << req->get_tid() << dendl;
2968 req->unsafe_dir_item.remove_myself();
2969 }
2970 if (req->target) {
2971 InodeRef &in = req->target;
2972 in->set_async_err(-EIO);
2973 lderr(cct) << "kick_requests_closed drop req of inode : "
2974 << in->ino << " " << req->get_tid() << dendl;
2975 req->unsafe_target_item.remove_myself();
2976 }
2977 signal_cond_list(req->waitfor_safe);
2978 unregister_request(req);
2979 }
2980 }
2981 }
2982 ceph_assert(session->requests.empty());
2983 ceph_assert(session->unsafe_requests.empty());
2984 }
2985
2986
2987
2988
2989 /************
2990 * leases
2991 */
2992
2993 void Client::got_mds_push(MetaSession *s)
2994 {
2995 s->seq++;
2996 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2997 if (s->state == MetaSession::STATE_CLOSING) {
2998 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2999 }
3000 }
3001
3002 void Client::handle_lease(const MConstRef<MClientLease>& m)
3003 {
3004 ldout(cct, 10) << __func__ << " " << *m << dendl;
3005
3006 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3007
3008 mds_rank_t mds = mds_rank_t(m->get_source().num());
3009 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3010 if (!session) {
3011 return;
3012 }
3013
3014 got_mds_push(session);
3015
3016 ceph_seq_t seq = m->get_seq();
3017
3018 Inode *in;
3019 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3020 if (inode_map.count(vino) == 0) {
3021 ldout(cct, 10) << " don't have vino " << vino << dendl;
3022 goto revoke;
3023 }
3024 in = inode_map[vino];
3025
3026 if (m->get_mask() & CEPH_LEASE_VALID) {
3027 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3028 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3029 goto revoke;
3030 }
3031 Dentry *dn = in->dir->dentries[m->dname];
3032 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3033 dn->lease_mds = -1;
3034 }
3035
3036 revoke:
3037 {
3038 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3039 m->get_mask(), m->get_ino(),
3040 m->get_first(), m->get_last(), m->dname);
3041 m->get_connection()->send_message2(std::move(reply));
3042 }
3043 }
3044
// Drop n references on an inode; on the last reference, release caps,
// detach it from all client-side indexes, and delete it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);  // an inode with dirty/tx buffers must not reach refcount 0
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    // if this was the root, clear the cached root ancestry too
    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3070
// Destroy an (empty) Dir object and drop the pins it held on its parent
// inode and that inode's dentry.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
3085
3086 /**
3087 * Don't call this with in==NULL, use get_or_create for that
3088 * leave dn set to default NULL unless you're trying to add
3089 * a new inode to a pre-created Dentry
3090 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  // Insert (or reuse) the dentry for 'name' in 'dir' and, when an inode
  // is supplied, attach it.  Returns the dentry in either case.
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry: it must still be negative
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent's listing is no longer known complete/ordered
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3125
/**
 * Detach a dentry from its inode and (optionally) remove it from its
 * directory.
 *
 * @param dn the dentry to unlink
 * @param keepdir do not close the containing Dir even if it becomes empty
 * @param keepdentry keep the dentry around as a negative entry
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a ref so the inode outlives the dentry teardown below
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the (now negative) dentry, but its lease is no longer valid
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // close out the Dir if it just became empty (unless asked to keep it)
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3155
3156 /**
3157 * For asynchronous flushes, check for errors from the IO and
3158 * update the inode if necessary
3159 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pinned so the inode survives until the flush finishes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Runs with client_lock held.  On failure, record the error on the
  // inode so it can be surfaced to the application later.
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
                            << " 0x" << std::hex << inode->ino << std::dec
                            << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3177
3178
3179 /****
3180 * caps
3181 */
3182
3183 void Client::get_cap_ref(Inode *in, int cap)
3184 {
3185 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3186 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3187 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3188 in->get();
3189 }
3190 if ((cap & CEPH_CAP_FILE_CACHE) &&
3191 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3192 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3193 in->get();
3194 }
3195 in->get_cap_ref(cap);
3196 }
3197
/**
 * Drop references on the given cap bits for @in.
 *
 * When the last reference on a bit goes away this may: finish a
 * pending cap_snap (last WR/BUFFER ref), clear cap_snap dirty_data and
 * wake commit waiters (last BUFFER ref), trigger check_caps() for bits
 * that are no longer issued, and drop the inode pins that were taken
 * for the first FILE_BUFFER/FILE_CACHE references.
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we just released that the MDS no longer issues to us
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // all buffered data has been flushed; cap_snaps need not wait on it
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3231
/**
 * Wait for and take references on the caps needed for a file operation.
 *
 * Loops until @need caps are issued and not blocked (by max_size,
 * pending cap_snaps, or in-progress revocation of @want bits), then
 * takes a cap reference on @need and stores the obtained set
 * (need plus any available want bits) in @phave.
 *
 * @param fh open file handle the operation is performed through
 * @param need caps that must be held before returning 0
 * @param want additional caps to report in @phave if available
 * @param phave out: caps actually obtained
 * @param endoff for writes, the end offset (may request larger max_size);
 *               pass <= 0 when not applicable
 * @return 0 on success; -EBADF if the handle no longer wants/backs the
 *         needed caps, -EIO on file lock errors, -EROFS for writes on a
 *         read-only session, or an error from pool permission checks or
 *         cap renewal.
 */
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    // a stale fd (generation bumped) must not be written through
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // grow wanted_max_size (and ask the MDS) if this write nears the limit
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // writes must not race with a cap_snap that is still being written
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick a flush so the dirty snap data drains
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // success only if none of the want bits are mid-revocation
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // the MDS may have forgotten what we want; re-request if so
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3335
3336 int Client::get_caps_used(Inode *in)
3337 {
3338 unsigned used = in->caps_used();
3339 if (!(used & CEPH_CAP_FILE_CACHE) &&
3340 !objectcacher->set_is_empty(&in->oset))
3341 used |= CEPH_CAP_FILE_CACHE;
3342 return used;
3343 }
3344
3345 void Client::cap_delay_requeue(Inode *in)
3346 {
3347 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3348 in->hold_caps_until = ceph_clock_now();
3349 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3350 delayed_list.push_back(&in->delay_cap_item);
3351 }
3352
/**
 * Send a cap UPDATE message for @in to the MDS behind @session's @cap.
 *
 * Locally drops cap bits not covered by @retain (unless failure
 * injection is enabled), then sends an MClientCaps carrying the
 * current inode metadata, what we hold/want, and any dirty caps being
 * flushed (@flush, identified by @flush_tid).
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits that are mid-revocation
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // only ship xattrs when they are part of what is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS a cap_snap flush is still outstanding, if any
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS tracks max_size for us
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3470
3471 static bool is_max_size_approaching(Inode *in)
3472 {
3473 /* mds will adjust max size according to the reported size */
3474 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3475 return false;
3476 if (in->size >= in->max_size)
3477 return true;
3478 /* half of previous max_size increment has been used */
3479 if (in->max_size > in->reported_size &&
3480 (in->size << 1) >= in->max_size + in->reported_size)
3481 return true;
3482 return false;
3483 }
3484
3485 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3486 {
3487 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3488 return used;
3489 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3490 return used;
3491
3492 if (issued & CEPH_CAP_FILE_LAZYIO) {
3493 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3494 used &= ~CEPH_CAP_FILE_CACHE;
3495 used |= CEPH_CAP_FILE_LAZYIO;
3496 }
3497 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3498 used &= ~CEPH_CAP_FILE_BUFFER;
3499 used |= CEPH_CAP_FILE_LAZYIO;
3500 }
3501 } else {
3502 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3503 used &= ~CEPH_CAP_FILE_CACHE;
3504 used |= CEPH_CAP_FILE_LAZYIO;
3505 }
3506 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3507 used &= ~CEPH_CAP_FILE_BUFFER;
3508 used |= CEPH_CAP_FILE_LAZYIO;
3509 }
3510 }
3511 return used;
3512 }
3513
3514 /**
3515 * check_caps
3516 *
3517 * Examine currently used and wanted versus held caps. Release, flush or ack
3518 * revoked caps to the MDS as appropriate.
3519 *
3520 * @param in the inode to check
3521 * @param flags flags to apply to cap check
3522 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // decide which caps we would like to keep hold of
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to satisfy a cache-cap revocation by dropping cached data now
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    // usage covered by the auth cap doesn't pin non-auth caps
    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // on unmount, release idle caps immediately
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-send any interrupted flushes and kick off pending snap flushes
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3661
3662
3663 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3664 {
3665 int used = get_caps_used(in);
3666 int dirty = in->caps_dirty();
3667 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3668
3669 if (in->cap_snaps.size() &&
3670 in->cap_snaps.rbegin()->second.writing) {
3671 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3672 return;
3673 } else if (in->caps_dirty() ||
3674 (used & CEPH_CAP_FILE_WR) ||
3675 (dirty & CEPH_CAP_ANY_WR)) {
3676 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3677 ceph_assert(capsnapem.second); /* element inserted */
3678 CapSnap &capsnap = capsnapem.first->second;
3679 capsnap.context = old_snapc;
3680 capsnap.issued = in->caps_issued();
3681 capsnap.dirty = in->caps_dirty();
3682
3683 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3684
3685 capsnap.uid = in->uid;
3686 capsnap.gid = in->gid;
3687 capsnap.mode = in->mode;
3688 capsnap.btime = in->btime;
3689 capsnap.xattrs = in->xattrs;
3690 capsnap.xattr_version = in->xattr_version;
3691 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3692 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3693
3694 if (used & CEPH_CAP_FILE_WR) {
3695 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3696 capsnap.writing = 1;
3697 } else {
3698 finish_cap_snap(in, capsnap, used);
3699 }
3700 } else {
3701 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3702 }
3703 }
3704
/**
 * Finalize a CapSnap by capturing the inode's current size/times and
 * dirty caps, then either flush it or, if buffered data is still
 * outstanding, leave it pending until the buffers drain.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data still outstanding; put_cap_ref() retries when the
    // last FILE_BUFFER ref is dropped
    capsnap.writing = 1;
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3736
3737 void Client::send_flush_snap(Inode *in, MetaSession *session,
3738 snapid_t follows, CapSnap& capsnap)
3739 {
3740 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3741 in->ino, in->snaprealm->ino, 0,
3742 in->auth_cap->mseq, cap_epoch_barrier);
3743 m->caller_uid = capsnap.cap_dirtier_uid;
3744 m->caller_gid = capsnap.cap_dirtier_gid;
3745
3746 m->set_client_tid(capsnap.flush_tid);
3747 m->head.snap_follows = follows;
3748
3749 m->head.caps = capsnap.issued;
3750 m->head.dirty = capsnap.dirty;
3751
3752 m->head.uid = capsnap.uid;
3753 m->head.gid = capsnap.gid;
3754 m->head.mode = capsnap.mode;
3755 m->btime = capsnap.btime;
3756
3757 m->size = capsnap.size;
3758
3759 m->head.xattr_version = capsnap.xattr_version;
3760 encode(capsnap.xattrs, m->xattrbl);
3761
3762 m->ctime = capsnap.ctime;
3763 m->btime = capsnap.btime;
3764 m->mtime = capsnap.mtime;
3765 m->atime = capsnap.atime;
3766 m->time_warp_seq = capsnap.time_warp_seq;
3767 m->change_attr = capsnap.change_attr;
3768
3769 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3770 m->inline_version = in->inline_version;
3771 m->inline_data = in->inline_data;
3772 }
3773
3774 ceph_assert(!session->flushing_caps_tids.empty());
3775 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3776
3777 session->con->send_message2(std::move(m));
3778 }
3779
/**
 * Flush every not-yet-flushed CapSnap of @in to its auth MDS.
 *
 * Stops at the first cap_snap that is still writing or has dirty
 * buffered data; later snaps must flush in order after it.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // this snap isn't ready yet; preserve flush ordering
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush with the session so acks can be matched up
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3814
/**
 * Block the calling thread (which holds client_lock) until
 * signal_cond_list() wakes it, registering a private condvar on @ls.
 *
 * client_lock is adopted for the wait and released back to the caller
 * afterwards (via l.release()), so lock ownership is unchanged on
 * return.  Spurious wakeups are possible; callers re-check and loop.
 */
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
3824
3825 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3826 {
3827 for (auto cond : ls) {
3828 cond->notify_all();
3829 }
3830 }
3831
/**
 * Block (holding client_lock) until a queued C_Cond context on @ls is
 * completed by signal_context_list().
 *
 * As in wait_on_list(), client_lock is adopted for the wait and
 * released back to the caller, so lock ownership is unchanged.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value not inspected here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3842
3843 void Client::signal_context_list(list<Context*>& ls)
3844 {
3845 while (!ls.empty()) {
3846 ls.front()->complete(0);
3847 ls.pop_front();
3848 }
3849 }
3850
/**
 * Wake cap waiters on every inode that has a cap in session @s.
 *
 * On @reconnect, max_size request state is reset so it will be asked
 * for again.  Otherwise, caps whose gen predates the session's cap_gen
 * were not re-issued by the MDS: they are downgraded to PIN and the
 * inode is flagged I_CAP_DROPPED if the MDS no longer knows our wants.
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3870
3871
3872 // flush dirty data (from objectcache)
3873
// Deferred invocation of the fuse/libcephfs cache-invalidation callback
// for one inode range.  The vinodeno is captured at construction time so
// no Inode reference needs to be held while the callback is queued.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // report faked inos to the callback when they are in use
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
3893
/**
 * Deliver a cache-invalidation callback for @ino's range @off~@len.
 * Runs on the async invalidator thread, without client_lock held;
 * skipped once unmount has begun.
 */
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3901
// Queue an asynchronous invalidation of @in's range @off~@len, if an
// invalidation callback is registered at all.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3908
/**
 * Drop all cached data for @in: release clean data from the object
 * cacher (logging if anything could not be released) and schedule the
 * kernel/application invalidation callback for the whole inode.
 */
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // off=0,len=0 means "the whole inode" to the callback
  _schedule_invalidate_callback(in, 0, 0);
}
3922
/**
 * Drop cached data for the byte range @off~@len of @in, discarding the
 * affected extents from the object cacher and scheduling the
 * invalidation callback for the same range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto the object extents it covers
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3936
3937 bool Client::_release(Inode *in)
3938 {
3939 ldout(cct, 20) << "_release " << *in << dendl;
3940 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3941 _invalidate_inode_cache(in);
3942 return true;
3943 }
3944 return false;
3945 }
3946
/**
 * Start flushing @in's dirty buffered data to the OSDs.
 *
 * @param onfinish completion fired when the flush is done (or
 *        immediately with 0 if nothing is dirty, or with -ENOSPC if
 *        the pool is full and the dirty data had to be purged)
 * @return true if already complete (onfinish was consumed inline),
 *         false if the flush is in progress and onfinish fires later.
 */
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // writes cannot proceed against a full pool; drop the dirty data
    // and report ENOSPC rather than blocking forever
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3968
/**
 * Synchronously flush the byte range @offset~@size of @in.
 *
 * Called with client_lock held; the lock is dropped while waiting for
 * the object cacher to finish and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
3987
// ObjectCacher callback: an inode's object set finished flushing.
// Forwards to _flushed() to drop the cap refs held for the flush.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(ceph_mutex_is_locked(client_lock));   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3996
// Buffered data for @in has been flushed; release the FILE_CACHE and
// FILE_BUFFER cap references that were pinning it.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4003
4004
4005
4006 // checks common to add_update_cap, handle_cap_grant
// checks common to add_update_cap, handle_cap_grant
// Bump the cache/shared generations when Fc or Fs is newly issued, and
// invalidate directory completeness when Fs changes.
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  // newly issued Fc: cached data from before this issue is stale
  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) !=
      (had & CEPH_CAP_FILE_SHARED)) {
    if (issued & CEPH_CAP_FILE_SHARED)
      in->shared_gen++;
    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
4023
/**
 * Add a new cap for @in from @mds_session, or update an existing one.
 *
 * Handles snaprealm open/move for the first cap or an auth-realm
 * change, cap import/export races (stale seq), auth-cap migration
 * (including moving flushing state to the new auth session), and wakes
 * cap waiters when new bits are granted.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: attach it to its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // auth cap may move the inode to a different realm
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // updating an existing cap
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // possibly migrate the auth cap to this MDS
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migration sequence replaces wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // newly granted bits may unblock waiters
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4120
// Tear down a single cap.  Optionally queues a cap-release message for the
// MDS, detaches auth-cap bookkeeping, erases the cap from the inode's cap
// map (which destroys the Cap object), and closes the inode's snaprealm
// when its last cap goes away.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // the auth cap owned the flushing-list membership
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from the map destroys the Cap; null the pointer so it cannot
  // be used after free
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4156
4157 void Client::remove_all_caps(Inode *in)
4158 {
4159 while (!in->caps.empty())
4160 remove_cap(&in->caps.begin()->second, true);
4161 }
4162
// Drop every cap held via session 's', e.g. when the session is being torn
// down or the client was blacklisted.  'err' indicates why, and controls
// whether cached file data is purged (blacklisted) or merely released.
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    // hold a ref so the inode outlives remove_cap() below
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // int-to-bool: true if any dirty or flushing bits are set
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // dirty/flushing state is being thrown away: reset the accounting
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // NOTE(review): drops a ref presumably taken when the caps became
      // dirty/flushing — the paired get is not visible in this chunk
      put_inode(in.get());
    }
    // if we lost FILE_CACHE/FILE_BUFFER, the page cache contents are stale
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -EBLACKLISTED) {
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        // blacklisted: dirty data can never be written back, discard it
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4212
4213 int Client::_do_remount(bool retry_on_error)
4214 {
4215 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4216
4217 errno = 0;
4218 int r = remount_cb(callback_handle);
4219 if (r == 0) {
4220 retries_on_invalidate = 0;
4221 } else {
4222 int e = errno;
4223 client_t whoami = get_nodeid();
4224 if (r == -1) {
4225 lderr(cct) <<
4226 "failed to remount (to trim kernel dentries): "
4227 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4228 } else {
4229 lderr(cct) <<
4230 "failed to remount (to trim kernel dentries): "
4231 "return code = " << r << dendl;
4232 }
4233 bool should_abort =
4234 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4235 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4236 !(retry_on_error && (++retries_on_invalidate < max_retries));
4237 if (should_abort && !unmounting) {
4238 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4239 ceph_abort();
4240 }
4241 }
4242 return r;
4243 }
4244
4245 class C_Client_Remount : public Context {
4246 private:
4247 Client *client;
4248 public:
4249 explicit C_Client_Remount(Client *c) : client(c) {}
4250 void finish(int r) override {
4251 ceph_assert(r == 0);
4252 client->_do_remount(true);
4253 }
4254 };
4255
4256 void Client::_invalidate_kernel_dcache()
4257 {
4258 if (unmounting)
4259 return;
4260 if (can_invalidate_dentries) {
4261 if (dentry_invalidate_cb && root->dir) {
4262 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4263 p != root->dir->dentries.end();
4264 ++p) {
4265 if (p->second->inode)
4266 _schedule_invalidate_dentry_callback(p->second, false);
4267 }
4268 }
4269 } else if (remount_cb) {
4270 // Hacky:
4271 // when remounting a file system, linux kernel trims all unused dentries in the fs
4272 remount_finisher.queue(new C_Client_Remount(this));
4273 }
4274 }
4275
// Drop expireable negative (inode-less) dentries under 'in', but only when
// the directory consists entirely of null dentries; recurses into an open
// snapdir as well.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink() can erase the current map entry
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4300
4301 class C_Client_CacheRelease : public Context {
4302 private:
4303 Client *client;
4304 vinodeno_t ino;
4305 public:
4306 C_Client_CacheRelease(Client *c, Inode *in) :
4307 client(c) {
4308 if (client->use_faked_inos())
4309 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4310 else
4311 ino = in->vino();
4312 }
4313 void finish(int r) override {
4314 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4315 client->_async_inode_release(ino);
4316 }
4317 };
4318
4319 void Client::_async_inode_release(vinodeno_t ino)
4320 {
4321 if (unmounting)
4322 return;
4323 ldout(cct, 10) << __func__ << " " << ino << dendl;
4324 ino_release_cb(callback_handle, ino);
4325 }
4326
4327 void Client::_schedule_ino_release_callback(Inode *in) {
4328
4329 if (ino_release_cb)
4330 // we queue the invalidate, which calls the callback and decrements the ref
4331 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4332 }
4333
// Try to reduce the number of caps held on session 's' to 'max' (requested
// by the MDS).  Disposable non-auth duplicate caps are released outright;
// for other caps we queue expireable dentries for trimming so unused inodes
// (and thus their caps) can be dropped afterwards.  If we still exceed the
// target, fall back to invalidating the kernel dcache.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // keep the inode alive across remove_cap()
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        // advance before the dentry may be queued for removal
        ++q;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      // every dentry was expireable, so the inode itself should go away
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  // now actually trim the dentries collected above
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4404
4405 void Client::force_session_readonly(MetaSession *s)
4406 {
4407 s->readonly = true;
4408 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4409 auto &in = (*p)->inode;
4410 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4411 signal_cond_list(in.waitfor_caps);
4412 }
4413 }
4414
// Transition the inode's dirty caps into the "flushing" state: assign a new
// flush tid, record it on both the inode and the auth session, and clear the
// dirty bits.  Returns the cap bits being flushed; the tid is stored via
// *ptid.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // bits are now tracked as flushing rather than dirty
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4442
4443 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4444 {
4445 for (auto &p : in->cap_snaps) {
4446 CapSnap &capsnap = p.second;
4447 if (capsnap.flush_tid > 0) {
4448 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4449 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4450 }
4451 }
4452 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4453 it != in->flushing_cap_tids.end();
4454 ++it) {
4455 old_s->flushing_caps_tids.erase(it->first);
4456 new_s->flushing_caps_tids.insert(it->first);
4457 }
4458 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4459 }
4460
4461 /*
4462 * Flush all caps back to the MDS. Because the callers generally wait on the
4463 * result of this function (syncfs and umount cases), we set
4464 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4465 */
4466 void Client::flush_caps_sync()
4467 {
4468 ldout(cct, 10) << __func__ << dendl;
4469 xlist<Inode*>::iterator p = delayed_list.begin();
4470 while (!p.end()) {
4471 unsigned flags = CHECK_CAPS_NODELAY;
4472 Inode *in = *p;
4473
4474 ++p;
4475 delayed_list.pop_front();
4476 if (p.end() && dirty_list.empty())
4477 flags |= CHECK_CAPS_SYNCHRONOUS;
4478 check_caps(in, flags);
4479 }
4480
4481 // other caps, too
4482 p = dirty_list.begin();
4483 while (!p.end()) {
4484 unsigned flags = CHECK_CAPS_NODELAY;
4485 Inode *in = *p;
4486
4487 ++p;
4488 if (p.end())
4489 flags |= CHECK_CAPS_SYNCHRONOUS;
4490 check_caps(in, flags);
4491 }
4492 }
4493
4494 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4495 {
4496 while (in->flushing_caps) {
4497 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4498 ceph_assert(it != in->flushing_cap_tids.end());
4499 if (it->first > want)
4500 break;
4501 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4502 << ccap_string(it->second) << " want " << want
4503 << " last " << it->first << dendl;
4504 wait_on_list(in->waitfor_caps);
4505 }
4506 }
4507
// Block until every session has acked all cap flushes with tid <= want.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // client_lock is already held: adopt it so the condvar can drop it
      // while sleeping, then release ownership back without unlocking.
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      // session state may have changed while we slept; rescan from the top
      goto retry;
    }
  }
}
4528
// Re-send all pending cap flushes (and cap-snap flushes) for 'in' to its
// auth session, e.g. after reconnect.  Entries in flushing_cap_tids with a
// zero cap mask represent snap flushes.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the tid of the newest pending snap flush, so regular flushes sent
  // before it carry FLAG_PENDING_CAPSNAP
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // zero mask: snap flush; cap_snaps entries and zero-mask tids are
      // expected to line up in order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4562
4563 void Client::kick_flushing_caps(MetaSession *session)
4564 {
4565 mds_rank_t mds = session->mds_num;
4566 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4567
4568 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4569 Inode *in = *p;
4570 if (in->flags & I_KICK_FLUSH) {
4571 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4572 kick_flushing_caps(in, session);
4573 }
4574 }
4575 }
4576
// During reconnect, immediately re-send cap flushes whose caps the MDS had
// revoked; flushes whose caps are still fully issued are deferred (marked
// I_KICK_FLUSH) to the normal kick after reconnect completes.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4604
4605 void SnapRealm::build_snap_context()
4606 {
4607 set<snapid_t> snaps;
4608 snapid_t max_seq = seq;
4609
4610 // start with prior_parents?
4611 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4612 snaps.insert(prior_parent_snaps[i]);
4613
4614 // current parent's snaps
4615 if (pparent) {
4616 const SnapContext& psnapc = pparent->get_snap_context();
4617 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4618 if (psnapc.snaps[i] >= parent_since)
4619 snaps.insert(psnapc.snaps[i]);
4620 if (psnapc.seq > max_seq)
4621 max_seq = psnapc.seq;
4622 }
4623
4624 // my snaps
4625 for (unsigned i=0; i<my_snaps.size(); i++)
4626 snaps.insert(my_snaps[i]);
4627
4628 // ok!
4629 cached_snap_context.seq = max_seq;
4630 cached_snap_context.snaps.resize(0);
4631 cached_snap_context.snaps.reserve(snaps.size());
4632 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4633 cached_snap_context.snaps.push_back(*p);
4634 }
4635
4636 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4637 {
4638 list<SnapRealm*> q;
4639 q.push_back(realm);
4640
4641 while (!q.empty()) {
4642 realm = q.front();
4643 q.pop_front();
4644
4645 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4646 realm->invalidate_cache();
4647
4648 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4649 p != realm->pchildren.end();
4650 ++p)
4651 q.push_back(*p);
4652 }
4653 }
4654
4655 SnapRealm *Client::get_snap_realm(inodeno_t r)
4656 {
4657 SnapRealm *realm = snap_realms[r];
4658 if (!realm)
4659 snap_realms[r] = realm = new SnapRealm(r);
4660 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4661 realm->nref++;
4662 return realm;
4663 }
4664
4665 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4666 {
4667 if (snap_realms.count(r) == 0) {
4668 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4669 return NULL;
4670 }
4671 SnapRealm *realm = snap_realms[r];
4672 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4673 realm->nref++;
4674 return realm;
4675 }
4676
// Drop one reference on 'realm'.  On the last reference, unregister it from
// snap_realms, recursively drop the reference it held on its parent, and
// delete it.  Precondition: realm must be non-null (dereferenced for the
// log line below).
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4690
4691 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4692 {
4693 if (realm->parent != parent) {
4694 ldout(cct, 10) << __func__ << " " << *realm
4695 << " " << realm->parent << " -> " << parent << dendl;
4696 realm->parent = parent;
4697 if (realm->pparent) {
4698 realm->pparent->pchildren.erase(realm);
4699 put_snap_realm(realm->pparent);
4700 }
4701 realm->pparent = get_snap_realm(parent);
4702 realm->pparent->pchildren.insert(realm);
4703 return true;
4704 }
4705 return false;
4706 }
4707
4708 static bool has_new_snaps(const SnapContext& old_snapc,
4709 const SnapContext& new_snapc)
4710 {
4711 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4712 }
4713
4714
4715 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4716 {
4717 SnapRealm *first_realm = NULL;
4718 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4719
4720 map<SnapRealm*, SnapContext> dirty_realms;
4721
4722 auto p = bl.cbegin();
4723 while (!p.end()) {
4724 SnapRealmInfo info;
4725 decode(info, p);
4726 SnapRealm *realm = get_snap_realm(info.ino());
4727
4728 bool invalidate = false;
4729
4730 if (info.seq() > realm->seq) {
4731 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4732 << dendl;
4733
4734 if (flush) {
4735 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4736 // flush me + children
4737 list<SnapRealm*> q;
4738 q.push_back(realm);
4739 while (!q.empty()) {
4740 SnapRealm *realm = q.front();
4741 q.pop_front();
4742
4743 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4744 p != realm->pchildren.end();
4745 ++p)
4746 q.push_back(*p);
4747
4748 if (dirty_realms.count(realm) == 0) {
4749 realm->nref++;
4750 dirty_realms[realm] = realm->get_snap_context();
4751 }
4752 }
4753 }
4754
4755 // update
4756 realm->seq = info.seq();
4757 realm->created = info.created();
4758 realm->parent_since = info.parent_since();
4759 realm->prior_parent_snaps = info.prior_parent_snaps;
4760 realm->my_snaps = info.my_snaps;
4761 invalidate = true;
4762 }
4763
4764 // _always_ verify parent
4765 if (adjust_realm_parent(realm, info.parent()))
4766 invalidate = true;
4767
4768 if (invalidate) {
4769 invalidate_snaprealm_and_children(realm);
4770 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4771 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4772 } else {
4773 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4774 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4775 }
4776
4777 if (!first_realm)
4778 first_realm = realm;
4779 else
4780 put_snap_realm(realm);
4781 }
4782
4783 for (auto &[realm, snapc] : dirty_realms) {
4784 // if there are new snaps ?
4785 if (has_new_snaps(snapc, realm->get_snap_context())) {
4786 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4787 for (auto&& in : realm->inodes_with_caps) {
4788 queue_cap_snap(in, snapc);
4789 }
4790 } else {
4791 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4792 }
4793 put_snap_realm(realm);
4794 }
4795
4796 if (realm_ret)
4797 *realm_ret = first_realm;
4798 else
4799 put_snap_realm(first_realm);
4800 }
4801
// Handle an MClientSnap message.  For a SPLIT op, detach the listed inodes
// (and re-parent the listed child realms) from their old realm into the
// newly split realm, apply the snap trace, then re-attach the moved inodes
// and queue cap snaps for any that gained new snaps.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inodes moving to the split realm, with the snap context they had before
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// an inode already in a newer realm stays where it is
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
		   << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // don't flush caps against old snap info when the op is DESTROY
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the (now updated) split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4872
4873 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4874 {
4875 mds_rank_t mds = mds_rank_t(m->get_source().num());
4876 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4877 if (!session) {
4878 return;
4879 }
4880
4881 got_mds_push(session);
4882
4883 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4884
4885 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4886 if (inode_map.count(vino)) {
4887 Inode *in = NULL;
4888 in = inode_map[vino];
4889
4890 if (in) {
4891 in->quota = m->quota;
4892 in->rstat = m->rstat;
4893 }
4894 }
4895 }
4896
4897 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4898 {
4899 mds_rank_t mds = mds_rank_t(m->get_source().num());
4900 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4901 if (!session) {
4902 return;
4903 }
4904
4905 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4906 // Pause RADOS operations until we see the required epoch
4907 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4908 }
4909
4910 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4911 // Record the barrier so that we will transmit it to MDS when releasing
4912 set_cap_epoch_barrier(m->osd_epoch_barrier);
4913 }
4914
4915 got_mds_push(session);
4916
4917 Inode *in;
4918 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4919 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4920 in = it->second;
4921 } else {
4922 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4923 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4924 session->enqueue_cap_release(
4925 m->get_ino(),
4926 m->get_cap_id(),
4927 m->get_seq(),
4928 m->get_mseq(),
4929 cap_epoch_barrier);
4930 } else {
4931 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4932 }
4933
4934 // in case the mds is waiting on e.g. a revocation
4935 flush_cap_releases();
4936 return;
4937 }
4938
4939 switch (m->get_op()) {
4940 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4941 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4942 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4943 }
4944
4945 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4946 Cap &cap = in->caps.at(mds);
4947
4948 switch (m->get_op()) {
4949 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4950 case CEPH_CAP_OP_IMPORT:
4951 case CEPH_CAP_OP_REVOKE:
4952 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4953 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4954 }
4955 } else {
4956 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4957 return;
4958 }
4959 }
4960
// Handle CEPH_CAP_OP_IMPORT: this MDS is taking over (or newly granting)
// our cap — typically the receiving half of a cap migration started by a
// peer MDS's EXPORT.  Install/refresh the cap here, remove the old peer
// cap, and re-kick any flushes if we became the auth cap holder.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // remember the exporting peer's cap (if any) so it can be removed below
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // the new auth MDS doesn't know about our old max_size request
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5003
// Handle CEPH_CAP_OP_EXPORT: the sending MDS is migrating our cap to a peer
// MDS (m->peer set) or simply dropping it (no peer).  Fold the exported
// state into the peer's cap (creating one if needed), transfer auth-cap and
// flushing bookkeeping, then remove the local cap.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// cap is migrating to peer_mds
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // only merge if the peer's cap is older than the migration state
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // no cap on the peer yet; create one from the exported state
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: the cap is simply being dropped
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5048
5049 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5050 {
5051 mds_rank_t mds = session->mds_num;
5052 ceph_assert(in->caps.count(mds));
5053
5054 ldout(cct, 10) << __func__ << " on ino " << *in
5055 << " size " << in->size << " -> " << m->get_size()
5056 << dendl;
5057
5058 int issued;
5059 in->caps_issued(&issued);
5060 issued |= in->caps_dirty();
5061 update_inode_file_size(in, issued, m->get_size(),
5062 m->get_truncate_seq(), m->get_truncate_size());
5063 }
5064
// Handle CEPH_CAP_OP_FLUSH_ACK: the MDS has persisted the metadata we
// flushed under tid 'flush_ack_tid'.  Retire that tid (and any older
// regular-flush tids it implicitly covers), clear the acked bits from
// flushing_caps, and wake waiters.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  // NOTE(review): begin() is dereferenced without checking for an empty
  // flushing_cap_tids map — presumably an ack only arrives while a flush
  // is outstanding; verify against the MDS protocol.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
                   << " got unexpected flush ack tid " << flush_ack_tid
                   << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (!it->second) {
      // cap snap
      ++it;
      continue;
    }
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // this flush (and anything older) is covered by the ack
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a newer flush still has these bits in flight; they aren't clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if nothing older remains pending on this session
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.notify_all();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->flushing_cap_tids.empty())
	  in->flushing_cap_item.remove_myself();
      }
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5132
5133
// Handle FLUSHSNAP_ACK from the MDS: the snapped cap state for snapid
// 'follows' has been persisted, so drop the matching CapSnap record,
// retire its flush tid, and wake flush/sync waiters.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older resend of this capsnap; ignore, a newer ack will come
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we tear down its capsnap state below
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      // no dirty caps and no tids in flight -> off the session flushing list
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // if this was the session's oldest outstanding flush, syncers may go
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5166
// Context queued on the async_dentry_invalidator thread to tell the kernel
// (via dentry_invalidate_cb) that a cached dentry is stale. The parent dir
// vino, target vino (only when 'del') and dentry name are captured at
// construction time, since the Dentry may be freed before finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // dentry's inode; zeroed unless 'del' was set
  string name;        // dentry name within dirino
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5194
5195 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5196 {
5197 if (unmounting)
5198 return;
5199 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5200 << " in dir " << dirino << dendl;
5201 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5202 }
5203
5204 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5205 {
5206 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5207 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5208 }
5209
// Try to shed cached references to 'in' so it can be trimmed: expire child
// dentries (recursing into snapped subtrees), close an emptied Dir, trim an
// open .snap dir, and finally unlink parent dentries, optionally scheduling
// kernel dcache invalidation for them (sched_inval).
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink may invalidate the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one reference on 'in'
    }
  }

  // an open .snap dir also pins the inode; trim it as well
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink removes dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5256
// Handle a cap GRANT/REVOKE (also reached for IMPORT) from the MDS: refresh
// cached inode metadata covered by the newly shared caps, then reconcile our
// issued/implemented cap bits with what the MDS now grants. Revocations may
// trigger buffer flushes or cache releases before we can ack via check_caps.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // stale: the session lost and regained connectivity since this cap's gen
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only accept MDS-provided fields we do not hold exclusive caps for
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size: only the auth MDS manages how far we may write
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5412
5413 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5414 {
5415 if (perms.uid() == 0)
5416 return 0;
5417
5418 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5419 int ret = _posix_acl_permission(in, perms, want);
5420 if (ret != -EAGAIN)
5421 return ret;
5422 }
5423
5424 // check permissions before doing anything else
5425 if (!in->check_mode(perms, want))
5426 return -EACCES;
5427 return 0;
5428 }
5429
5430 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5431 const UserPerm& perms)
5432 {
5433 int r = _getattr_for_perm(in, perms);
5434 if (r < 0)
5435 goto out;
5436
5437 r = 0;
5438 if (strncmp(name, "system.", 7) == 0) {
5439 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5440 r = -EPERM;
5441 } else {
5442 r = inode_permission(in, perms, want);
5443 }
5444 out:
5445 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5446 return r;
5447 }
5448
5449 ostream& operator<<(ostream &out, const UserPerm& perm) {
5450 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5451 return out;
5452 }
5453
// Check whether 'perms' may apply the setattr described by (stx, mask) to
// 'in', mirroring POSIX chown/chmod/utimes rules (root bypasses most).
// Returns 0 if permitted, otherwise a negative errno (-EPERM/-EACCES/...).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncation requires plain write permission
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here every failed check falls out with -EPERM
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; owner may "change" it to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may switch group only to one of its own groups (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	       	     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's group lose the setgid bit
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // setting explicit (non-"now") times needs ownership; "now" only
      // needs write permission, like touch(1) on another user's file
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5510
// Check whether 'perms' may open 'in' with open(2) flags 'flags'.
// Derives MAY_READ/MAY_WRITE from O_ACCMODE (O_TRUNC implies write),
// rejects symlinks (-ELOOP) and writable opens of directories (-EISDIR),
// then defers to inode_permission(). Returns 0 or a negative errno.
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  // map the open accmode onto MAY_* permission bits
  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      // symlinks are never opened directly
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
	r = -EISDIR;
	goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5547
5548 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5549 {
5550 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5551 int r = _getattr_for_perm(dir, perms);
5552 if (r < 0)
5553 goto out;
5554
5555 r = inode_permission(dir, perms, MAY_EXEC);
5556 out:
5557 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5558 return r;
5559 }
5560
5561 int Client::may_create(Inode *dir, const UserPerm& perms)
5562 {
5563 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5564 int r = _getattr_for_perm(dir, perms);
5565 if (r < 0)
5566 goto out;
5567
5568 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5569 out:
5570 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5571 return r;
5572 }
5573
5574 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5575 {
5576 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5577 int r = _getattr_for_perm(dir, perms);
5578 if (r < 0)
5579 goto out;
5580
5581 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5582 if (r < 0)
5583 goto out;
5584
5585 /* 'name == NULL' means rmsnap */
5586 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5587 InodeRef otherin;
5588 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5589 if (r < 0)
5590 goto out;
5591 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5592 r = -EPERM;
5593 }
5594 out:
5595 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5596 return r;
5597 }
5598
// Check whether 'perms' may create a hardlink to 'in'. Non-owners are
// restricted: the target must be a regular file, must not be setuid, and
// must not be setgid+group-executable; otherwise read+write permission is
// required. These checks appear to mirror Linux's fs.protected_hardlinks
// behavior -- TODO confirm against the kernel semantics.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // root and the owner may always link
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5626
5627 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5628 {
5629 int mask = CEPH_STAT_CAP_MODE;
5630 bool force = false;
5631 if (acl_type != NO_ACL) {
5632 mask |= CEPH_STAT_CAP_XATTR;
5633 force = in->xattr_version == 0;
5634 }
5635 return _getattr(in, mask, perms, force);
5636 }
5637
5638 vinodeno_t Client::_get_vino(Inode *in)
5639 {
5640 /* The caller must hold the client lock */
5641 return vinodeno_t(in->ino, in->snapid);
5642 }
5643
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) try to parse as a role (rank or filesystem:rank)
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
	fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  // 2) try to parse as a raw numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5715
5716
/**
 * Authenticate with mon and establish global ID
 *
 * Caller must hold client_lock; it is dropped while the (blocking)
 * monclient authentication runs and reacquired afterwards.
 * Returns 0 on success or a negative errno from monclient.
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // authenticate() blocks on network I/O -- release our lock meanwhile
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  // the mon-assigned global id becomes our client entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5740
// Fetch a current FSMap (full map, or the trimmed "fsmap.user" variant when
// 'user' is true) from the monitors: learn the latest version, then
// subscribe and wait until our cached copy is at least that new. Caller
// holds client_lock; it is dropped while waiting on the monitor.
// Returns 0 on success or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);  // mon may ask us to retry

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // fsmap.user: reduced map for non-admin clients (fills fsmap_user)
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5784
/**
 * Send an admin command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Resolves the spec via the FSMap, skips laggy daemons, and fans the
 * command out to all remaining targets; 'onfinish' completes once every
 * per-daemon reply has arrived (see handle_command_reply). Returns 0 on
 * dispatch, or a negative errno if resolution/authentication fails.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need an up-to-date FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5871
5872 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5873 {
5874 ceph_tid_t const tid = m->get_tid();
5875
5876 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5877
5878 if (!command_table.exists(tid)) {
5879 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5880 return;
5881 }
5882
5883 auto &op = command_table.get_command(tid);
5884 if (op.outbl) {
5885 *op.outbl = m->get_data();
5886 }
5887 if (op.outs) {
5888 *op.outs = m->rs;
5889 }
5890
5891 if (op.on_finish) {
5892 op.on_finish->complete(m->r);
5893 }
5894
5895 command_table.erase(tid);
5896 }
5897
5898 // -------------------
5899 // MOUNT
5900
// Authenticate and subscribe to the MDSMap for the filesystem named by
// 'fs_name' (falling back to the "client_fs" / legacy "client_mds_namespace"
// config options, or the cluster default when all are empty). Sets 'fscid'
// when a specific filesystem is chosen. Returns 0 or a negative errno.
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
    if (resolved_fs_name.empty())
      // Try the backwards compatibility fs name option
      resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // named fs: translate the name to a cluster id via the user fsmap,
    // then subscribe to that filesystem's map ("mdsmap.<fscid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5940
// Mount the filesystem: subscribe to the MDSMap (optionally waiting for an
// available MDS cluster when 'require_mds'), then walk from the mount root
// up to the filesystem root issuing GETATTRs so quota/ancestor state is
// cached. Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each of its ancestors up to the root,
  // so quota information along the path is known to the client
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
   */
  return 0;
}
6034
6035 // UNMOUNT
6036
// Close every MDS session: drop already-rejected sessions, then send
// CLOSE to the rest and wait (bounded by client_shutdown_timeout) for the
// MDS acks; sessions that do not answer in time are force-closed with
// -ETIMEDOUT. Caller holds client_lock.
void Client::_close_sessions()
{
  // rejected sessions will never ack a close; discard them up front
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second.state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // adopt the already-held client_lock so mount_cond can release it while
    // waiting; l.release() below hands ownership back without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      mount_cond.wait(l);  // timeout of 0 means wait forever
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(&session, -ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6075
6076 void Client::flush_mdlog_sync()
6077 {
6078 if (mds_requests.empty())
6079 return;
6080 for (auto &p : mds_sessions) {
6081 flush_mdlog(&p.second);
6082 }
6083 }
6084
6085 void Client::flush_mdlog(MetaSession *session)
6086 {
6087 // Only send this to Luminous or newer MDS daemons, older daemons
6088 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6089 const uint64_t features = session->con->get_features();
6090 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6091 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6092 session->con->send_message2(std::move(m));
6093 }
6094 }
6095
6096
// Abort all in-flight MDS requests with 'err' and force-close every MDS
// session. Used on blacklisting / abortive unmount. Caller holds
// client_lock.
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;  // advance first; the request may be erased below
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // wake the thread blocked in make_request so it observes the abort
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session, err);
  }
}
6124
// Tear down the mount. When 'abort' (or blacklisted) drop everything on the
// floor: abort requests, purge caches, discard dirty caps. Otherwise flush
// dirty data and caps to the cluster first. Waits for outstanding requests,
// closes all open files/dirs, drains the cache, then closes MDS sessions.
// Caller holds client_lock (adopted here and handed back via release()).
void Client::_unmount(bool abort)
{
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for all outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and files opened via the low-level (ll_) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe (unacked) sync writes to settle
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
		    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // cluster is unreachable: dirty caps can never be flushed, drop them
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for all remaining inodes to be released (caps returned), dumping
  // the cache every 5s while stuck to aid debugging
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // hand lock ownership back to the caller without unlocking
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6262
6263 void Client::unmount()
6264 {
6265 std::lock_guard lock(client_lock);
6266 _unmount(false);
6267 }
6268
6269 void Client::abort_conn()
6270 {
6271 std::lock_guard lock(client_lock);
6272 _unmount(true);
6273 }
6274
6275 void Client::flush_cap_releases()
6276 {
6277 // send any cap releases
6278 for (auto &p : mds_sessions) {
6279 auto &session = p.second;
6280 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6281 p.first)) {
6282 if (cct->_conf->client_inject_release_failure) {
6283 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6284 } else {
6285 session.con->send_message2(std::move(session.release));
6286 }
6287 session.release.reset();
6288 }
6289 }
6290 }
6291
// Periodic housekeeping, re-armed via the client timer. Runs with
// client_lock held (the Timer callback asserts this below). Handles mount
// timeouts, cap renewal/release, delayed cap checks, cache trimming, and
// automatic reconnect after blacklisting.
6292 void Client::tick()
6293 {
// Debug hook: optionally stall one tick, then clear the knob so it only
// fires once.
6294 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6295 sleep(cct->_conf->client_debug_inject_tick_delay);
6296 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6297 cct->_conf.apply_changes(nullptr);
6298 }
6299
6300 ldout(cct, 21) << "tick" << dendl;
// Re-arm the next tick before doing any work.
6301 tick_event = timer.add_event_after(
6302 cct->_conf->client_tick_interval,
6303 new LambdaContext([this](int) {
6304 // Called back via Timer, which takes client_lock for us
6305 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6306 tick();
6307 }));
6308 utime_t now = ceph_clock_now();
6309
// If we are still mounting and the oldest request has exceeded the mount
// timeout, abort it and wake everyone waiting on the mdsmap/session opens.
6310 if (!mounted && !mds_requests.empty()) {
6311 MetaRequest *req = mds_requests.begin()->second;
6312 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6313 req->abort(-ETIMEDOUT);
6314 if (req->caller_cond) {
6315 req->kick = true;
6316 req->caller_cond->notify_all();
6317 }
6318 signal_cond_list(waiting_for_mdsmap);
6319 for (auto &p : mds_sessions) {
6320 signal_context_list(p.second.waiting_for_open);
6321 }
6322 }
6323 }
6324
6325 if (mdsmap->get_epoch()) {
6326 // renew caps?
6327 utime_t el = now - last_cap_renew;
6328 if (el > mdsmap->get_session_timeout() / 3.0)
6329 renew_caps();
6330
6331 flush_cap_releases();
6332 }
6333
6334 // delayed caps
// delayed_list is ordered by hold_caps_until, so stop at the first inode
// whose hold time hasn't expired yet.
6335 xlist<Inode*>::iterator p = delayed_list.begin();
6336 while (!p.end()) {
6337 Inode *in = *p;
6338 ++p;
6339 if (in->hold_caps_until > now)
6340 break;
6341 delayed_list.pop_front();
6342 check_caps(in, CHECK_CAPS_NODELAY);
6343 }
6344
6345 trim_cache(true);
6346
// Auto-reconnect after blacklisting: at most once per 30 minutes, and only
// if the admin enabled client_reconnect_stale.
6347 if (blacklisted && mounted &&
6348 last_auto_reconnect + 30 * 60 < now &&
6349 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6350 messenger->client_reset();
6351 fd_gen++; // invalidate open files
6352 blacklisted = false;
6353 _kick_stale_sessions();
6354 last_auto_reconnect = now;
6355 }
6356 }
6357
6358 void Client::renew_caps()
6359 {
6360 ldout(cct, 10) << "renew_caps()" << dendl;
6361 last_cap_renew = ceph_clock_now();
6362
6363 for (auto &p : mds_sessions) {
6364 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6365 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6366 renew_caps(&p.second);
6367 }
6368 }
6369
6370 void Client::renew_caps(MetaSession *session)
6371 {
6372 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6373 session->last_cap_renew_request = ceph_clock_now();
6374 uint64_t seq = ++session->cap_renew_seq;
6375 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6376 }
6377
6378
6379 // ===============================================================
6380 // high level (POSIXy) interface
6381
6382 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6383 InodeRef *target, const UserPerm& perms)
6384 {
6385 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6386 MetaRequest *req = new MetaRequest(op);
6387 filepath path;
6388 dir->make_nosnap_relative_path(path);
6389 path.push_dentry(name);
6390 req->set_filepath(path);
6391 req->set_inode(dir);
6392 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6393 mask |= DEBUG_GETATTR_CAPS;
6394 req->head.args.getattr.mask = mask;
6395
6396 ldout(cct, 10) << __func__ << " on " << path << dendl;
6397
6398 int r = make_request(req, perms, target);
6399 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6400 return r;
6401 }
6402
// Look up `dname` in `dir`, preferring the local dentry cache when a valid
// dentry lease or directory FILE_SHARED cap proves it is still fresh;
// otherwise fall through to an MDS lookup via _do_lookup(). `mask` is the
// set of caps the caller wants issued on the target. Returns 0 with
// *target set, or a negative errno (-ENOENT may be concluded locally when
// the directory is known complete).
6403 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6404 const UserPerm& perms)
6405 {
6406 int r = 0;
6407 Dentry *dn = NULL;
6408 // can only request shared caps
6409 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6410
// ".." needs the parent: use the cached parent dentry if we have one,
// otherwise ask a random MDS with LOOKUPPARENT.
6411 if (dname == "..") {
6412 if (dir->dentries.empty()) {
6413 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6414 filepath path(dir->ino);
6415 req->set_filepath(path);
6416
6417 InodeRef tmptarget;
6418 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6419
6420 if (r == 0) {
6421 *target = std::move(tmptarget);
6422 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6423 } else {
// fall back to the directory itself (e.g. ".." at the root)
6424 *target = dir;
6425 }
6426 }
6427 else
6428 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6429 goto done;
6430 }
6431
6432 if (dname == ".") {
6433 *target = dir;
6434 goto done;
6435 }
6436
6437 if (!dir->is_dir()) {
6438 r = -ENOTDIR;
6439 goto done;
6440 }
6441
6442 if (dname.length() > NAME_MAX) {
6443 r = -ENAMETOOLONG;
6444 goto done;
6445 }
6446
// The virtual snapshot directory (default ".snap") only exists on live
// (non-snapped) inodes.
6447 if (dname == cct->_conf->client_snapdir &&
6448 dir->snapid == CEPH_NOSNAP) {
6449 *target = open_snapdir(dir);
6450 goto done;
6451 }
6452
6453 if (dir->dir &&
6454 dir->dir->dentries.count(dname)) {
6455 dn = dir->dir->dentries[dname];
6456
6457 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6458 << " seq " << dn->lease_seq
6459 << dendl;
6460
// The cached dentry is only usable if its inode carries the caps the
// caller asked for (or it is a null dentry).
6461 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6462 // is dn lease valid?
6463 utime_t now = ceph_clock_now();
6464 if (dn->lease_mds >= 0 &&
6465 dn->lease_ttl > now &&
6466 mds_sessions.count(dn->lease_mds)) {
6467 MetaSession &s = mds_sessions.at(dn->lease_mds);
// lease is only valid while the issuing session's caps are live and the
// session generation hasn't changed since the lease was granted
6468 if (s.cap_ttl > now &&
6469 s.cap_gen == dn->lease_gen) {
6470 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6471 // make trim_caps() behave.
6472 dir->try_touch_cap(dn->lease_mds);
6473 goto hit_dn;
6474 }
6475 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6476 << " vs lease_gen " << dn->lease_gen << dendl;
6477 }
6478 // dir shared caps?
6479 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6480 if (dn->cap_shared_gen == dir->shared_gen &&
6481 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6482 goto hit_dn;
// a null dentry in a complete directory proves the name doesn't exist
6483 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6484 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6485 << *dir << " dn '" << dname << "'" << dendl;
6486 return -ENOENT;
6487 }
6488 }
6489 } else {
6490 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6491 }
6492 } else {
6493 // can we conclude ENOENT locally?
6494 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6495 (dir->flags & I_COMPLETE)) {
6496 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6497 return -ENOENT;
6498 }
6499 }
6500
// cache miss (or stale) — go to the MDS
6501 r = _do_lookup(dir, dname, mask, target, perms);
6502 goto done;
6503
6504 hit_dn:
6505 if (dn->inode) {
6506 *target = dn->inode;
6507 } else {
// valid null dentry: the name is known not to exist
6508 r = -ENOENT;
6509 }
6510 touch_dn(dn);
6511
6512 done:
6513 if (r < 0)
6514 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6515 else
6516 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6517 return r;
6518 }
6519
6520 int Client::get_or_create(Inode *dir, const char* name,
6521 Dentry **pdn, bool expect_null)
6522 {
6523 // lookup
6524 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6525 dir->open_dir();
6526 if (dir->dir->dentries.count(name)) {
6527 Dentry *dn = dir->dir->dentries[name];
6528
6529 // is dn lease valid?
6530 utime_t now = ceph_clock_now();
6531 if (dn->inode &&
6532 dn->lease_mds >= 0 &&
6533 dn->lease_ttl > now &&
6534 mds_sessions.count(dn->lease_mds)) {
6535 MetaSession &s = mds_sessions.at(dn->lease_mds);
6536 if (s.cap_ttl > now &&
6537 s.cap_gen == dn->lease_gen) {
6538 if (expect_null)
6539 return -EEXIST;
6540 }
6541 }
6542 *pdn = dn;
6543 } else {
6544 // otherwise link up a new one
6545 *pdn = link(dir->dir, name, NULL, NULL);
6546 }
6547
6548 // success
6549 return 0;
6550 }
6551
// Resolve `origpath` component by component, starting from root (absolute)
// or cwd (relative), and return the final inode in *end. Directory
// symlinks are always followed; a trailing symlink is only followed when
// `followsym` is set. `mask` is extra caps to request on the final
// component. Returns 0, or a negative errno (-ELOOP after MAXSYMLINKS).
6552 int Client::path_walk(const filepath& origpath, InodeRef *end,
6553 const UserPerm& perms, bool followsym, int mask)
6554 {
6555 filepath path = origpath;
6556 InodeRef cur;
6557 if (origpath.absolute())
6558 cur = root;
6559 else
6560 cur = cwd;
6561 ceph_assert(cur);
6562
6563 ldout(cct, 10) << __func__ << " " << path << dendl;
6564
6565 int symlinks = 0;
6566
6567 unsigned i=0;
6568 while (i < path.depth() && cur) {
6569 int caps = 0;
6570 const string &dname = path[i];
6571 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6572 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6573 InodeRef next;
6574 if (cct->_conf->client_permissions) {
6575 int r = may_lookup(cur.get(), perms);
6576 if (r < 0)
6577 return r;
6578 caps = CEPH_CAP_AUTH_SHARED;
6579 }
6580
6581 /* Get extra requested caps on the last component */
6582 if (i == (path.depth() - 1))
6583 caps |= mask;
6584 int r = _lookup(cur.get(), dname, caps, &next, perms);
6585 if (r < 0)
6586 return r;
6587 // only follow trailing symlink if followsym. always follow
6588 // 'directory' symlinks.
6589 if (next && next->is_symlink()) {
6590 symlinks++;
6591 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6592 if (symlinks > MAXSYMLINKS) {
6593 return -ELOOP;
6594 }
6595
6596 if (i < path.depth() - 1) {
6597 // dir symlink
6598 // replace consumed components of path with symlink dir target
// splice the link target in front of the remaining components and
// restart the walk from index 0 (from root if the target is absolute,
// otherwise from the current position)
6599 filepath resolved(next->symlink.c_str());
6600 resolved.append(path.postfixpath(i + 1));
6601 path = resolved;
6602 i = 0;
6603 if (next->symlink[0] == '/') {
6604 cur = root;
6605 }
6606 continue;
6607 } else if (followsym) {
6608 if (next->symlink[0] == '/') {
6609 path = next->symlink.c_str();
6610 i = 0;
6611 // reset position
6612 cur = root;
6613 } else {
6614 filepath more(next->symlink.c_str());
6615 // we need to remove the symlink component from off of the path
6616 // before adding the target that the symlink points to. remain
6617 // at the same position in the path.
6618 path.pop_dentry();
6619 path.append(more);
6620 }
6621 continue;
6622 }
6623 }
6624 cur.swap(next);
6625 i++;
6626 }
// cur can only be null if a lookup produced no inode
6627 if (!cur)
6628 return -ENOENT;
6629 if (end)
6630 end->swap(cur);
6631 return 0;
6632 }
6633
6634
6635 // namespace ops
6636
6637 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6638 {
6639 std::lock_guard lock(client_lock);
6640 tout(cct) << "link" << std::endl;
6641 tout(cct) << relexisting << std::endl;
6642 tout(cct) << relpath << std::endl;
6643
6644 if (unmounting)
6645 return -ENOTCONN;
6646
6647 filepath existing(relexisting);
6648
6649 InodeRef in, dir;
6650 int r = path_walk(existing, &in, perm, true);
6651 if (r < 0)
6652 return r;
6653 if (std::string(relpath) == "/") {
6654 r = -EEXIST;
6655 return r;
6656 }
6657 filepath path(relpath);
6658 string name = path.last_dentry();
6659 path.pop_dentry();
6660
6661 r = path_walk(path, &dir, perm, true);
6662 if (r < 0)
6663 return r;
6664 if (cct->_conf->client_permissions) {
6665 if (S_ISDIR(in->mode)) {
6666 r = -EPERM;
6667 return r;
6668 }
6669 r = may_hardlink(in.get(), perm);
6670 if (r < 0)
6671 return r;
6672 r = may_create(dir.get(), perm);
6673 if (r < 0)
6674 return r;
6675 }
6676 r = _link(in.get(), dir.get(), name.c_str(), perm);
6677 return r;
6678 }
6679
6680 int Client::unlink(const char *relpath, const UserPerm& perm)
6681 {
6682 std::lock_guard lock(client_lock);
6683 tout(cct) << __func__ << std::endl;
6684 tout(cct) << relpath << std::endl;
6685
6686 if (unmounting)
6687 return -ENOTCONN;
6688
6689 if (std::string(relpath) == "/")
6690 return -EISDIR;
6691
6692 filepath path(relpath);
6693 string name = path.last_dentry();
6694 path.pop_dentry();
6695 InodeRef dir;
6696 int r = path_walk(path, &dir, perm);
6697 if (r < 0)
6698 return r;
6699 if (cct->_conf->client_permissions) {
6700 r = may_delete(dir.get(), name.c_str(), perm);
6701 if (r < 0)
6702 return r;
6703 }
6704 return _unlink(dir.get(), name.c_str(), perm);
6705 }
6706
6707 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6708 {
6709 std::lock_guard lock(client_lock);
6710 tout(cct) << __func__ << std::endl;
6711 tout(cct) << relfrom << std::endl;
6712 tout(cct) << relto << std::endl;
6713
6714 if (unmounting)
6715 return -ENOTCONN;
6716
6717 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6718 return -EBUSY;
6719
6720 filepath from(relfrom);
6721 filepath to(relto);
6722 string fromname = from.last_dentry();
6723 from.pop_dentry();
6724 string toname = to.last_dentry();
6725 to.pop_dentry();
6726
6727 InodeRef fromdir, todir;
6728 int r = path_walk(from, &fromdir, perm);
6729 if (r < 0)
6730 goto out;
6731 r = path_walk(to, &todir, perm);
6732 if (r < 0)
6733 goto out;
6734
6735 if (cct->_conf->client_permissions) {
6736 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6737 if (r < 0)
6738 return r;
6739 r = may_delete(todir.get(), toname.c_str(), perm);
6740 if (r < 0 && r != -ENOENT)
6741 return r;
6742 }
6743 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6744 out:
6745 return r;
6746 }
6747
6748 // dirs
6749
6750 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6751 {
6752 std::lock_guard lock(client_lock);
6753 tout(cct) << __func__ << std::endl;
6754 tout(cct) << relpath << std::endl;
6755 tout(cct) << mode << std::endl;
6756 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6757
6758 if (unmounting)
6759 return -ENOTCONN;
6760
6761 if (std::string(relpath) == "/")
6762 return -EEXIST;
6763
6764 filepath path(relpath);
6765 string name = path.last_dentry();
6766 path.pop_dentry();
6767 InodeRef dir;
6768 int r = path_walk(path, &dir, perm);
6769 if (r < 0)
6770 return r;
6771 if (cct->_conf->client_permissions) {
6772 r = may_create(dir.get(), perm);
6773 if (r < 0)
6774 return r;
6775 }
6776 return _mkdir(dir.get(), name.c_str(), mode, perm);
6777 }
6778
// mkdir -p: walk as many existing components of relpath as possible, then
// create the remaining directories one level at a time. Returns 0 on
// success, or a negative errno. Note: the walk starts from cwd, so absolute
// paths rely on path[] components being relative to the root cwd.
6779 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6780 {
6781 std::lock_guard lock(client_lock);
6782 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6783 tout(cct) << __func__ << std::endl;
6784 tout(cct) << relpath << std::endl;
6785 tout(cct) << mode << std::endl;
6786
6787 if (unmounting)
6788 return -ENOTCONN;
6789
6790 //get through existing parts of path
6791 filepath path(relpath);
6792 unsigned int i;
6793 int r = 0, caps = 0;
6794 InodeRef cur, next;
6795 cur = cwd;
6796 for (i=0; i<path.depth(); ++i) {
6797 if (cct->_conf->client_permissions) {
6798 r = may_lookup(cur.get(), perms);
6799 if (r < 0)
6800 break;
6801 caps = CEPH_CAP_AUTH_SHARED;
6802 }
6803 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6804 if (r < 0)
6805 break;
6806 cur.swap(next);
6807 }
// Only -ENOENT means "start creating here"; any other error (including
// r == 0, i.e. the full path already exists) is returned as-is.
6808 if (r!=-ENOENT) return r;
6809 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6810 //make new directory at each level
6811 for (; i<path.depth(); ++i) {
6812 if (cct->_conf->client_permissions) {
6813 r = may_create(cur.get(), perms);
6814 if (r < 0)
6815 return r;
6816 }
6817 //make new dir
6818 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6819
6820 //check proper creation/existence
// A racing creator may have made an intermediate dir; treat EEXIST on a
// non-final component as success and look it up instead.
6821 if(-EEXIST == r && i < path.depth() - 1) {
6822 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6823 }
6824 if (r < 0)
6825 return r;
6826 //move to new dir and continue
6827 cur.swap(next);
6828 ldout(cct, 20) << __func__ << ": successfully created directory "
6829 << filepath(cur->ino).get_path() << dendl;
6830 }
6831 return 0;
6832 }
6833
6834 int Client::rmdir(const char *relpath, const UserPerm& perms)
6835 {
6836 std::lock_guard lock(client_lock);
6837 tout(cct) << __func__ << std::endl;
6838 tout(cct) << relpath << std::endl;
6839
6840 if (unmounting)
6841 return -ENOTCONN;
6842
6843 if (std::string(relpath) == "/")
6844 return -EBUSY;
6845
6846 filepath path(relpath);
6847 string name = path.last_dentry();
6848 path.pop_dentry();
6849 InodeRef dir;
6850 int r = path_walk(path, &dir, perms);
6851 if (r < 0)
6852 return r;
6853 if (cct->_conf->client_permissions) {
6854 int r = may_delete(dir.get(), name.c_str(), perms);
6855 if (r < 0)
6856 return r;
6857 }
6858 return _rmdir(dir.get(), name.c_str(), perms);
6859 }
6860
6861 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6862 {
6863 std::lock_guard lock(client_lock);
6864 tout(cct) << __func__ << std::endl;
6865 tout(cct) << relpath << std::endl;
6866 tout(cct) << mode << std::endl;
6867 tout(cct) << rdev << std::endl;
6868
6869 if (unmounting)
6870 return -ENOTCONN;
6871
6872 if (std::string(relpath) == "/")
6873 return -EEXIST;
6874
6875 filepath path(relpath);
6876 string name = path.last_dentry();
6877 path.pop_dentry();
6878 InodeRef dir;
6879 int r = path_walk(path, &dir, perms);
6880 if (r < 0)
6881 return r;
6882 if (cct->_conf->client_permissions) {
6883 int r = may_create(dir.get(), perms);
6884 if (r < 0)
6885 return r;
6886 }
6887 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6888 }
6889
6890 // symlinks
6891
6892 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6893 {
6894 std::lock_guard lock(client_lock);
6895 tout(cct) << __func__ << std::endl;
6896 tout(cct) << target << std::endl;
6897 tout(cct) << relpath << std::endl;
6898
6899 if (unmounting)
6900 return -ENOTCONN;
6901
6902 if (std::string(relpath) == "/")
6903 return -EEXIST;
6904
6905 filepath path(relpath);
6906 string name = path.last_dentry();
6907 path.pop_dentry();
6908 InodeRef dir;
6909 int r = path_walk(path, &dir, perms);
6910 if (r < 0)
6911 return r;
6912 if (cct->_conf->client_permissions) {
6913 int r = may_create(dir.get(), perms);
6914 if (r < 0)
6915 return r;
6916 }
6917 return _symlink(dir.get(), name.c_str(), target, perms);
6918 }
6919
6920 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6921 {
6922 std::lock_guard lock(client_lock);
6923 tout(cct) << __func__ << std::endl;
6924 tout(cct) << relpath << std::endl;
6925
6926 if (unmounting)
6927 return -ENOTCONN;
6928
6929 filepath path(relpath);
6930 InodeRef in;
6931 int r = path_walk(path, &in, perms, false);
6932 if (r < 0)
6933 return r;
6934
6935 return _readlink(in.get(), buf, size);
6936 }
6937
6938 int Client::_readlink(Inode *in, char *buf, size_t size)
6939 {
6940 if (!in->is_symlink())
6941 return -EINVAL;
6942
6943 // copy into buf (at most size bytes)
6944 int r = in->symlink.length();
6945 if (r > (int)size)
6946 r = size;
6947 memcpy(buf, in->symlink.c_str(), r);
6948 return r;
6949 }
6950
6951
6952 // inode stuff
6953
6954 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6955 {
6956 bool yes = in->caps_issued_mask(mask, true);
6957
6958 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6959 if (yes && !force)
6960 return 0;
6961
6962 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6963 filepath path;
6964 in->make_nosnap_relative_path(path);
6965 req->set_filepath(path);
6966 req->set_inode(in);
6967 req->head.args.getattr.mask = mask;
6968
6969 int res = make_request(req, perms);
6970 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6971 return res;
6972 }
6973
// Apply the setattr described by (stx, mask) to `in`. Fields we hold
// exclusive caps for are changed locally and marked dirty (written back
// later); whatever remains in `mask` afterwards is sent to the MDS as a
// SETATTR request. Returns 0 or a negative errno (-EROFS on snapshots,
// -EDQUOT on quota, -EFBIG on oversize truncate).
6974 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6975 const UserPerm& perms, InodeRef *inp)
6976 {
6977 int issued = in->caps_issued();
6978
6979 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6980 ccap_string(issued) << dendl;
6981
// snapshots are immutable
6982 if (in->snapid != CEPH_NOSNAP) {
6983 return -EROFS;
6984 }
// growing the file must respect byte quotas
6985 if ((mask & CEPH_SETATTR_SIZE) &&
6986 (unsigned long)stx->stx_size > in->size &&
6987 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6988 perms)) {
6989 return -EDQUOT;
6990 }
6991
6992 // make the change locally?
// If a different user already dirtied this inode's caps, don't mix
// credentials in the local dirty state — force a synchronous request.
6993 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6994 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6995 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6996 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6997 << in->cap_dirtier_gid << ", forcing sync setattr"
6998 << dendl;
6999 /*
7000 * This works because we implicitly flush the caps as part of the
7001 * request, so the cap update check will happen with the writeback
7002 * cap context, and then the setattr check will happen with the
7003 * caller's context.
7004 *
7005 * In reality this pattern is likely pretty rare (different users
7006 * setattr'ing the same file). If that turns out not to be the
7007 * case later, we can build a more complex pipelined cap writeback
7008 * infrastructure...
7009 */
7010 if (!mask)
7011 mask |= CEPH_SETATTR_CTIME;
7012 goto force_request;
7013 }
7014
7015 if (!mask) {
7016 // caller just needs us to bump the ctime
7017 in->ctime = ceph_clock_now();
7018 in->cap_dirtier_uid = perms.uid();
7019 in->cap_dirtier_gid = perms.gid();
// mark whichever exclusive cap we hold dirty so the new ctime is flushed;
// if we hold none, fall through to an MDS request below
7020 if (issued & CEPH_CAP_AUTH_EXCL)
7021 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7022 else if (issued & CEPH_CAP_FILE_EXCL)
7023 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7024 else if (issued & CEPH_CAP_XATTR_EXCL)
7025 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7026 else
7027 mask |= CEPH_SETATTR_CTIME;
7028 }
7029
// With AUTH_EXCL we may change ownership/mode/btime locally; each handled
// field is cleared from `mask`.
7030 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7031 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7032
7033 mask &= ~CEPH_SETATTR_KILL_SGUID;
7034
7035 if (mask & CEPH_SETATTR_UID) {
7036 in->ctime = ceph_clock_now();
7037 in->cap_dirtier_uid = perms.uid();
7038 in->cap_dirtier_gid = perms.gid();
7039 in->uid = stx->stx_uid;
7040 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7041 mask &= ~CEPH_SETATTR_UID;
7042 kill_sguid = true;
7043 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7044 }
7045 if (mask & CEPH_SETATTR_GID) {
7046 in->ctime = ceph_clock_now();
7047 in->cap_dirtier_uid = perms.uid();
7048 in->cap_dirtier_gid = perms.gid();
7049 in->gid = stx->stx_gid;
7050 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7051 mask &= ~CEPH_SETATTR_GID;
7052 kill_sguid = true;
7053 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7054 }
7055
7056 if (mask & CEPH_SETATTR_MODE) {
7057 in->ctime = ceph_clock_now();
7058 in->cap_dirtier_uid = perms.uid();
7059 in->cap_dirtier_gid = perms.gid();
7060 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7061 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7062 mask &= ~CEPH_SETATTR_MODE;
7063 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7064 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7065 /* Must squash the any setuid/setgid bits with an ownership change */
7066 in->mode &= ~(S_ISUID|S_ISGID);
7067 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7068 }
7069
7070 if (mask & CEPH_SETATTR_BTIME) {
7071 in->ctime = ceph_clock_now();
7072 in->cap_dirtier_uid = perms.uid();
7073 in->cap_dirtier_gid = perms.gid();
7074 in->btime = utime_t(stx->stx_btime);
7075 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7076 mask &= ~CEPH_SETATTR_BTIME;
7077 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7078 }
7079 } else if (mask & CEPH_SETATTR_SIZE) {
7080 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7081 mask |= CEPH_SETATTR_KILL_SGUID;
7082 }
7083
// With FILE_EXCL we may update mtime/atime locally; time_warp_seq tells
// the MDS the local timestamps take precedence.
7084 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7085 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7086 if (mask & CEPH_SETATTR_MTIME)
7087 in->mtime = utime_t(stx->stx_mtime);
7088 if (mask & CEPH_SETATTR_ATIME)
7089 in->atime = utime_t(stx->stx_atime);
7090 in->ctime = ceph_clock_now();
7091 in->cap_dirtier_uid = perms.uid();
7092 in->cap_dirtier_gid = perms.gid();
7093 in->time_warp_seq++;
7094 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7095 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7096 }
7097 }
// everything was satisfied locally — no MDS request needed
7098 if (!mask) {
7099 in->change_attr++;
7100 return 0;
7101 }
7102
7103 force_request:
// Build the SETATTR request for the fields we couldn't handle locally.
// inode_drop lists caps the MDS should revoke from us as part of the op.
7104 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7105
7106 filepath path;
7107
7108 in->make_nosnap_relative_path(path);
7109 req->set_filepath(path);
7110 req->set_inode(in);
7111
7112 if (mask & CEPH_SETATTR_KILL_SGUID) {
7113 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7114 }
7115 if (mask & CEPH_SETATTR_MODE) {
7116 req->head.args.setattr.mode = stx->stx_mode;
7117 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7118 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7119 }
7120 if (mask & CEPH_SETATTR_UID) {
7121 req->head.args.setattr.uid = stx->stx_uid;
7122 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7123 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7124 }
7125 if (mask & CEPH_SETATTR_GID) {
7126 req->head.args.setattr.gid = stx->stx_gid;
7127 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7128 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7129 }
7130 if (mask & CEPH_SETATTR_BTIME) {
7131 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7132 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7133 }
7134 if (mask & CEPH_SETATTR_MTIME) {
7135 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7136 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7137 CEPH_CAP_FILE_WR;
7138 }
7139 if (mask & CEPH_SETATTR_ATIME) {
7140 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7141 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7142 CEPH_CAP_FILE_WR;
7143 }
7144 if (mask & CEPH_SETATTR_SIZE) {
7145 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7146 req->head.args.setattr.size = stx->stx_size;
7147 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7148 } else { //too big!
7149 put_request(req);
7150 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7151 return -EFBIG;
7152 }
7153 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7154 CEPH_CAP_FILE_WR;
7155 }
7156 req->head.args.setattr.mask = mask;
7157
7158 req->regetattr_mask = mask;
7159
7160 int res = make_request(req, perms, inp);
7161 ldout(cct, 10) << "_setattr result=" << res << dendl;
7162 return res;
7163 }
7164
7165 /* Note that we only care about attrs that setattr cares about */
7166 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7167 {
7168 stx->stx_size = st->st_size;
7169 stx->stx_mode = st->st_mode;
7170 stx->stx_uid = st->st_uid;
7171 stx->stx_gid = st->st_gid;
7172 #ifdef __APPLE__
7173 stx->stx_mtime = st->st_mtimespec;
7174 stx->stx_atime = st->st_atimespec;
7175 #else
7176 stx->stx_mtime = st->st_mtim;
7177 stx->stx_atime = st->st_atim;
7178 #endif
7179 }
7180
7181 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7182 const UserPerm& perms, InodeRef *inp)
7183 {
7184 int ret = _do_setattr(in, stx, mask, perms, inp);
7185 if (ret < 0)
7186 return ret;
7187 if (mask & CEPH_SETATTR_MODE)
7188 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7189 return ret;
7190 }
7191
7192 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7193 const UserPerm& perms)
7194 {
7195 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7196 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7197 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7198 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7199 if (cct->_conf->client_permissions) {
7200 int r = may_setattr(in.get(), stx, mask, perms);
7201 if (r < 0)
7202 return r;
7203 }
7204 return __setattrx(in.get(), stx, mask, perms);
7205 }
7206
7207 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7208 const UserPerm& perms)
7209 {
7210 struct ceph_statx stx;
7211
7212 stat_to_statx(attr, &stx);
7213 mask &= ~CEPH_SETATTR_BTIME;
7214
7215 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7216 mask &= ~CEPH_SETATTR_UID;
7217 }
7218 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7219 mask &= ~CEPH_SETATTR_GID;
7220 }
7221
7222 return _setattrx(in, &stx, mask, perms);
7223 }
7224
7225 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7226 const UserPerm& perms)
7227 {
7228 std::lock_guard lock(client_lock);
7229 tout(cct) << __func__ << std::endl;
7230 tout(cct) << relpath << std::endl;
7231 tout(cct) << mask << std::endl;
7232
7233 if (unmounting)
7234 return -ENOTCONN;
7235
7236 filepath path(relpath);
7237 InodeRef in;
7238 int r = path_walk(path, &in, perms);
7239 if (r < 0)
7240 return r;
7241 return _setattr(in, attr, mask, perms);
7242 }
7243
7244 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7245 const UserPerm& perms, int flags)
7246 {
7247 std::lock_guard lock(client_lock);
7248 tout(cct) << __func__ << std::endl;
7249 tout(cct) << relpath << std::endl;
7250 tout(cct) << mask << std::endl;
7251
7252 if (unmounting)
7253 return -ENOTCONN;
7254
7255 filepath path(relpath);
7256 InodeRef in;
7257 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7258 if (r < 0)
7259 return r;
7260 return _setattrx(in, stx, mask, perms);
7261 }
7262
7263 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7264 {
7265 std::lock_guard lock(client_lock);
7266 tout(cct) << __func__ << std::endl;
7267 tout(cct) << fd << std::endl;
7268 tout(cct) << mask << std::endl;
7269
7270 if (unmounting)
7271 return -ENOTCONN;
7272
7273 Fh *f = get_filehandle(fd);
7274 if (!f)
7275 return -EBADF;
7276 #if defined(__linux__) && defined(O_PATH)
7277 if (f->flags & O_PATH)
7278 return -EBADF;
7279 #endif
7280 return _setattr(f->inode, attr, mask, perms);
7281 }
7282
7283 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7284 {
7285 std::lock_guard lock(client_lock);
7286 tout(cct) << __func__ << std::endl;
7287 tout(cct) << fd << std::endl;
7288 tout(cct) << mask << std::endl;
7289
7290 if (unmounting)
7291 return -ENOTCONN;
7292
7293 Fh *f = get_filehandle(fd);
7294 if (!f)
7295 return -EBADF;
7296 #if defined(__linux__) && defined(O_PATH)
7297 if (f->flags & O_PATH)
7298 return -EBADF;
7299 #endif
7300 return _setattrx(f->inode, stx, mask, perms);
7301 }
7302
/* stat(2) analogue: resolve @relpath (following symlinks), refresh the
 * attributes selected by @mask from the MDS via _getattr(), then fill
 * @stbuf (and optionally @dirstat) from the in-memory inode.
 * Returns 0 on success or a negative errno. */
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // path_walk already requests @mask so the terminal lookup can fetch caps
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7328
7329 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7330 {
7331 unsigned mask = 0;
7332
7333 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7334 if (flags & AT_NO_ATTR_SYNC)
7335 goto out;
7336
7337 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7338 mask |= CEPH_CAP_PIN;
7339 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7340 mask |= CEPH_CAP_AUTH_SHARED;
7341 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7342 mask |= CEPH_CAP_LINK_SHARED;
7343 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7344 mask |= CEPH_CAP_FILE_SHARED;
7345 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7346 mask |= CEPH_CAP_XATTR_SHARED;
7347 out:
7348 return mask;
7349 }
7350
/* statx(2) analogue: resolve @relpath (honouring AT_SYMLINK_NOFOLLOW in
 * @flags), sync the caps implied by @want, and fill @stx.  The caller
 * learns which fields are valid from stx->stx_mask. */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  // convert statx want/flags into a CEPH_CAP_* mask (0 == serve from cache)
  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
7382
/* lstat(2) analogue: identical to stat() except a trailing symlink is
 * stat'ed itself rather than followed. */
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7409
/* Populate @st from the cached inode @in; optionally copy out the
 * directory fragstat/rstat.  Returns the caps currently issued on the
 * inode (callers may use this to judge attribute freshness). */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapid doubles as the device number so snapshots appear as
  // distinct filesystems
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // directories synthesize a POSIX-style link count
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();  // a dir can never have multiple hard links
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report the later of ctime/mtime as ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // dir "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7471
/* Populate @stx from the cached inode @in.  @mask is a CEPH_CAP_* mask
 * (as produced by statx_to_mask), NOT a CEPH_STATX_* mask: each cap bit
 * gates the fields it protects, and stx->stx_mask records which statx
 * fields were actually filled. */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;  // now the full mode, not just the type bits
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize a POSIX-style link count for directories
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();  // a dir can never have multiple hard links
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // dir "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report the later of ctime/mtime as ctime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7554
// Mark dentry @dn as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7559
7560 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7561 {
7562 std::lock_guard lock(client_lock);
7563 tout(cct) << __func__ << std::endl;
7564 tout(cct) << relpath << std::endl;
7565 tout(cct) << mode << std::endl;
7566
7567 if (unmounting)
7568 return -ENOTCONN;
7569
7570 filepath path(relpath);
7571 InodeRef in;
7572 int r = path_walk(path, &in, perms);
7573 if (r < 0)
7574 return r;
7575 struct stat attr;
7576 attr.st_mode = mode;
7577 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7578 }
7579
7580 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7581 {
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7584 tout(cct) << fd << std::endl;
7585 tout(cct) << mode << std::endl;
7586
7587 if (unmounting)
7588 return -ENOTCONN;
7589
7590 Fh *f = get_filehandle(fd);
7591 if (!f)
7592 return -EBADF;
7593 #if defined(__linux__) && defined(O_PATH)
7594 if (f->flags & O_PATH)
7595 return -EBADF;
7596 #endif
7597 struct stat attr;
7598 attr.st_mode = mode;
7599 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7600 }
7601
7602 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7603 {
7604 std::lock_guard lock(client_lock);
7605 tout(cct) << __func__ << std::endl;
7606 tout(cct) << relpath << std::endl;
7607 tout(cct) << mode << std::endl;
7608
7609 if (unmounting)
7610 return -ENOTCONN;
7611
7612 filepath path(relpath);
7613 InodeRef in;
7614 // don't follow symlinks
7615 int r = path_walk(path, &in, perms, false);
7616 if (r < 0)
7617 return r;
7618 struct stat attr;
7619 attr.st_mode = mode;
7620 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7621 }
7622
7623 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7624 const UserPerm& perms)
7625 {
7626 std::lock_guard lock(client_lock);
7627 tout(cct) << __func__ << std::endl;
7628 tout(cct) << relpath << std::endl;
7629 tout(cct) << new_uid << std::endl;
7630 tout(cct) << new_gid << std::endl;
7631
7632 if (unmounting)
7633 return -ENOTCONN;
7634
7635 filepath path(relpath);
7636 InodeRef in;
7637 int r = path_walk(path, &in, perms);
7638 if (r < 0)
7639 return r;
7640 struct stat attr;
7641 attr.st_uid = new_uid;
7642 attr.st_gid = new_gid;
7643 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7644 }
7645
/* fchown(2) analogue: change owner/group via an open file descriptor.
 * A uid/gid of -1 means "leave unchanged" (its mask bit is dropped). */
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // an O_PATH handle cannot be used to modify the file
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  // only request the fields that are not the -1 "unchanged" sentinel
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
7672
/* lchown(2) analogue: like chown() but a trailing symlink is modified
 * itself rather than followed.  -1 means "leave unchanged". */
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  // only request the fields that are not the -1 "unchanged" sentinel
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
7699
// Helper: copy @atime/@mtime into the (platform-dependent) timestamp
// fields of @attr via the stat_set_* accessors.
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7709
7710 // for [l]utime() invoke the timeval variant as the timespec
7711 // variant are not yet implemented. for futime[s](), invoke
7712 // the timespec variant.
7713 int Client::utime(const char *relpath, struct utimbuf *buf,
7714 const UserPerm& perms)
7715 {
7716 struct timeval tv[2];
7717 tv[0].tv_sec = buf->actime;
7718 tv[0].tv_usec = 0;
7719 tv[1].tv_sec = buf->modtime;
7720 tv[1].tv_usec = 0;
7721
7722 return utimes(relpath, tv, perms);
7723 }
7724
7725 int Client::lutime(const char *relpath, struct utimbuf *buf,
7726 const UserPerm& perms)
7727 {
7728 struct timeval tv[2];
7729 tv[0].tv_sec = buf->actime;
7730 tv[0].tv_usec = 0;
7731 tv[1].tv_sec = buf->modtime;
7732 tv[1].tv_usec = 0;
7733
7734 return lutimes(relpath, tv, perms);
7735 }
7736
7737 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7738 {
7739 struct timespec ts[2];
7740 ts[0].tv_sec = buf->actime;
7741 ts[0].tv_nsec = 0;
7742 ts[1].tv_sec = buf->modtime;
7743 ts[1].tv_nsec = 0;
7744
7745 return futimens(fd, ts, perms);
7746 }
7747
/* utimes(2) analogue: set atime/mtime (microsecond resolution) on the
 * inode named by @relpath, following symlinks. */
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimes(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7774
/* lutimes(2) analogue: like utimes() but a trailing symlink is
 * modified itself rather than followed. */
int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // final 'false' = don't follow a trailing symlink
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimes(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7801
7802 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7803 {
7804 struct timespec ts[2];
7805 ts[0].tv_sec = times[0].tv_sec;
7806 ts[0].tv_nsec = times[0].tv_usec * 1000;
7807 ts[1].tv_sec = times[1].tv_sec;
7808 ts[1].tv_nsec = times[1].tv_usec * 1000;
7809
7810 return futimens(fd, ts, perms);
7811 }
7812
/* futimens(3) analogue: set atime/mtime (nanosecond resolution) via an
 * open file descriptor.  All other f*time* entry points funnel here. */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // an O_PATH handle cannot be used to modify the file
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimensat(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7840
/* flock(2) analogue: advisory whole-file lock on an open fd.
 * @operation is a BSD flock()-style opcode (interpreted by _flock());
 * @owner identifies the lock holder. */
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7858
/* opendir(3) analogue: resolve @relpath and hand back a directory
 * stream in *@dirpp.  With client_permissions enabled, read access to
 * the directory is checked first. */
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7884
/* Internal opendir: allocate a dir_result_t for @in (must be a
 * directory) and register it in opened_dirs.  Caller holds client_lock. */
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
7894
7895
/* closedir(3) analogue: release a stream from opendir().  Always
 * succeeds. */
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7906
/* Internal closedir: drop the inode ref, free the readdir buffer,
 * deregister and delete @dirp.  Caller holds client_lock. */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7918
/* rewinddir(3) analogue: discard any buffered entries and reset the
 * stream to the beginning. */
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7931
/* telldir(3) analogue: return the stream's current offset.
 * NOTE(review): unlike seekdir()/rewinddir() this reads dirp->offset
 * without taking client_lock — confirm callers serialize access. */
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7938
/* seekdir(3) analogue: reposition the stream to @offset (a value
 * previously returned by telldir), invalidating buffered entries and
 * readdir-cache bookkeeping as needed. */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // any seek breaks the invariants needed to populate/trust the
  // shared readdir cache for this traversal
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash order: only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: refetch when leaving the buffered frag or seeking
    // backward within it (or rewinding to the start)
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7972
7973
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Populate a struct dirent for the readdir callbacks.  @type is a
// stat-style mode (S_IF*), converted to DT_* via IFTODT; @next_off is
// the offset of the entry after this one.  Names longer than 255 bytes
// are silently truncated by the strncpy below.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7996
/* Advance @dirp past the buffered fragment: either mark the stream at
 * end (rightmost frag) or step to the next fragment, adjusting offset
 * bookkeeping for hash- vs frag-ordered traversal. */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag order: restart the name cursor at the top of the new frag
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
8022
/* Re-map the current fragment through the (possibly newer) dirfragtree;
 * if the tree has since split/merged, restart the cursor at the mapped
 * fragment.  No-op for hash-ordered traversal. */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8039
// Discard the buffered dentries fetched for @dirp's current fragment.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8045
/* Fetch one fragment's worth of entries for @dirp from the MDS
 * (READDIR, or LSSNAP under the snapdir) into dirp->buffer.  Retries
 * transparently on EAGAIN after re-choosing the fragment; on other
 * errors the stream is marked at end. */
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  // resume after the last returned name, or (hash order) at the hash
  // position encoded in the offset
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // frag map changed under us; re-map and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8100
// Comparator for std::lower_bound over Dir::readdir_cache: orders a
// cached Dentry against a raw readdir offset via fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8106
/* Serve readdir entries for @dirp from the locally cached, complete and
 * ordered Dir, invoking @cb per entry (with client_lock dropped around
 * the callback).  Returns 0 at end-of-dir, a positive value if the
 * callback asked to stop, -EAGAIN if the cache became unusable
 * mid-walk (caller falls back to fetching from the MDS), or another
 * negative errno. */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at/after the stream offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // bail out if the dir lost I_COMPLETE/I_DIR_ORDERED while we walked
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock for the callback; dn/dir state must be re-validated after
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8200
/* Core readdir driver: walk directory @d, calling @cb for each entry
 * (synthesizing "." and ".." first).  Entries are served from the
 * local cache when the dir is complete and ordered, otherwise fetched
 * fragment by fragment from the MDS.  @want/@flags are statx-style and
 * control attribute syncing; @getref hands the callback a referenced
 * Inode*.  Returns 0 at end of directory, >0 if the callback stopped
 * the walk, <0 on error. */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock across the user callback
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." (the dir itself when it has no parent)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable mid-walk; fall through to MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // full traversal completed without interference: mark the dir
    // cache-complete so future readdirs can be served locally
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
8397
8398
8399 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8400 {
8401 return readdirplus_r(d, de, 0, 0, 0, NULL);
8402 }
8403
8404 /*
8405 * readdirplus_r
8406 *
8407 * returns
8408 * 1 if we got a dirent
8409 * 0 for end of directory
8410 * <0 on error
8411 */
8412
/* Context for _readdir_single_dirent_cb: captures exactly one
 * directory entry from a readdir_r_cb() walk. */
struct single_readdir {
  struct dirent *de;       // caller-supplied dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // inode of the captured entry
  bool full;               // true once one entry has been captured
};
8419
8420 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8421 struct ceph_statx *stx, off_t off,
8422 Inode *in)
8423 {
8424 single_readdir *c = static_cast<single_readdir *>(p);
8425
8426 if (c->full)
8427 return -1; // already filled this dirent
8428
8429 *c->de = *de;
8430 if (c->stx)
8431 *c->stx = *stx;
8432 c->inode = in;
8433 c->full = true;
8434 return 1;
8435 }
8436
8437 struct dirent *Client::readdir(dir_result_t *d)
8438 {
8439 int ret;
8440 auto& de = d->de;
8441 single_readdir sr;
8442 sr.de = &de;
8443 sr.stx = NULL;
8444 sr.inode = NULL;
8445 sr.full = false;
8446
8447 // our callback fills the dirent and sets sr.full=true on first
8448 // call, and returns -1 the second time around.
8449 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8450 if (ret < -1) {
8451 errno = -ret; // this sucks.
8452 return (dirent *) NULL;
8453 }
8454 if (sr.full) {
8455 return &de;
8456 }
8457 return (dirent *) NULL;
8458 }
8459
8460 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8461 struct ceph_statx *stx, unsigned want,
8462 unsigned flags, Inode **out)
8463 {
8464 single_readdir sr;
8465 sr.de = de;
8466 sr.stx = stx;
8467 sr.inode = NULL;
8468 sr.full = false;
8469
8470 // our callback fills the dirent and sets sr.full=true on first
8471 // call, and returns -1 the second time around.
8472 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8473 if (r < -1)
8474 return r;
8475 if (out)
8476 *out = sr.inode;
8477 if (sr.full)
8478 return 1;
8479 return 0;
8480 }
8481
8482
8483 /* getdents */
/* Buffer-filling state shared between _getdents and
 * _readdir_getdent_cb. */
struct getdents_result {
  char *buf;     // caller-supplied output buffer
  int buflen;    // total buffer capacity in bytes
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole dirents; false: pack NUL-terminated names
};
8490
8491 static int _readdir_getdent_cb(void *p, struct dirent *de,
8492 struct ceph_statx *stx, off_t off, Inode *in)
8493 {
8494 struct getdents_result *c = static_cast<getdents_result *>(p);
8495
8496 int dlen;
8497 if (c->fullent)
8498 dlen = sizeof(*de);
8499 else
8500 dlen = strlen(de->d_name) + 1;
8501
8502 if (c->pos + dlen > c->buflen)
8503 return -1; // doesn't fit
8504
8505 if (c->fullent) {
8506 memcpy(c->buf + c->pos, de, sizeof(*de));
8507 } else {
8508 memcpy(c->buf + c->pos, de->d_name, dlen);
8509 }
8510 c->pos += dlen;
8511 return 0;
8512 }
8513
8514 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8515 {
8516 getdents_result gr;
8517 gr.buf = buf;
8518 gr.buflen = buflen;
8519 gr.fullent = fullent;
8520 gr.pos = 0;
8521
8522 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8523
8524 if (r < 0) { // some error
8525 if (r == -1) { // buffer ran out of space
8526 if (gr.pos) { // but we got some entries already!
8527 return gr.pos;
8528 } // or we need a larger buffer
8529 return -ERANGE;
8530 } else { // actual error, return it
8531 return r;
8532 }
8533 }
8534 return gr.pos;
8535 }
8536
8537
8538 /* getdir */
/* Accumulator for _getdir_cb: collects entry names and a count. */
struct getdir_result {
  list<string> *contents;  // entry names, in readdir order
  int num;                 // number of entries appended
};
8543
8544 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8545 {
8546 getdir_result *r = static_cast<getdir_result *>(p);
8547
8548 r->contents->push_back(de->d_name);
8549 r->num++;
8550 return 0;
8551 }
8552
8553 int Client::getdir(const char *relpath, list<string>& contents,
8554 const UserPerm& perms)
8555 {
8556 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8557 {
8558 std::lock_guard lock(client_lock);
8559 tout(cct) << "getdir" << std::endl;
8560 tout(cct) << relpath << std::endl;
8561 }
8562
8563 dir_result_t *d;
8564 int r = opendir(relpath, &d, perms);
8565 if (r < 0)
8566 return r;
8567
8568 getdir_result gr;
8569 gr.contents = &contents;
8570 gr.num = 0;
8571 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8572
8573 closedir(d);
8574
8575 if (r < 0)
8576 return r;
8577 return gr.num;
8578 }
8579
8580
8581 /****** file i/o **********/
/**
 * Open (and optionally create) a file by path.
 *
 * Translates POSIX flags to wire flags, walks the path, honors
 * O_CREAT/O_EXCL/O_NOFOLLOW (and Linux O_PATH) semantics, performs
 * permission checks when client_permissions is enabled, and finally
 * allocates an integer file descriptor mapped to the new Fh.
 *
 * Returns the new fd on success, or a negative errno.
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  int cflags = ceph_flags_sys2wire(flags);

  ldout(cct, 3) << "open enter(" << relpath << ", " << cflags << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << cflags << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  int r = path_walk(path, &in, perms, followsym, mask);

  // O_CREAT|O_EXCL demands the path not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: walk to the parent
    // directory, check create permission, then create the file
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; otherwise open now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << cflags << ") = " << r << dendl;
  return r;
}
8670
/**
 * Convenience overload of open() that uses the default file striping
 * parameters (stripe unit/count, object size, data pool).
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8676
8677 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8678 const UserPerm& perms)
8679 {
8680 std::lock_guard lock(client_lock);
8681 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8682
8683 if (unmounting)
8684 return -ENOTCONN;
8685
8686 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8687 filepath path(ino);
8688 req->set_filepath(path);
8689
8690 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8691 char f[30];
8692 sprintf(f, "%u", h);
8693 filepath path2(dirino);
8694 path2.push_dentry(string(f));
8695 req->set_filepath2(path2);
8696
8697 int r = make_request(req, perms, NULL, NULL,
8698 rand() % mdsmap->get_num_in_mds());
8699 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8700 return r;
8701 }
8702
8703
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on the
 * resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // ask a random active MDS rank to look the inode up by number
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // on success the inode must now be in inode_map; hand the caller
    // a referenced pointer
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8733
/**
 * Locking wrapper around _lookup_ino(): look up an inode by number,
 * optionally returning a referenced Inode*.
 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8739
8740 /**
8741 * Find the parent inode of `ino` and insert it into
8742 * our cache. Conditionally also set `parent` to a referenced
8743 * Inode* if caller provides non-NULL value.
8744 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  // ask a random active MDS rank for the parent of this inode
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      // take a reference before handing the parent out
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8768
8769 /**
8770 * Populate the parent dentry for `ino`, provided it is
8771 * a child of `parent`.
8772 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // LOOKUPNAME: ask the MDS to populate the dentry linking `ino`
  // under `parent` (filepath2 carries the parent)
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8790
/**
 * Locking wrapper around _lookup_name(): populate the dentry linking
 * `ino` under `parent`.
 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8796
8797 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8798 {
8799 ceph_assert(in);
8800 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
8801
8802 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8803
8804 if (in->snapid != CEPH_NOSNAP) {
8805 in->snap_cap_refs++;
8806 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8807 << ccap_string(in->caps_issued()) << dendl;
8808 }
8809
8810 const auto& conf = cct->_conf;
8811 f->readahead.set_trigger_requests(1);
8812 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8813 uint64_t max_readahead = Readahead::NO_LIMIT;
8814 if (conf->client_readahead_max_bytes) {
8815 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8816 }
8817 if (conf->client_readahead_max_periods) {
8818 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8819 }
8820 f->readahead.set_max_readahead_size(max_readahead);
8821 vector<uint64_t> alignments;
8822 alignments.push_back(in->layout.get_period());
8823 alignments.push_back(in->layout.stripe_unit);
8824 f->readahead.set_alignments(alignments);
8825
8826 return f;
8827 }
8828
/**
 * Tear down a file handle: drop its delegation, release open/cap
 * references, flush dirty data if this was the last opener for the
 * mode, release file locks, and surface any asynchronous write error
 * recorded on the handle.
 *
 * Returns 0, or the stored async error (negative errno).
 */
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref for this mode: flush dirty data and re-evaluate
      // which caps we still want
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes only track a snap cap refcount
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8863
8864 void Client::_put_fh(Fh *f)
8865 {
8866 int left = f->put();
8867 if (!left) {
8868 delete f;
8869 }
8870 }
8871
/**
 * Open an inode: take an open ref for the computed ceph open mode,
 * and either satisfy the open from already-issued caps or perform an
 * MDS open request. On success, wrap the result in a new Fh via
 * _create_fh() (if fhp is non-NULL).
 *
 * Returns 0 or a negative errno; on failure the open ref is dropped.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only: refuse any write-implying flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this mode wants; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    // need an MDS open (missing caps, or O_TRUNC must be performed)
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;  // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary stack Fh, used only to wait on the needed caps
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8952
/**
 * Re-establish file caps for an inode by replaying an MDS open with
 * flags derived from the caps the inode's open modes currently want.
 * If existing caps suffice, only nudges cap wants via check_caps().
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    // no re-open needed; just update wanted caps with the MDS
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // derive open flags from the wanted file caps
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8990
/**
 * POSIX close: release the handle behind `fd` (which may flush dirty
 * data and surface an async write error), then retire the fd number.
 *
 * Returns 0 or the handle's stored async error (negative errno).
 */
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  // release the Fh first, then make the fd number reusable
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
9010
9011
9012 // ------------
9013 // read, write
9014
/**
 * POSIX lseek on an integer fd: validate the handle, then defer to
 * _lseek() under the client lock. Returns the new position or a
 * negative errno.
 */
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles do not permit I/O-style operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
9035
/**
 * Core seek implementation. Whence values that depend on the current
 * file size (SEEK_END, SEEK_DATA, SEEK_HOLE) first refresh the size
 * from the MDS. SEEK_DATA/SEEK_HOLE report the entire file as one
 * data extent with the only hole at EOF.
 *
 * Returns the new file position or a negative errno.
 */
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // these whence modes need an up-to-date file size
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // the whole file is reported as data
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // the only reported "hole" is at EOF
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9109
9110
/**
 * Serialize access to f->pos. If the position is already locked (or
 * other waiters are queued), enqueue a condvar and block until we are
 * the front of the FIFO and the lock is free. Must be called with
 * client_lock held; client_lock is released while waiting.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // adopt the already-held client_lock so cond.wait() can drop and
    // re-take it
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // proceed only when unlocked AND we are the oldest waiter
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // hand client_lock ownership back to the caller without unlocking
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9131
/**
 * Release the f->pos lock taken by lock_fh_pos() and wake the oldest
 * waiter, if any. Caller must hold client_lock.
 */
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
9144
/**
 * Migrate a file's inline data out to its first RADOS object.
 *
 * Issues two object mutations: one to (non-exclusively) create the
 * object, then a second, guarded by a cmpxattr on "inline_version",
 * that writes the inline payload and records the version. `onfinish`
 * completes when the second mutation does (or immediately with 0 if
 * there is no inline data).
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // name of the file's first object: <ino in hex>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard the write on the object's recorded inline_version so a
  // stale migration cannot clobber newer data
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9189
9190 //
9191
9192 // blocking osd interface
9193
/**
 * POSIX pread-like entry point: read up to `size` bytes at `offset`
 * (a negative offset means the current file position — see _read())
 * into `buf`. Returns bytes read or a negative errno.
 */
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::unique_lock lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out to the caller's buffer without holding client_lock
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9224
/**
 * Vectored positional read. Returns bytes read or a negative errno.
 */
int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
{
  // a negative iovec count is invalid
  if (iovcnt < 0)
    return -EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
}
9231
/**
 * Core read path. Handles:
 *  - implicit file-position reads (offset < 0 uses, and on success
 *    advances, f->pos under the fh pos lock),
 *  - inline data (served from in->inline_data when we hold Fc,
 *    otherwise migrated out first via uninline_data()),
 *  - cached reads through the ObjectCacher vs. sync OSD reads,
 *  - retry of short sync reads after re-verifying the file size.
 *
 * Returns bytes read or a negative errno. Caller holds client_lock;
 * it is dropped while waiting on I/O.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // read from (and later advance) the shared file position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // fetch inline-data state before deciding how to serve the read
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the cache even if we hold cache caps
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // without Fc, push the inline data out to RADOS and fall
      // through to a normal read; completion is awaited in `done`
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read straight from the inline payload, zero-filling
      // between the inline length and EOF
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        rc = endoff - offset;
      } else {
        rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // buffered read through the object cacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps and re-verify the size before deciding
      // whether this was really EOF
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait for the inline-data migration issued above to finish
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
9379
// Pin the Fh and mark a readahead as pending for the lifetime of
// this completion context.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
  client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9385
Client::C_Readahead::~C_Readahead() {
  // balance the pending count and the Fh reference taken in the ctor
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9390
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // drop the cap refs taken when the readahead was issued
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9395
/**
 * Buffered read through the ObjectCacher, clamped to the current file
 * size, followed by opportunistic readahead. Blocks (dropping
 * client_lock) when the requested range is not yet cached.
 *
 * Returns bytes read or a negative errno.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the cacher to fill, pinning Fc meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight: C_Readahead::finish releases these refs
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9450
/**
 * Synchronous read directly through the Filer/OSDs, bypassing the
 * object cacher. Loops until the request is satisfied, zero-filling
 * holes up to the known EOF; on an (apparent) EOF short read it sets
 * *checkeof so the caller can re-verify the file size and retry.
 *
 * Returns bytes placed in *bl or a negative errno.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // drop the client lock while the OSD read is in flight
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // reached apparent EOF; let the caller re-check the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9510
9511
9512 /*
9513 * we keep count of uncommitted sync writes on the inode, so that
9514 * fsync can DDRT.
9515 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  // the sync write pinned the buffer cap; release it now that the
  // write has committed
  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9529
9530 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9531 {
9532 std::lock_guard lock(client_lock);
9533 tout(cct) << "write" << std::endl;
9534 tout(cct) << fd << std::endl;
9535 tout(cct) << size << std::endl;
9536 tout(cct) << offset << std::endl;
9537
9538 if (unmounting)
9539 return -ENOTCONN;
9540
9541 Fh *fh = get_filehandle(fd);
9542 if (!fh)
9543 return -EBADF;
9544 #if defined(__linux__) && defined(O_PATH)
9545 if (fh->flags & O_PATH)
9546 return -EBADF;
9547 #endif
9548 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9549 size = std::min(size, (loff_t)INT_MAX);
9550 int r = _write(fh, offset, size, buf, NULL, false);
9551 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9552 return r;
9553 }
9554
9555 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9556 {
9557 if (iovcnt < 0)
9558 return -EINVAL;
9559 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9560 }
9561
9562 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9563 unsigned iovcnt, int64_t offset, bool write,
9564 bool clamp_to_int)
9565 {
9566 #if defined(__linux__) && defined(O_PATH)
9567 if (fh->flags & O_PATH)
9568 return -EBADF;
9569 #endif
9570 loff_t totallen = 0;
9571 for (unsigned i = 0; i < iovcnt; i++) {
9572 totallen += iov[i].iov_len;
9573 }
9574
9575 /*
9576 * Some of the API functions take 64-bit size values, but only return
9577 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9578 * we don't do I/Os larger than the values we can return.
9579 */
9580 if (clamp_to_int) {
9581 totallen = std::min(totallen, (loff_t)INT_MAX);
9582 }
9583 if (write) {
9584 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9585 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9586 return w;
9587 } else {
9588 bufferlist bl;
9589 int64_t r = _read(fh, offset, totallen, &bl);
9590 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9591 if (r <= 0)
9592 return r;
9593
9594 auto iter = bl.cbegin();
9595 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9596 /*
9597 * This piece of code aims to handle the case that bufferlist does not have enough data
9598 * to fill in the iov
9599 */
9600 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
9601 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
9602 resid -= round_size;
9603 /* iter is self-updating */
9604 }
9605 return r;
9606 }
9607 }
9608
9609 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9610 {
9611 std::lock_guard lock(client_lock);
9612 tout(cct) << fd << std::endl;
9613 tout(cct) << offset << std::endl;
9614
9615 if (unmounting)
9616 return -ENOTCONN;
9617
9618 Fh *fh = get_filehandle(fd);
9619 if (!fh)
9620 return -EBADF;
9621 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9622 }
9623
/*
 * Core write path.  Writes 'size' bytes at 'offset' (or at the fd position
 * when offset < 0, honoring O_APPEND) taken from either 'buf' or the
 * iovec array 'iov'/'iovcnt'.
 *
 * Chooses between three paths: updating inline data in place, a buffered
 * write through the ObjectCacher (when client_oc is enabled and the
 * BUFFER/LAZYIO caps are held), or a synchronous Filer write.  On success
 * it updates size/mtime/ctime and marks FILE_WR caps dirty.
 *
 * Returns bytes written or a negative error code.  Called with client_lock
 * held; the lock is dropped while waiting for sync writes and for inline
 * data uninlining.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	                const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // remember where the fd position should land after a successful write
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // make sure we know whether the file has inline data before proceeding
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // blocks until FILE_WR (and AUTH_SHARED, for the sguid check below) is held
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the cache even when the caps would allow buffering
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result would no longer fit inline (or we can't buffer) -- kick off
      // uninlining; completion is awaited at 'done'
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // small write into inline data: splice the new bytes in locally
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    // make sure cached data doesn't shadow the direct write
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // drop client_lock while blocked on the OSDs
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  // wait for any uninline started above; success (or ECANCELED, meaning
  // someone else uninlined) lets us drop the local inline copy
  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  // matches the FILE_WR reference taken by get_caps()
  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9856
9857 int Client::_flush(Fh *f)
9858 {
9859 Inode *in = f->inode.get();
9860 int err = f->take_async_err();
9861 if (err != 0) {
9862 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9863 << cpp_strerror(err) << dendl;
9864 } else {
9865 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9866 }
9867
9868 return err;
9869 }
9870
9871 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9872 {
9873 struct ceph_statx stx;
9874 stx.stx_size = length;
9875 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9876 }
9877
9878 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9879 {
9880 std::lock_guard lock(client_lock);
9881 tout(cct) << __func__ << std::endl;
9882 tout(cct) << fd << std::endl;
9883 tout(cct) << length << std::endl;
9884
9885 if (unmounting)
9886 return -ENOTCONN;
9887
9888 Fh *f = get_filehandle(fd);
9889 if (!f)
9890 return -EBADF;
9891 #if defined(__linux__) && defined(O_PATH)
9892 if (f->flags & O_PATH)
9893 return -EBADF;
9894 #endif
9895 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9896 return -EBADF;
9897 struct stat attr;
9898 attr.st_size = length;
9899 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9900 }
9901
9902 int Client::fsync(int fd, bool syncdataonly)
9903 {
9904 std::lock_guard lock(client_lock);
9905 tout(cct) << "fsync" << std::endl;
9906 tout(cct) << fd << std::endl;
9907 tout(cct) << syncdataonly << std::endl;
9908
9909 if (unmounting)
9910 return -ENOTCONN;
9911
9912 Fh *f = get_filehandle(fd);
9913 if (!f)
9914 return -EBADF;
9915 #if defined(__linux__) && defined(O_PATH)
9916 if (f->flags & O_PATH)
9917 return -EBADF;
9918 #endif
9919 int r = _fsync(f, syncdataonly);
9920 if (r == 0) {
9921 // The IOs in this fsync were okay, but maybe something happened
9922 // in the background that we shoudl be reporting?
9923 r = f->take_async_err();
9924 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9925 << ") = 0, async_err = " << r << dendl;
9926 } else {
9927 // Assume that an error we encountered during fsync, even reported
9928 // synchronously, would also have applied the error to the Fh, and we
9929 // should clear it here to avoid returning the same error again on next
9930 // call.
9931 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9932 << r << dendl;
9933 f->take_async_err();
9934 }
9935 return r;
9936 }
9937
/*
 * Flush an inode's dirty state to stable storage.
 *
 * With client_oc enabled, kicks a cache flush and waits for its real
 * completion; otherwise it waits for outstanding FILE_BUFFER references to
 * drain.  Unless syncdataonly is set, dirty caps are flushed to the MDS
 * and any unsafe (unacked) metadata requests on the inode are waited out.
 *
 * Returns 0 or the (negative) flush error.  Called with client_lock held;
 * the lock is dropped while waiting on the cache flush.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  // flush dirty metadata (caps) unless only data was requested
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  // wait until the newest unsafe metadata request on this inode is safe
  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // data flushed fine; now wait for the cap flush (if any) to be acked
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10004
10005 int Client::_fsync(Fh *f, bool syncdataonly)
10006 {
10007 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10008 return _fsync(f->inode.get(), syncdataonly);
10009 }
10010
10011 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10012 {
10013 std::lock_guard lock(client_lock);
10014 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10015 tout(cct) << fd << std::endl;
10016
10017 if (unmounting)
10018 return -ENOTCONN;
10019
10020 Fh *f = get_filehandle(fd);
10021 if (!f)
10022 return -EBADF;
10023 int r = _getattr(f->inode, mask, perms);
10024 if (r < 0)
10025 return r;
10026 fill_stat(f->inode, stbuf, NULL);
10027 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10028 return r;
10029 }
10030
10031 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10032 unsigned int want, unsigned int flags)
10033 {
10034 std::lock_guard lock(client_lock);
10035 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10036 tout(cct) << fd << std::endl;
10037
10038 if (unmounting)
10039 return -ENOTCONN;
10040
10041 Fh *f = get_filehandle(fd);
10042 if (!f)
10043 return -EBADF;
10044
10045 unsigned mask = statx_to_mask(flags, want);
10046
10047 int r = 0;
10048 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10049 r = _getattr(f->inode, mask, perms);
10050 if (r < 0) {
10051 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10052 return r;
10053 }
10054 }
10055
10056 fill_statx(f->inode, mask, stx);
10057 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10058 return r;
10059 }
10060
10061 // not written yet, but i want to link!
10062
10063 int Client::chdir(const char *relpath, std::string &new_cwd,
10064 const UserPerm& perms)
10065 {
10066 std::lock_guard lock(client_lock);
10067 tout(cct) << "chdir" << std::endl;
10068 tout(cct) << relpath << std::endl;
10069
10070 if (unmounting)
10071 return -ENOTCONN;
10072
10073 filepath path(relpath);
10074 InodeRef in;
10075 int r = path_walk(path, &in, perms);
10076 if (r < 0)
10077 return r;
10078
10079 if (!(in.get()->is_dir()))
10080 return -ENOTDIR;
10081
10082 if (cwd != in)
10083 cwd.swap(in);
10084 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10085
10086 _getcwd(new_cwd, perms);
10087 return 0;
10088 }
10089
/*
 * Build the absolute path of the current working directory into 'dir' by
 * walking dentries from cwd up to root.
 *
 * If some ancestor has no cached parent dentry, a LOOKUPNAME request is
 * issued to the MDS and the walk restarts from cwd.  If the cwd or an
 * ancestor has been unlinked (no dentries at all), 'dir' is left
 * unmodified.  Caller holds client_lock.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // The cwd or an ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    // prepend this component and step up to the parent directory
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10129
10130 void Client::getcwd(string& dir, const UserPerm& perms)
10131 {
10132 std::lock_guard l(client_lock);
10133 if (!unmounting)
10134 _getcwd(dir, perms);
10135 }
10136
10137 int Client::statfs(const char *path, struct statvfs *stbuf,
10138 const UserPerm& perms)
10139 {
10140 std::lock_guard l(client_lock);
10141 tout(cct) << __func__ << std::endl;
10142 unsigned long int total_files_on_fs;
10143
10144 if (unmounting)
10145 return -ENOTCONN;
10146
10147 ceph_statfs stats;
10148 C_SaferCond cond;
10149
10150 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10151 if (data_pools.size() == 1) {
10152 objecter->get_fs_stats(stats, data_pools[0], &cond);
10153 } else {
10154 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10155 }
10156
10157 client_lock.unlock();
10158 int rval = cond.wait();
10159 assert(root);
10160 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10161 client_lock.lock();
10162
10163 if (rval < 0) {
10164 ldout(cct, 1) << "underlying call to statfs returned error: "
10165 << cpp_strerror(rval)
10166 << dendl;
10167 return rval;
10168 }
10169
10170 memset(stbuf, 0, sizeof(*stbuf));
10171
10172 /*
10173 * we're going to set a block size of 4MB so we can represent larger
10174 * FSes without overflowing. Additionally convert the space
10175 * measurements from KB to bytes while making them in terms of
10176 * blocks. We use 4MB only because it is big enough, and because it
10177 * actually *is* the (ceph) default block size.
10178 */
10179 const int CEPH_BLOCK_SHIFT = 22;
10180 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10181 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10182 stbuf->f_files = total_files_on_fs;
10183 stbuf->f_ffree = 0;
10184 stbuf->f_favail = -1;
10185 stbuf->f_fsid = -1; // ??
10186 stbuf->f_flag = 0; // ??
10187 stbuf->f_namemax = NAME_MAX;
10188
10189 // Usually quota_root will == root_ancestor, but if the mount root has no
10190 // quota but we can see a parent of it that does have a quota, we'll
10191 // respect that one instead.
10192 ceph_assert(root != nullptr);
10193 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10194
10195 // get_quota_root should always give us something
10196 // because client quotas are always enabled
10197 ceph_assert(quota_root != nullptr);
10198
10199 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10200
10201 // Skip the getattr if any sessions are stale, as we don't want to
10202 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10203 // is unhealthy.
10204 if (!_any_stale_sessions()) {
10205 int r = _getattr(quota_root, 0, perms, true);
10206 if (r != 0) {
10207 // Ignore return value: error getting latest inode metadata is not a good
10208 // reason to break "df".
10209 lderr(cct) << "Error in getattr on quota root 0x"
10210 << std::hex << quota_root->ino << std::dec
10211 << " statfs result may be outdated" << dendl;
10212 }
10213 }
10214
10215 // Special case: if there is a size quota set on the Inode acting
10216 // as the root for this client mount, then report the quota status
10217 // as the filesystem statistics.
10218 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10219 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10220 // It is possible for a quota to be exceeded: arithmetic here must
10221 // handle case where used > total.
10222 const fsblkcnt_t free = total > used ? total - used : 0;
10223
10224 stbuf->f_blocks = total;
10225 stbuf->f_bfree = free;
10226 stbuf->f_bavail = free;
10227 } else {
10228 // General case: report the cluster statistics returned from RADOS. Because
10229 // multiple pools may be used without one filesystem namespace via
10230 // layouts, this is the most correct thing we can do.
10231 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10232 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10233 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10234 }
10235
10236 return rval;
10237 }
10238
/*
 * Issue a file-lock operation (GETFILELOCK or SETFILELOCK, fcntl- or
 * flock-style per 'lock_type') to the MDS.
 *
 * 'sleep' requests blocking behavior; while blocked, an interrupt callback
 * (if registered) can cancel the request via _interrupt_filelock().  On a
 * successful SETFILELOCK the local lock state on the inode (and, unless
 * 'removing', on the Fh) is updated to mirror the MDS.  On GETFILELOCK the
 * conflicting lock (or F_UNLCK) is decoded back into *fl.
 *
 * Returns 0 or a negative error code.  Caller holds client_lock.
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // lock state on this inode is untrusted (e.g. after session loss)
  if (in->flags & I_ERROR_FILELOCK)
    return -EIO;

  // translate the POSIX lock type to the MDS lock command
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK of an actual lock can sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    // drop the extra reference taken for the interrupt callback
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting lock (or F_UNLCK) back into *fl
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the successful change into the inode's local lock state
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // also track the lock on the Fh so it can be released at close time,
      // unless this call *is* the release
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10352
/*
 * Interrupt a blocked file-lock request 'req'.
 *
 * Marks the request aborted with -EINTR so it won't be re-sent; if it has
 * already been sent to an MDS, also issues a companion SETFILELOCK with
 * the corresponding *_INTR rule to cancel the pending lock server-side.
 *
 * Returns 0 or the result of sending the interrupt request.
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // map the original lock rule to its interrupt counterpart
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // the interrupt request mirrors the original, but unlocks under the
  // *_INTR rule
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // act with the credentials of the original locker
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10385
10386 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10387 {
10388 if (!in->fcntl_locks && !in->flock_locks)
10389 return;
10390
10391 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10392 encode(nr_fcntl_locks, bl);
10393 if (nr_fcntl_locks) {
10394 auto &lock_state = in->fcntl_locks;
10395 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10396 p != lock_state->held_locks.end();
10397 ++p)
10398 encode(p->second, bl);
10399 }
10400
10401 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10402 encode(nr_flock_locks, bl);
10403 if (nr_flock_locks) {
10404 auto &lock_state = in->flock_locks;
10405 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10406 p != lock_state->held_locks.end();
10407 ++p)
10408 encode(p->second, bl);
10409 }
10410
10411 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10412 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10413 }
10414
/*
 * Drop all fcntl and flock locks recorded on an Fh when the handle is
 * closed.
 *
 * The Fh-side lock state is always discarded.  For each lock, unless the
 * inode is in I_ERROR_FILELOCK state (lock state untrusted, e.g. after
 * session loss), an unlock is also sent to the MDS; in the error case the
 * lock is only removed from the inode's local state.
 */
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // q = p++ so removal of *q can't invalidate the loop iterator
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // once no locks remain anywhere on the inode, the error state can clear
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    // removing=true: tell the MDS, but don't re-record state on the Fh
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
10473
10474 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10475 ceph_lock_state_t *lock_state)
10476 {
10477 int lock_cmd;
10478 if (F_RDLCK == fl->l_type)
10479 lock_cmd = CEPH_LOCK_SHARED;
10480 else if (F_WRLCK == fl->l_type)
10481 lock_cmd = CEPH_LOCK_EXCL;
10482 else
10483 lock_cmd = CEPH_LOCK_UNLOCK;;
10484
10485 ceph_filelock filelock;
10486 filelock.start = fl->l_start;
10487 filelock.length = fl->l_len;
10488 filelock.client = 0;
10489 // see comment in _do_filelock()
10490 filelock.owner = owner | (1ULL << 63);
10491 filelock.pid = fl->l_pid;
10492 filelock.type = lock_cmd;
10493
10494 if (filelock.type == CEPH_LOCK_UNLOCK) {
10495 list<ceph_filelock> activated_locks;
10496 lock_state->remove_lock(filelock, activated_locks);
10497 } else {
10498 bool r = lock_state->add_lock(filelock, false, false, NULL);
10499 ceph_assert(r);
10500 }
10501 }
10502
10503 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10504 {
10505 Inode *in = fh->inode.get();
10506 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10507 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10508 return ret;
10509 }
10510
10511 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10512 {
10513 Inode *in = fh->inode.get();
10514 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10515 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10516 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10517 return ret;
10518 }
10519
10520 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10521 {
10522 Inode *in = fh->inode.get();
10523 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10524
10525 int sleep = !(cmd & LOCK_NB);
10526 cmd &= ~LOCK_NB;
10527
10528 int type;
10529 switch (cmd) {
10530 case LOCK_SH:
10531 type = F_RDLCK;
10532 break;
10533 case LOCK_EX:
10534 type = F_WRLCK;
10535 break;
10536 case LOCK_UN:
10537 type = F_UNLCK;
10538 break;
10539 default:
10540 return -EINVAL;
10541 }
10542
10543 struct flock fl;
10544 memset(&fl, 0, sizeof(fl));
10545 fl.l_type = type;
10546 fl.l_whence = SEEK_SET;
10547
10548 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10549 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10550 return ret;
10551 }
10552
10553 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10554 {
10555 /* Since the only thing this does is wrap a call to statfs, and
10556 statfs takes a lock, it doesn't seem we have a need to split it
10557 out. */
10558 return statfs(0, stbuf, perms);
10559 }
10560
/*
 * Install the caller-supplied callback table (used by the FUSE /
 * libcephfs layers) and start the finisher threads that deliver those
 * callbacks asynchronously.
 *
 * Each finisher is started only when its corresponding callback was
 * actually provided, so unused machinery stays idle.  No finisher is
 * started for umask_cb.
 */
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
10596
10597 int Client::test_dentry_handling(bool can_invalidate)
10598 {
10599 int r = 0;
10600
10601 can_invalidate_dentries = can_invalidate;
10602
10603 if (can_invalidate_dentries) {
10604 ceph_assert(dentry_invalidate_cb);
10605 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10606 r = 0;
10607 } else {
10608 ceph_assert(remount_cb);
10609 ldout(cct, 1) << "using remount_cb" << dendl;
10610 r = _do_remount(false);
10611 }
10612
10613 return r;
10614 }
10615
/*
 * Flush all dirty file data and caps to the cluster and wait for them
 * to be acknowledged.
 *
 * Called with client_lock held; the lock is dropped only while waiting
 * on the object-cacher data flush, then reacquired.  Always returns 0.
 */
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // capture the tid now so later, unrelated flushes don't delay us
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocking so message processing (which the
    // flush completion depends on) can make progress
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
10646
10647 int Client::sync_fs()
10648 {
10649 std::lock_guard l(client_lock);
10650
10651 if (unmounting)
10652 return -ENOTCONN;
10653
10654 return _sync_fs();
10655 }
10656
10657 int64_t Client::drop_caches()
10658 {
10659 std::lock_guard l(client_lock);
10660 return objectcacher->release_all();
10661 }
10662
10663 int Client::_lazyio(Fh *fh, int enable)
10664 {
10665 Inode *in = fh->inode.get();
10666 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10667
10668 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10669 return 0;
10670
10671 int orig_mode = fh->mode;
10672 if (enable) {
10673 fh->mode |= CEPH_FILE_MODE_LAZY;
10674 in->get_open_ref(fh->mode);
10675 in->put_open_ref(orig_mode);
10676 check_caps(in, CHECK_CAPS_NODELAY);
10677 } else {
10678 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10679 in->get_open_ref(fh->mode);
10680 in->put_open_ref(orig_mode);
10681 check_caps(in, 0);
10682 }
10683
10684 return 0;
10685 }
10686
10687 int Client::lazyio(int fd, int enable)
10688 {
10689 std::lock_guard l(client_lock);
10690 Fh *f = get_filehandle(fd);
10691 if (!f)
10692 return -EBADF;
10693
10694 return _lazyio(f, enable);
10695 }
10696
10697 int Client::ll_lazyio(Fh *fh, int enable)
10698 {
10699 std::lock_guard lock(client_lock);
10700 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10701 tout(cct) << __func__ << std::endl;
10702
10703 return _lazyio(fh, enable);
10704 }
10705
10706 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10707 {
10708 std::lock_guard l(client_lock);
10709 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10710 << ", " << offset << ", " << count << ")" << dendl;
10711
10712 Fh *f = get_filehandle(fd);
10713 if (!f)
10714 return -EBADF;
10715
10716 // for now
10717 _fsync(f, true);
10718
10719 return 0;
10720 }
10721
10722 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10723 {
10724 std::lock_guard l(client_lock);
10725 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10726 << ", " << offset << ", " << count << ")" << dendl;
10727
10728 Fh *f = get_filehandle(fd);
10729 if (!f)
10730 return -EBADF;
10731 Inode *in = f->inode.get();
10732
10733 _fsync(f, true);
10734 if (_release(in)) {
10735 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10736 if (r < 0)
10737 return r;
10738 }
10739 return 0;
10740 }
10741
10742
10743 // =============================
10744 // snaps
10745
10746 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10747 {
10748 std::lock_guard l(client_lock);
10749
10750 if (unmounting)
10751 return -ENOTCONN;
10752
10753 filepath path(relpath);
10754 InodeRef in;
10755 int r = path_walk(path, &in, perm);
10756 if (r < 0)
10757 return r;
10758 if (cct->_conf->client_permissions) {
10759 r = may_create(in.get(), perm);
10760 if (r < 0)
10761 return r;
10762 }
10763 Inode *snapdir = open_snapdir(in.get());
10764 return _mkdir(snapdir, name, 0, perm);
10765 }
10766
10767 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10768 {
10769 std::lock_guard l(client_lock);
10770
10771 if (unmounting)
10772 return -ENOTCONN;
10773
10774 filepath path(relpath);
10775 InodeRef in;
10776 int r = path_walk(path, &in, perms);
10777 if (r < 0)
10778 return r;
10779 if (cct->_conf->client_permissions) {
10780 r = may_delete(in.get(), NULL, perms);
10781 if (r < 0)
10782 return r;
10783 }
10784 Inode *snapdir = open_snapdir(in.get());
10785 return _rmdir(snapdir, name, perms);
10786 }
10787
10788 // =============================
10789 // expose caps
10790
10791 int Client::get_caps_issued(int fd) {
10792
10793 std::lock_guard lock(client_lock);
10794
10795 if (unmounting)
10796 return -ENOTCONN;
10797
10798 Fh *f = get_filehandle(fd);
10799 if (!f)
10800 return -EBADF;
10801
10802 return f->inode->caps_issued();
10803 }
10804
10805 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10806 {
10807 std::lock_guard lock(client_lock);
10808
10809 if (unmounting)
10810 return -ENOTCONN;
10811
10812 filepath p(path);
10813 InodeRef in;
10814 int r = path_walk(p, &in, perms, true);
10815 if (r < 0)
10816 return r;
10817 return in->caps_issued();
10818 }
10819
10820 // =========================================
10821 // low level
10822
/*
 * Return the ".snap" pseudo-directory inode for 'diri', creating and
 * caching it in inode_map on first use.
 *
 * The snapdir shares the parent's ino but is keyed with snapid
 * CEPH_SNAPDIR, and mirrors most of the parent's attributes.
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Clone the parent's identity/attributes into the pseudo-inode.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->atime = diri->atime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // mark the parent so it knows a snapdir inode exists for it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10856
/*
 * Low-level lookup of 'name' under 'parent'.
 *
 * On success fills *attr, takes an ll reference on the result (released
 * later via ll_forget/ll_put) and stores it in *out.  On failure
 * attr->st_ino is zeroed, *out is NULL, and the error is returned.
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // skip the permission check for "." and ".."
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // on error 'in' is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
10898
/*
 * Look up an inode by bare ino and make sure it is linked into the
 * dentry cache (so a path can be reconstructed for it).
 *
 * Three steps: (1) fetch the inode itself, (2) fetch its parent,
 * (3) look up the inode's name under that parent.  The inode reference
 * acquired in step 1 is handed to the caller on success; on any later
 * failure it is dropped again via _ll_forget().
 */
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // undo the ref taken in step 1
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // the parent ref was only needed for the name lookup
  _ll_forget(parent, 1);
  return 0;
}
10950
/*
 * statx-flavoured variant of ll_lookup(): look up 'name' under
 * 'parent', filling *stx with the attribute mask derived from
 * want/flags.  On success an ll reference is taken on the result and
 * returned via *out; on failure stx is zeroed and *out is NULL.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  // translate statx want/flags into a cap mask for the lookup
  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10991
10992 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10993 unsigned int want, unsigned int flags, const UserPerm& perms)
10994 {
10995 std::lock_guard lock(client_lock);
10996
10997 if (unmounting)
10998 return -ENOTCONN;
10999
11000 filepath fp(name, 0);
11001 InodeRef in;
11002 int rc;
11003 unsigned mask = statx_to_mask(flags, want);
11004
11005 ldout(cct, 3) << __func__ << " " << name << dendl;
11006 tout(cct) << __func__ << std::endl;
11007 tout(cct) << name << std::endl;
11008
11009 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11010 if (rc < 0) {
11011 /* zero out mask, just in case... */
11012 stx->stx_mask = 0;
11013 stx->stx_ino = 0;
11014 *out = NULL;
11015 return rc;
11016 } else {
11017 ceph_assert(in);
11018 fill_statx(in, mask, stx);
11019 _ll_get(in.get());
11020 *out = in.get();
11021 return 0;
11022 }
11023 }
11024
/*
 * Take one low-level (ll) reference on 'in'.
 *
 * On the 0 -> 1 transition we additionally pin the inode itself, pin
 * its parent dentry (directories only), and bump the per-snapshot
 * reference count in ll_snap_ref.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11039
/*
 * Drop 'num' low-level references from 'in'.
 *
 * When the count reaches zero this undoes everything _ll_get() did on
 * the 0 -> 1 transition: unpins the parent dentry (directories),
 * decrements/erases the ll_snap_ref entry, and releases the inode pin.
 *
 * Returns 0 when the last ll reference was dropped, otherwise the
 * remaining reference count.
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11062
/*
 * Drop every outstanding low-level reference (used during teardown).
 *
 * 'next' is captured before calling _ll_put() because dropping the
 * last ll ref can erase the current entry from inode_map.  to_be_put
 * additionally holds an InodeRef on each affected inode so that inode
 * destruction is deferred until this set is deconstructed on exit.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11080
/*
 * Drop 'count' low-level references from 'in' (FUSE "forget").
 *
 * Returns true when the last ll reference was dropped (or when the
 * forget was ignored entirely: during unmount, or for the root inode).
 * A count larger than the current ll_ref is clamped with a warning.
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // caller forgot more refs than we handed out; drop what we have
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11109
11110 bool Client::ll_forget(Inode *in, uint64_t count)
11111 {
11112 std::lock_guard lock(client_lock);
11113 return _ll_forget(in, count);
11114 }
11115
11116 bool Client::ll_put(Inode *in)
11117 {
11118 /* ll_forget already takes the lock */
11119 return ll_forget(in, 1);
11120 }
11121
11122 int Client::ll_get_snap_ref(snapid_t snap)
11123 {
11124 std::lock_guard lock(client_lock);
11125 auto p = ll_snap_ref.find(snap);
11126 if (p != ll_snap_ref.end())
11127 return p->second;
11128 return 0;
11129 }
11130
11131 snapid_t Client::ll_get_snapid(Inode *in)
11132 {
11133 std::lock_guard lock(client_lock);
11134 return in->snapid;
11135 }
11136
11137 Inode *Client::ll_get_inode(ino_t ino)
11138 {
11139 std::lock_guard lock(client_lock);
11140
11141 if (unmounting)
11142 return NULL;
11143
11144 vinodeno_t vino = _map_faked_ino(ino);
11145 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11146 if (p == inode_map.end())
11147 return NULL;
11148 Inode *in = p->second;
11149 _ll_get(in);
11150 return in;
11151 }
11152
11153 Inode *Client::ll_get_inode(vinodeno_t vino)
11154 {
11155 std::lock_guard lock(client_lock);
11156
11157 if (unmounting)
11158 return NULL;
11159
11160 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11161 if (p == inode_map.end())
11162 return NULL;
11163 Inode *in = p->second;
11164 _ll_get(in);
11165 return in;
11166 }
11167
11168 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11169 {
11170 vinodeno_t vino = _get_vino(in);
11171
11172 ldout(cct, 8) << __func__ << " " << vino << dendl;
11173 tout(cct) << __func__ << std::endl;
11174 tout(cct) << vino.ino.val << std::endl;
11175
11176 if (vino.snapid < CEPH_NOSNAP)
11177 return 0;
11178 else
11179 return _getattr(in, caps, perms);
11180 }
11181
11182 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11183 {
11184 std::lock_guard lock(client_lock);
11185
11186 if (unmounting)
11187 return -ENOTCONN;
11188
11189 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11190
11191 if (res == 0)
11192 fill_stat(in, attr);
11193 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11194 return res;
11195 }
11196
11197 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11198 unsigned int flags, const UserPerm& perms)
11199 {
11200 std::lock_guard lock(client_lock);
11201
11202 if (unmounting)
11203 return -ENOTCONN;
11204
11205 int res = 0;
11206 unsigned mask = statx_to_mask(flags, want);
11207
11208 if (mask && !in->caps_issued_mask(mask, true))
11209 res = _ll_getattr(in, mask, perms);
11210
11211 if (res == 0)
11212 fill_statx(in, mask, stx);
11213 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11214 return res;
11215 }
11216
/*
 * Common setattr worker for the ll_setattr*/ /* entry points.
 *
 * Performs the (optional) permission check, strips the *_NOW
 * convenience bits, and delegates to __setattrx().  The updated inode
 * is returned via *inp.  The tout lines below emit a fixed-order
 * replay trace and must stay in this order.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // the *_NOW bits were only inputs to may_setattr(); drop them here
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11245
11246 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11247 const UserPerm& perms)
11248 {
11249 std::lock_guard lock(client_lock);
11250
11251 if (unmounting)
11252 return -ENOTCONN;
11253
11254 InodeRef target(in);
11255 int res = _ll_setattrx(in, stx, mask, perms, &target);
11256 if (res == 0) {
11257 ceph_assert(in == target.get());
11258 fill_statx(in, in->caps_issued(), stx);
11259 }
11260
11261 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11262 return res;
11263 }
11264
11265 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11266 const UserPerm& perms)
11267 {
11268 struct ceph_statx stx;
11269 stat_to_statx(attr, &stx);
11270
11271 std::lock_guard lock(client_lock);
11272
11273 if (unmounting)
11274 return -ENOTCONN;
11275
11276 InodeRef target(in);
11277 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11278 if (res == 0) {
11279 ceph_assert(in == target.get());
11280 fill_stat(in, attr);
11281 }
11282
11283 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11284 return res;
11285 }
11286
11287
11288 // ----------
11289 // xattrs
11290
11291 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11292 const UserPerm& perms)
11293 {
11294 std::lock_guard lock(client_lock);
11295
11296 if (unmounting)
11297 return -ENOTCONN;
11298
11299 InodeRef in;
11300 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11301 if (r < 0)
11302 return r;
11303 return _getxattr(in, name, value, size, perms);
11304 }
11305
11306 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11307 const UserPerm& perms)
11308 {
11309 std::lock_guard lock(client_lock);
11310
11311 if (unmounting)
11312 return -ENOTCONN;
11313
11314 InodeRef in;
11315 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11316 if (r < 0)
11317 return r;
11318 return _getxattr(in, name, value, size, perms);
11319 }
11320
11321 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11322 const UserPerm& perms)
11323 {
11324 std::lock_guard lock(client_lock);
11325
11326 if (unmounting)
11327 return -ENOTCONN;
11328
11329 Fh *f = get_filehandle(fd);
11330 if (!f)
11331 return -EBADF;
11332 return _getxattr(f->inode, name, value, size, perms);
11333 }
11334
11335 int Client::listxattr(const char *path, char *list, size_t size,
11336 const UserPerm& perms)
11337 {
11338 std::lock_guard lock(client_lock);
11339
11340 if (unmounting)
11341 return -ENOTCONN;
11342
11343 InodeRef in;
11344 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11345 if (r < 0)
11346 return r;
11347 return Client::_listxattr(in.get(), list, size, perms);
11348 }
11349
11350 int Client::llistxattr(const char *path, char *list, size_t size,
11351 const UserPerm& perms)
11352 {
11353 std::lock_guard lock(client_lock);
11354
11355 if (unmounting)
11356 return -ENOTCONN;
11357
11358 InodeRef in;
11359 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11360 if (r < 0)
11361 return r;
11362 return Client::_listxattr(in.get(), list, size, perms);
11363 }
11364
11365 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11366 {
11367 std::lock_guard lock(client_lock);
11368
11369 if (unmounting)
11370 return -ENOTCONN;
11371
11372 Fh *f = get_filehandle(fd);
11373 if (!f)
11374 return -EBADF;
11375 return Client::_listxattr(f->inode.get(), list, size, perms);
11376 }
11377
11378 int Client::removexattr(const char *path, const char *name,
11379 const UserPerm& perms)
11380 {
11381 std::lock_guard lock(client_lock);
11382
11383 if (unmounting)
11384 return -ENOTCONN;
11385
11386 InodeRef in;
11387 int r = Client::path_walk(path, &in, perms, true);
11388 if (r < 0)
11389 return r;
11390 return _removexattr(in, name, perms);
11391 }
11392
11393 int Client::lremovexattr(const char *path, const char *name,
11394 const UserPerm& perms)
11395 {
11396 std::lock_guard lock(client_lock);
11397
11398 if (unmounting)
11399 return -ENOTCONN;
11400
11401 InodeRef in;
11402 int r = Client::path_walk(path, &in, perms, false);
11403 if (r < 0)
11404 return r;
11405 return _removexattr(in, name, perms);
11406 }
11407
11408 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11409 {
11410 std::lock_guard lock(client_lock);
11411
11412 if (unmounting)
11413 return -ENOTCONN;
11414
11415 Fh *f = get_filehandle(fd);
11416 if (!f)
11417 return -EBADF;
11418 return _removexattr(f->inode, name, perms);
11419 }
11420
11421 int Client::setxattr(const char *path, const char *name, const void *value,
11422 size_t size, int flags, const UserPerm& perms)
11423 {
11424 _setxattr_maybe_wait_for_osdmap(name, value, size);
11425
11426 std::lock_guard lock(client_lock);
11427
11428 if (unmounting)
11429 return -ENOTCONN;
11430
11431 InodeRef in;
11432 int r = Client::path_walk(path, &in, perms, true);
11433 if (r < 0)
11434 return r;
11435 return _setxattr(in, name, value, size, flags, perms);
11436 }
11437
11438 int Client::lsetxattr(const char *path, const char *name, const void *value,
11439 size_t size, int flags, const UserPerm& perms)
11440 {
11441 _setxattr_maybe_wait_for_osdmap(name, value, size);
11442
11443 std::lock_guard lock(client_lock);
11444
11445 if (unmounting)
11446 return -ENOTCONN;
11447
11448 InodeRef in;
11449 int r = Client::path_walk(path, &in, perms, false);
11450 if (r < 0)
11451 return r;
11452 return _setxattr(in, name, value, size, flags, perms);
11453 }
11454
11455 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11456 int flags, const UserPerm& perms)
11457 {
11458 _setxattr_maybe_wait_for_osdmap(name, value, size);
11459
11460 std::lock_guard lock(client_lock);
11461
11462 if (unmounting)
11463 return -ENOTCONN;
11464
11465 Fh *f = get_filehandle(fd);
11466 if (!f)
11467 return -EBADF;
11468 return _setxattr(f->inode, name, value, size, flags, perms);
11469 }
11470
/*
 * Core getxattr: handles virtual "ceph.*" xattrs via their callbacks,
 * then falls back to the inode's real xattr map (refreshed from the
 * MDS when needed).
 *
 * Returns the attribute length on success (value copied only when
 * size != 0), -ERANGE if the caller's buffer is too small, -ENODATA
 * when the attribute doesn't exist, or a getattr error.
 */
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    // NOTE(review): this assignment is dead — r is unconditionally
    // overwritten by the _getattr() result below.
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    // size == 0 is a length-only probe; otherwise copy out or -ERANGE
    if (size != 0) {
      if (r > (int)size) {
	r = -ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // without ACL support, "system.*" attributes are unsupported
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  // force a fetch only if we've never seen any xattrs for this inode
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
11536
11537 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11538 const UserPerm& perms)
11539 {
11540 if (cct->_conf->client_permissions) {
11541 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11542 if (r < 0)
11543 return r;
11544 }
11545 return _getxattr(in.get(), name, value, size, perms);
11546 }
11547
11548 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11549 size_t size, const UserPerm& perms)
11550 {
11551 std::lock_guard lock(client_lock);
11552
11553 if (unmounting)
11554 return -ENOTCONN;
11555
11556 vinodeno_t vino = _get_vino(in);
11557
11558 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11559 tout(cct) << __func__ << std::endl;
11560 tout(cct) << vino.ino.val << std::endl;
11561 tout(cct) << name << std::endl;
11562
11563 if (!fuse_default_permissions) {
11564 int r = xattr_permission(in, name, MAY_READ, perms);
11565 if (r < 0)
11566 return r;
11567 }
11568
11569 return _getxattr(in, name, value, size, perms);
11570 }
11571
/*
 * Fill 'name' with the inode's xattr names as consecutive
 * NUL-terminated strings.
 *
 * With size == 0 this is a length-only probe: nothing is copied and
 * the total buffer length needed is returned.  Otherwise returns the
 * number of bytes written, -ERANGE if the buffer is too small, or a
 * getattr error.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // force a fetch only if we've never seen any xattrs for this inode
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // +1 for the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11601
11602 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11603 const UserPerm& perms)
11604 {
11605 std::lock_guard lock(client_lock);
11606
11607 if (unmounting)
11608 return -ENOTCONN;
11609
11610 vinodeno_t vino = _get_vino(in);
11611
11612 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11613 tout(cct) << __func__ << std::endl;
11614 tout(cct) << vino.ino.val << std::endl;
11615 tout(cct) << size << std::endl;
11616
11617 return _listxattr(in, names, size, perms);
11618 }
11619
11620 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11621 size_t size, int flags, const UserPerm& perms)
11622 {
11623
11624 int xattr_flags = 0;
11625 if (!value)
11626 xattr_flags |= CEPH_XATTR_REMOVE;
11627 if (flags & XATTR_CREATE)
11628 xattr_flags |= CEPH_XATTR_CREATE;
11629 if (flags & XATTR_REPLACE)
11630 xattr_flags |= CEPH_XATTR_REPLACE;
11631
11632 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11633 filepath path;
11634 in->make_nosnap_relative_path(path);
11635 req->set_filepath(path);
11636 req->set_string2(name);
11637 req->set_inode(in);
11638 req->head.args.setxattr.flags = xattr_flags;
11639
11640 bufferlist bl;
11641 assert (value || size == 0);
11642 bl.append((const char*)value, size);
11643 req->set_data(bl);
11644
11645 int res = make_request(req, perms);
11646
11647 trim_cache();
11648 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11649 res << dendl;
11650 return res;
11651 }
11652
/*
 * Core setxattr: validates the attribute namespace, handles POSIX ACL
 * attributes specially, guards read-only virtual xattrs, and finally
 * issues the MDS request via _do_setxattr().
 *
 * Returns 0 on success or a negative errno (-EROFS on snapshots,
 * -EOPNOTSUPP for unsupported namespaces/read-only vxattrs, -EINVAL
 * for bad input, or the _do_setxattr()/setattr error).
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // snapshots are immutable
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // normalize: empty value is allowed, but a NULL pointer with a
  // nonzero size is not
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
    return -EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // only well-known namespaces (plus ACLs when enabled) are accepted
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is equivalent to the mode bits alone: store no xattr
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // apply the mode implied by the ACL before storing it
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// default ACLs only make sense on directories
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // setting a quota requires a snaprealm rooted at this inode;
      // verify that after the request completes (see below)
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11733
11734 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11735 size_t size, int flags, const UserPerm& perms)
11736 {
11737 if (cct->_conf->client_permissions) {
11738 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11739 if (r < 0)
11740 return r;
11741 }
11742 return _setxattr(in.get(), name, value, size, flags, perms);
11743 }
11744
11745 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11746 {
11747 string tmp;
11748 if (name == "layout") {
11749 string::iterator begin = value.begin();
11750 string::iterator end = value.end();
11751 keys_and_values<string::iterator> p; // create instance of parser
11752 std::map<string, string> m; // map to receive results
11753 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11754 return -EINVAL;
11755 }
11756 if (begin != end)
11757 return -EINVAL;
11758 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11759 if (q->first == "pool") {
11760 tmp = q->second;
11761 break;
11762 }
11763 }
11764 } else if (name == "layout.pool") {
11765 tmp = value;
11766 }
11767
11768 if (tmp.length()) {
11769 int64_t pool;
11770 try {
11771 pool = boost::lexical_cast<unsigned>(tmp);
11772 if (!osdmap->have_pg_pool(pool))
11773 return -ENOENT;
11774 } catch (boost::bad_lexical_cast const&) {
11775 pool = osdmap->lookup_pg_pool_name(tmp);
11776 if (pool < 0) {
11777 return -ENOENT;
11778 }
11779 }
11780 }
11781
11782 return 0;
11783 }
11784
11785 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11786 {
11787 // For setting pool of layout, MetaRequest need osdmap epoch.
11788 // There is a race which create a new data pool but client and mds both don't have.
11789 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11790 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11791 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11792 string rest(strstr(name, "layout"));
11793 string v((const char*)value, size);
11794 int r = objecter->with_osdmap([&](const OSDMap& o) {
11795 return _setxattr_check_data_pool(rest, v, &o);
11796 });
11797
11798 if (r == -ENOENT) {
11799 C_SaferCond ctx;
11800 objecter->wait_for_latest_osdmap(&ctx);
11801 ctx.wait();
11802 }
11803 }
11804 }
11805
11806 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11807 size_t size, int flags, const UserPerm& perms)
11808 {
11809 _setxattr_maybe_wait_for_osdmap(name, value, size);
11810
11811 std::lock_guard lock(client_lock);
11812
11813 if (unmounting)
11814 return -ENOTCONN;
11815
11816 vinodeno_t vino = _get_vino(in);
11817
11818 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11819 tout(cct) << __func__ << std::endl;
11820 tout(cct) << vino.ino.val << std::endl;
11821 tout(cct) << name << std::endl;
11822
11823 if (!fuse_default_permissions) {
11824 int r = xattr_permission(in, name, MAY_WRITE, perms);
11825 if (r < 0)
11826 return r;
11827 }
11828 return _setxattr(in, name, value, size, flags, perms);
11829 }
11830
11831 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11832 {
11833 if (in->snapid != CEPH_NOSNAP) {
11834 return -EROFS;
11835 }
11836
11837 // same xattrs supported by kernel client
11838 if (strncmp(name, "user.", 5) &&
11839 strncmp(name, "system.", 7) &&
11840 strncmp(name, "security.", 9) &&
11841 strncmp(name, "trusted.", 8) &&
11842 strncmp(name, "ceph.", 5))
11843 return -EOPNOTSUPP;
11844
11845 const VXattr *vxattr = _match_vxattr(in, name);
11846 if (vxattr && vxattr->readonly)
11847 return -EOPNOTSUPP;
11848
11849 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11850 filepath path;
11851 in->make_nosnap_relative_path(path);
11852 req->set_filepath(path);
11853 req->set_filepath2(name);
11854 req->set_inode(in);
11855
11856 int res = make_request(req, perms);
11857
11858 trim_cache();
11859 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11860 return res;
11861 }
11862
11863 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11864 {
11865 if (cct->_conf->client_permissions) {
11866 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11867 if (r < 0)
11868 return r;
11869 }
11870 return _removexattr(in.get(), name, perms);
11871 }
11872
11873 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11874 {
11875 std::lock_guard lock(client_lock);
11876
11877 if (unmounting)
11878 return -ENOTCONN;
11879
11880 vinodeno_t vino = _get_vino(in);
11881
11882 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11883 tout(cct) << "ll_removexattr" << std::endl;
11884 tout(cct) << vino.ino.val << std::endl;
11885 tout(cct) << name << std::endl;
11886
11887 if (!fuse_default_permissions) {
11888 int r = xattr_permission(in, name, MAY_WRITE, perms);
11889 if (r < 0)
11890 return r;
11891 }
11892
11893 return _removexattr(in, name, perms);
11894 }
11895
11896 bool Client::_vxattrcb_quota_exists(Inode *in)
11897 {
11898 return in->quota.is_enable() &&
11899 (in->snapid != CEPH_NOSNAP ||
11900 (in->snaprealm && in->snaprealm->ino == in->ino));
11901 }
// Render both quota limits as "max_bytes=N max_files=N".
// Like snprintf, returns the length the output would have had.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// Individual quota fields.
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11917
11918 bool Client::_vxattrcb_layout_exists(Inode *in)
11919 {
11920 return in->layout != file_layout_t();
11921 }
11922 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11923 {
11924 int r = snprintf(val, size,
11925 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11926 (unsigned long long)in->layout.stripe_unit,
11927 (unsigned long long)in->layout.stripe_count,
11928 (unsigned long long)in->layout.object_size);
11929 objecter->with_osdmap([&](const OSDMap& o) {
11930 if (o.have_pg_pool(in->layout.pool_id))
11931 r += snprintf(val + r, size - r, "%s",
11932 o.get_pool_name(in->layout.pool_id).c_str());
11933 else
11934 r += snprintf(val + r, size - r, "%" PRIu64,
11935 (uint64_t)in->layout.pool_id);
11936 });
11937 if (in->layout.pool_ns.length())
11938 r += snprintf(val + r, size - r, " pool_namespace=%s",
11939 in->layout.pool_ns.c_str());
11940 return r;
11941 }
// Per-field layout vxattr callbacks.  Each writes at most 'size' bytes into
// 'val' and returns the snprintf would-be length.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// Pool is rendered as its name when present in the osdmap, else as the
// numeric pool id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;  // always assigned inside the lambda below
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
		   in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics vxattrs.  dirstat counts the directory's own
// children; the r* values come from rstat and appear to be recursive over
// the subtree (matching the MDS recursive-stat accounting).
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  // recursive ctime, rendered as <seconds>.<9-digit nanoseconds>
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
                  (long)in->rstat.rctime.nsec());
}
// The export pin vxattr only "exists" when dir_pin differs from -ENODATA;
// -ENODATA presumably acts as the "no pin set" sentinel — confirm on the
// MDS side.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
12011
// A zero snap_btime is treated as "no birth time recorded".
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

// Snapshot birth time, rendered as <seconds>.<9-digit nanoseconds>.
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
                  (long long unsigned)in->snap_btime.sec(),
                  (long unsigned)in->snap_btime.nsec());
}

// Cluster fsid as reported by the monitor client.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// This client's entity name, type string followed by the numeric id
// (presumably rendering as e.g. "client.1234" — verify against
// entity_name_t::type_str()).
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%ld", name.type_str(), name.num());
}
12034
// Helpers for building vxattr table entries.  CEPH_XATTR_NAME composes
// "ceph.<type>.<name>"; CEPH_XATTR_NAME2 appends a third component.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only stat-style vxattr entry with no existence callback (always
// present).  Uses GNU designated-initializer (label:) syntax, matching the
// VXattr field order.
#define XATTR_NAME_CEPH(_type, _name, _flags)                 \
{                                                             \
  name: CEPH_XATTR_NAME(_type, _name),                        \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,    \
  readonly: true,                                             \
  exists_cb: NULL,                                            \
  flags: _flags,                                              \
}
// Writable layout-field vxattr; only reported when the layout is
// non-default (see _vxattrcb_layout_exists).
#define XATTR_LAYOUT_FIELD(_type, _name, _field)              \
{                                                             \
  name: CEPH_XATTR_NAME2(_type, _name, _field),               \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,   \
  readonly: false,                                            \
  exists_cb: &Client::_vxattrcb_layout_exists,                \
  flags: 0,                                                   \
}
// Writable quota-field vxattr; only reported when a quota is in effect
// (see _vxattrcb_quota_exists).
#define XATTR_QUOTA_FIELD(_type, _name)                       \
{                                                             \
  name: CEPH_XATTR_NAME(_type, _name),                        \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,    \
  readonly: false,                                            \
  exists_cb: &Client::_vxattrcb_quota_exists,                 \
  flags: 0,                                                   \
}
12062
// Virtual xattrs available on directory inodes.  Scanned linearly by
// _match_vxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  // whole layout as one string
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // individual layout fields
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // directory statistics (read-only)
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  // quota: combined and per-field
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // MDS export pin
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  // snapshot birth time (read-only)
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12109
// Virtual xattrs available on regular-file inodes; same layout entries as
// the directory table, plus the snapshot birth time.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12132
// Virtual xattrs available on every inode type; consulted by _match_vxattr
// after the per-type table.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12150
12151 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12152 {
12153 if (in->is_dir())
12154 return _dir_vxattrs;
12155 else if (in->is_file())
12156 return _file_vxattrs;
12157 return NULL;
12158 }
12159
12160 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12161 {
12162 if (strncmp(name, "ceph.", 5) == 0) {
12163 const VXattr *vxattr = _get_vxattrs(in);
12164 if (vxattr) {
12165 while (!vxattr->name.empty()) {
12166 if (vxattr->name == name)
12167 return vxattr;
12168 vxattr++;
12169 }
12170 }
12171
12172 // for common vxattrs
12173 vxattr = _common_vxattrs;
12174 while (!vxattr->name.empty()) {
12175 if (vxattr->name == name)
12176 return vxattr;
12177 vxattr++;
12178 }
12179 }
12180
12181 return NULL;
12182 }
12183
12184 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12185 {
12186 std::lock_guard lock(client_lock);
12187
12188 if (unmounting)
12189 return -ENOTCONN;
12190
12191 vinodeno_t vino = _get_vino(in);
12192
12193 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12194 tout(cct) << "ll_readlink" << std::endl;
12195 tout(cct) << vino.ino.val << std::endl;
12196
12197 for (auto dn : in->dentries) {
12198 touch_dn(dn);
12199 }
12200
12201 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12202 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12203 return r;
12204 }
12205
// Create a filesystem node named 'name' in directory 'dir' via
// CEPH_MDS_OP_MKNOD.  On success *inp refers to the new inode.
// Returns 0 or a negative errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // cannot create entries in a snapshot
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota on the enclosing quota root
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // inherit default ACLs from the parent; may also adjust 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12259
12260 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12261 dev_t rdev, struct stat *attr, Inode **out,
12262 const UserPerm& perms)
12263 {
12264 std::lock_guard lock(client_lock);
12265
12266 if (unmounting)
12267 return -ENOTCONN;
12268
12269 vinodeno_t vparent = _get_vino(parent);
12270
12271 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12272 tout(cct) << "ll_mknod" << std::endl;
12273 tout(cct) << vparent.ino.val << std::endl;
12274 tout(cct) << name << std::endl;
12275 tout(cct) << mode << std::endl;
12276 tout(cct) << rdev << std::endl;
12277
12278 if (!fuse_default_permissions) {
12279 int r = may_create(parent, perms);
12280 if (r < 0)
12281 return r;
12282 }
12283
12284 InodeRef in;
12285 int r = _mknod(parent, name, mode, rdev, perms, &in);
12286 if (r == 0) {
12287 fill_stat(in, attr);
12288 _ll_get(in.get());
12289 }
12290 tout(cct) << attr->st_ino << std::endl;
12291 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12292 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12293 *out = in.get();
12294 return r;
12295 }
12296
12297 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12298 dev_t rdev, Inode **out,
12299 struct ceph_statx *stx, unsigned want, unsigned flags,
12300 const UserPerm& perms)
12301 {
12302 unsigned caps = statx_to_mask(flags, want);
12303 std::lock_guard lock(client_lock);
12304
12305 if (unmounting)
12306 return -ENOTCONN;
12307
12308 vinodeno_t vparent = _get_vino(parent);
12309
12310 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12311 tout(cct) << "ll_mknodx" << std::endl;
12312 tout(cct) << vparent.ino.val << std::endl;
12313 tout(cct) << name << std::endl;
12314 tout(cct) << mode << std::endl;
12315 tout(cct) << rdev << std::endl;
12316
12317 if (!fuse_default_permissions) {
12318 int r = may_create(parent, perms);
12319 if (r < 0)
12320 return r;
12321 }
12322
12323 InodeRef in;
12324 int r = _mknod(parent, name, mode, rdev, perms, &in);
12325 if (r == 0) {
12326 fill_statx(in, caps, stx);
12327 _ll_get(in.get());
12328 }
12329 tout(cct) << stx->stx_ino << std::endl;
12330 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12331 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12332 *out = in.get();
12333 return r;
12334 }
12335
// Create a regular file 'name' in 'dir' and, if fhp is non-NULL, open it.
//
// @param inp         set to the created (or pre-existing) inode
// @param fhp         if non-NULL, receives a newly created open file handle
// @param stripe_unit/stripe_count/object_size  file layout parameters
// @param data_pool   optional data pool name; must exist in the osdmap
// @param created     passed through to make_request; reports whether the
//                    file was newly created
// @return 0 or a negative errno
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // no creating inside snapshots
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve the requested data pool name (if any) to a pool id
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    // ids above 32 bits don't fit the open request — presumably a wire
    // format limit
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // inherit default ACLs from the parent; may modify 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12430
12431
// Create directory 'name' in 'dir'.  When 'dir' is the special snapdir this
// issues CEPH_MDS_OP_MKSNAP (creating a snapshot) instead of MKDIR.
// On success *inp refers to the new inode.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // creation is allowed on the live tree or in the snapdir (as a snapshot)
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // inherit default ACLs from the parent; may modify 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12487
12488 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12489 struct stat *attr, Inode **out, const UserPerm& perm)
12490 {
12491 std::lock_guard lock(client_lock);
12492
12493 if (unmounting)
12494 return -ENOTCONN;
12495
12496 vinodeno_t vparent = _get_vino(parent);
12497
12498 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12499 tout(cct) << "ll_mkdir" << std::endl;
12500 tout(cct) << vparent.ino.val << std::endl;
12501 tout(cct) << name << std::endl;
12502 tout(cct) << mode << std::endl;
12503
12504 if (!fuse_default_permissions) {
12505 int r = may_create(parent, perm);
12506 if (r < 0)
12507 return r;
12508 }
12509
12510 InodeRef in;
12511 int r = _mkdir(parent, name, mode, perm, &in);
12512 if (r == 0) {
12513 fill_stat(in, attr);
12514 _ll_get(in.get());
12515 }
12516 tout(cct) << attr->st_ino << std::endl;
12517 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12518 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12519 *out = in.get();
12520 return r;
12521 }
12522
12523 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12524 struct ceph_statx *stx, unsigned want, unsigned flags,
12525 const UserPerm& perms)
12526 {
12527 std::lock_guard lock(client_lock);
12528
12529 if (unmounting)
12530 return -ENOTCONN;
12531
12532 vinodeno_t vparent = _get_vino(parent);
12533
12534 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12535 tout(cct) << "ll_mkdirx" << std::endl;
12536 tout(cct) << vparent.ino.val << std::endl;
12537 tout(cct) << name << std::endl;
12538 tout(cct) << mode << std::endl;
12539
12540 if (!fuse_default_permissions) {
12541 int r = may_create(parent, perms);
12542 if (r < 0)
12543 return r;
12544 }
12545
12546 InodeRef in;
12547 int r = _mkdir(parent, name, mode, perms, &in);
12548 if (r == 0) {
12549 fill_statx(in, statx_to_mask(flags, want), stx);
12550 _ll_get(in.get());
12551 } else {
12552 stx->stx_ino = 0;
12553 stx->stx_mask = 0;
12554 }
12555 tout(cct) << stx->stx_ino << std::endl;
12556 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12557 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12558 *out = in.get();
12559 return r;
12560 }
12561
// Create a symlink 'name' -> 'target' in directory 'dir' via
// CEPH_MDS_OP_SYMLINK.  On success *inp refers to the new inode.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // the link target travels in the request's second string
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12607
12608 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12609 struct stat *attr, Inode **out, const UserPerm& perms)
12610 {
12611 std::lock_guard lock(client_lock);
12612
12613 if (unmounting)
12614 return -ENOTCONN;
12615
12616 vinodeno_t vparent = _get_vino(parent);
12617
12618 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12619 << dendl;
12620 tout(cct) << "ll_symlink" << std::endl;
12621 tout(cct) << vparent.ino.val << std::endl;
12622 tout(cct) << name << std::endl;
12623 tout(cct) << value << std::endl;
12624
12625 if (!fuse_default_permissions) {
12626 int r = may_create(parent, perms);
12627 if (r < 0)
12628 return r;
12629 }
12630
12631 InodeRef in;
12632 int r = _symlink(parent, name, value, perms, &in);
12633 if (r == 0) {
12634 fill_stat(in, attr);
12635 _ll_get(in.get());
12636 }
12637 tout(cct) << attr->st_ino << std::endl;
12638 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12639 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12640 *out = in.get();
12641 return r;
12642 }
12643
12644 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12645 Inode **out, struct ceph_statx *stx, unsigned want,
12646 unsigned flags, const UserPerm& perms)
12647 {
12648 std::lock_guard lock(client_lock);
12649
12650 if (unmounting)
12651 return -ENOTCONN;
12652
12653 vinodeno_t vparent = _get_vino(parent);
12654
12655 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12656 << dendl;
12657 tout(cct) << "ll_symlinkx" << std::endl;
12658 tout(cct) << vparent.ino.val << std::endl;
12659 tout(cct) << name << std::endl;
12660 tout(cct) << value << std::endl;
12661
12662 if (!fuse_default_permissions) {
12663 int r = may_create(parent, perms);
12664 if (r < 0)
12665 return r;
12666 }
12667
12668 InodeRef in;
12669 int r = _symlink(parent, name, value, perms, &in);
12670 if (r == 0) {
12671 fill_statx(in, statx_to_mask(flags, want), stx);
12672 _ll_get(in.get());
12673 }
12674 tout(cct) << stx->stx_ino << std::endl;
12675 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12676 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12677 *out = in.get();
12678 return r;
12679 }
12680
// Unlink 'name' from directory 'dir' via CEPH_MDS_OP_UNLINK.  The target
// inode is looked up first so it can be attached to the request and its
// file delegations broken before the link count changes.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being unlinked so we can attach it to the request
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  // revoke any outstanding delegations on the target before unlinking
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12730
12731 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12732 {
12733 std::lock_guard lock(client_lock);
12734
12735 if (unmounting)
12736 return -ENOTCONN;
12737
12738 vinodeno_t vino = _get_vino(in);
12739
12740 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12741 tout(cct) << "ll_unlink" << std::endl;
12742 tout(cct) << vino.ino.val << std::endl;
12743 tout(cct) << name << std::endl;
12744
12745 if (!fuse_default_permissions) {
12746 int r = may_delete(in, name, perm);
12747 if (r < 0)
12748 return r;
12749 }
12750 return _unlink(in, name, perm);
12751 }
12752
// Remove directory (or snapshot) `name` under `dir`.  A CEPH_SNAPDIR
// parent turns this into a CEPH_MDS_OP_RMSNAP; otherwise it is a
// regular CEPH_MDS_OP_RMDIR.  Caller must hold client_lock.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
                << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // only live dirs and the .snap pseudo-dir are mutable
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    // RMSNAP: keep our own ref on the dentry instead of handing it to
    // the request, since we unlink it manually below
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  // NOTE(review): if _lookup fails on the RMSNAP path, the de->get()
  // above is never balanced by de->put() -- looks like a leaked dentry
  // ref; confirm.

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate it ourselves
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12805
12806 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12807 {
12808 std::lock_guard lock(client_lock);
12809
12810 if (unmounting)
12811 return -ENOTCONN;
12812
12813 vinodeno_t vino = _get_vino(in);
12814
12815 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12816 tout(cct) << "ll_rmdir" << std::endl;
12817 tout(cct) << vino.ino.val << std::endl;
12818 tout(cct) << name << std::endl;
12819
12820 if (!fuse_default_permissions) {
12821 int r = may_delete(in, name, perms);
12822 if (r < 0)
12823 return r;
12824 }
12825
12826 return _rmdir(in, name, perms);
12827 }
12828
// Rename fromdir/fromname to todir/toname via the MDS.  Renaming within
// a .snap directory becomes CEPH_MDS_OP_RENAMESNAP; cross-snapid renames
// are rejected with -EXDEV and any other snapshot rename with -EROFS.
// When the source and destination live under different quota roots the
// destination quota is pre-checked client-side (a directory move that
// would exceed quota returns -EXDEV so the caller can fall back to
// copy+delete; a file returns -EDQUOT).  Caller must hold client_lock.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // only renaming a snapshot within its own .snap dir is allowed
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    if (fromdir != todir) {
      // resolve the nearest enclosing quota root on each side
      fromdir_root =
        fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
        todir->quota.is_enable() ? todir : get_quota_root(todir, perm);

      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
        // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
        // to auth MDS to get latest rstat for todir_root and source dir
        // even if their dentry caches and inode caps are satisfied.
        res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
        if (res < 0)
          goto fail;

        quota_check = true;
        if (oldde->inode && oldde->inode->is_dir()) {
          // need fresh recursive stats for the moved directory too
          mask |= CEPH_STAT_RSTAT;
        }
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      // size/file-count the move would add to the destination quota root
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
        old_bytes = oldinode->rstat.rbytes;
        old_files = oldinode->rstat.rsize();
      } else {
        old_bytes = oldinode->size;
        old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
          (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
                       << old_bytes << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
          (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
                       << old_files << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (quota_exceed) {
        // -EXDEV for dirs so callers can retry as copy+unlink
        res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
        goto fail;
      }
    }

    // the destination may or may not exist; -ENOENT is not an error
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12984
12985 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12986 const char *newname, const UserPerm& perm)
12987 {
12988 std::lock_guard lock(client_lock);
12989
12990 if (unmounting)
12991 return -ENOTCONN;
12992
12993 vinodeno_t vparent = _get_vino(parent);
12994 vinodeno_t vnewparent = _get_vino(newparent);
12995
12996 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12997 << vnewparent << " " << newname << dendl;
12998 tout(cct) << "ll_rename" << std::endl;
12999 tout(cct) << vparent.ino.val << std::endl;
13000 tout(cct) << name << std::endl;
13001 tout(cct) << vnewparent.ino.val << std::endl;
13002 tout(cct) << newname << std::endl;
13003
13004 if (!fuse_default_permissions) {
13005 int r = may_delete(parent, name, perm);
13006 if (r < 0)
13007 return r;
13008 r = may_delete(newparent, newname, perm);
13009 if (r < 0 && r != -ENOENT)
13010 return r;
13011 }
13012
13013 return _rename(parent, name, newparent, newname, perm);
13014 }
13015
// Create a hard link `dir`/`newname` to existing inode `in` via
// CEPH_MDS_OP_LINK.  Caller must hold client_lock.  Returns 0 or a
// negative errno (-ENAMETOOLONG, -EROFS for snapshots, -EDQUOT when the
// file-count quota is exhausted, or a request error).
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // neither the target inode nor the directory may be a snapshot
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // recall delegations before the link count changes
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13060
13061 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13062 const UserPerm& perm)
13063 {
13064 std::lock_guard lock(client_lock);
13065
13066 if (unmounting)
13067 return -ENOTCONN;
13068
13069 vinodeno_t vino = _get_vino(in);
13070 vinodeno_t vnewparent = _get_vino(newparent);
13071
13072 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13073 newname << dendl;
13074 tout(cct) << "ll_link" << std::endl;
13075 tout(cct) << vino.ino.val << std::endl;
13076 tout(cct) << vnewparent << std::endl;
13077 tout(cct) << newname << std::endl;
13078
13079 InodeRef target;
13080
13081 if (!fuse_default_permissions) {
13082 if (S_ISDIR(in->mode))
13083 return -EPERM;
13084
13085 int r = may_hardlink(in, perm);
13086 if (r < 0)
13087 return r;
13088
13089 r = may_create(newparent, perm);
13090 if (r < 0)
13091 return r;
13092 }
13093
13094 return _link(in, newparent, newname, perm, &target);
13095 }
13096
13097 int Client::ll_num_osds(void)
13098 {
13099 std::lock_guard lock(client_lock);
13100 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13101 }
13102
// Look up OSD `osd`'s first address and store it, host byte order, in
// *addr.  Returns 0 on success, -1 if the OSD does not exist in the map.
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::lock_guard lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  // NOTE(review): assumes the OSD's first address is IPv4 -- in4_addr()
  // on an IPv6 address would yield garbage; confirm this API is only
  // used on v4 clusters.
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
13120
13121 uint32_t Client::ll_stripe_unit(Inode *in)
13122 {
13123 std::lock_guard lock(client_lock);
13124 return in->layout.stripe_unit;
13125 }
13126
// Snapshot sequence number of the snap realm `in` belongs to.
// NOTE(review): dereferences in->snaprealm unconditionally -- assumes
// every inode handed to this API is attached to a realm; confirm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
13132
13133 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13134 {
13135 std::lock_guard lock(client_lock);
13136 *layout = in->layout;
13137 return 0;
13138 }
13139
// Fh-based overload: forwards to the Inode variant using the handle's
// inode.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
13144
13145 /* Currently we cannot take advantage of redundancy in reads, since we
13146 would have to go through all possible placement groups (a
13147 potentially quite large number determined by a hash), and use CRUSH
13148 to calculate the appropriate set of OSDs for each placement group,
13149 then index into that. An array with one entry per OSD is much more
13150 tractable and works for demonstration purposes. */
13151
// Map logical block `blockno` of `in` (under *layout) to the primary
// OSD currently serving it.  Returns the primary OSD id from the acting
// set of the object's PG.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  // NOTE(review): assumes su != 0 and object_size >= su; a degenerate
  // layout would divide by zero here or below -- confirm layouts are
  // validated upstream.
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count; // which horizontal stripe (Y)
    stripepos = blockno % stripe_count; // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object; // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  object_t oid = file_object_t(ino, objectno);
  // resolve object -> PG -> acting set; the first entry's primary is the
  // answer
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
        o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
13182
/* Return the offset of the block, internal to the object */

uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): divides by su and stripes_per_object without a zero
  // check -- assumes a valid layout; confirm.
  uint64_t stripes_per_object = object_size / su;

  // position of the block within its object, in bytes
  return (blockno % stripes_per_object) * su;
}
13195
13196 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13197 const UserPerm& perms)
13198 {
13199 std::lock_guard lock(client_lock);
13200
13201 if (unmounting)
13202 return -ENOTCONN;
13203
13204 vinodeno_t vino = _get_vino(in);
13205
13206 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13207 tout(cct) << "ll_opendir" << std::endl;
13208 tout(cct) << vino.ino.val << std::endl;
13209
13210 if (!fuse_default_permissions) {
13211 int r = may_open(in, flags, perms);
13212 if (r < 0)
13213 return r;
13214 }
13215
13216 int r = _opendir(in, dirpp, perms);
13217 tout(cct) << (unsigned long)*dirpp << std::endl;
13218
13219 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13220 << dendl;
13221 return r;
13222 }
13223
13224 int Client::ll_releasedir(dir_result_t *dirp)
13225 {
13226 std::lock_guard lock(client_lock);
13227 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13228 tout(cct) << "ll_releasedir" << std::endl;
13229 tout(cct) << (unsigned long)dirp << std::endl;
13230
13231 if (unmounting)
13232 return -ENOTCONN;
13233
13234 _closedir(dirp);
13235 return 0;
13236 }
13237
13238 int Client::ll_fsyncdir(dir_result_t *dirp)
13239 {
13240 std::lock_guard lock(client_lock);
13241 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13242 tout(cct) << "ll_fsyncdir" << std::endl;
13243 tout(cct) << (unsigned long)dirp << std::endl;
13244
13245 if (unmounting)
13246 return -ENOTCONN;
13247
13248 return _fsync(dirp->inode.get(), false);
13249 }
13250
// Low-level open of an existing inode.  O_CREAT is explicitly not
// supported here (use ll_create/ll_createx).  On success, any handle
// produced is remembered in ll_unclosed_fh_set for leak tracking.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // NOTE(review): on the permission-failure goto, *fhp is read here
  // without this function ever having written it -- assumes callers
  // pass fhp pre-initialized (e.g. to NULL); confirm.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13286
// Shared implementation behind ll_create/ll_createx: look the name up,
// create it if absent and O_CREAT is set, then open it.  On success *in
// holds the (possibly new) inode and *fhp the open handle; every handle
// produced is recorded in ll_unclosed_fh_set.  Caller must hold
// client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
        goto out;
    }
    // _create may already return an open handle in *fhp
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
                perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission, then open it unless
    // we somehow already hold a handle
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
        if (*fhp) {
          int release_r = _release_fh(*fhp);
          ceph_assert(release_r == 0); // during create, no async data ops should have happened
        }
        goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
        goto out;
    }
  }

out:
  if (*fhp) {
    // track handles handed out through the ll_* interface
    ll_unclosed_fh_set.insert(*fhp);
  }

  // report the (possibly faked) inode number in the trace/log
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13368
13369 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13370 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13371 const UserPerm& perms)
13372 {
13373 std::lock_guard lock(client_lock);
13374 InodeRef in;
13375
13376 if (unmounting)
13377 return -ENOTCONN;
13378
13379 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13380 fhp, perms);
13381 if (r >= 0) {
13382 ceph_assert(in);
13383
13384 // passing an Inode in outp requires an additional ref
13385 if (outp) {
13386 _ll_get(in.get());
13387 *outp = in.get();
13388 }
13389 fill_stat(in, attr);
13390 } else {
13391 attr->st_ino = 0;
13392 }
13393
13394 return r;
13395 }
13396
13397 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13398 int oflags, Inode **outp, Fh **fhp,
13399 struct ceph_statx *stx, unsigned want, unsigned lflags,
13400 const UserPerm& perms)
13401 {
13402 unsigned caps = statx_to_mask(lflags, want);
13403 std::lock_guard lock(client_lock);
13404 InodeRef in;
13405
13406 if (unmounting)
13407 return -ENOTCONN;
13408
13409 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13410 if (r >= 0) {
13411 ceph_assert(in);
13412
13413 // passing an Inode in outp requires an additional ref
13414 if (outp) {
13415 _ll_get(in.get());
13416 *outp = in.get();
13417 }
13418 fill_statx(in, caps, stx);
13419 } else {
13420 stx->stx_ino = 0;
13421 stx->stx_mask = 0;
13422 }
13423
13424 return r;
13425 }
13426
13427 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13428 {
13429 std::lock_guard lock(client_lock);
13430 tout(cct) << "ll_lseek" << std::endl;
13431 tout(cct) << offset << std::endl;
13432 tout(cct) << whence << std::endl;
13433
13434 if (unmounting)
13435 return -ENOTCONN;
13436
13437 return _lseek(fh, offset, whence);
13438 }
13439
// Read up to `len` bytes at offset `off` from `fh` into *bl.  Returns
// the number of bytes read or a negative errno.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return a byte count larger than INT_MAX (the return type),
   * so clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
                << dendl;
  return r;
}
13459
// Synchronously read [offset, offset+length) of object `blockid` of
// inode `in` directly from the OSDs into `buf`, bypassing the client
// cache and capability machinery.  Returns the number of bytes read or
// a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
                 object_locator_t(layout->pool_id),
                 offset,
                 length,
                 vino.snapid,
                 &bl,
                 CEPH_OSD_FLAG_READ,
                 &onfinish);

  // drop client_lock while blocking on the OSD reply so other client
  // threads can make progress
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    // copy whatever came back into the caller's buffer and report its size
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13496
13497 /* It appears that the OSD doesn't return success unless the entire
13498 buffer was written, return the write length on success. */
13499
13500 int Client::ll_write_block(Inode *in, uint64_t blockid,
13501 char* buf, uint64_t offset,
13502 uint64_t length, file_layout_t* layout,
13503 uint64_t snapseq, uint32_t sync)
13504 {
13505 vinodeno_t vino = ll_get_vino(in);
13506 int r = 0;
13507 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13508
13509 if (length == 0) {
13510 return -EINVAL;
13511 }
13512 if (true || sync) {
13513 /* if write is stable, the epilogue is waiting on
13514 * flock */
13515 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13516 }
13517 object_t oid = file_object_t(vino.ino, blockid);
13518 SnapContext fakesnap;
13519 ceph::bufferlist bl;
13520 if (length > 0) {
13521 bl.push_back(buffer::copy(buf, length));
13522 }
13523
13524 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13525 << dendl;
13526
13527 fakesnap.seq = snapseq;
13528
13529 /* lock just in time */
13530 client_lock.lock();
13531 if (unmounting) {
13532 client_lock.unlock();
13533 return -ENOTCONN;
13534 }
13535
13536 objecter->write(oid,
13537 object_locator_t(layout->pool_id),
13538 offset,
13539 length,
13540 fakesnap,
13541 bl,
13542 ceph::real_clock::now(),
13543 0,
13544 onsafe.get());
13545
13546 client_lock.unlock();
13547 if (nullptr != onsafe) {
13548 r = onsafe->wait();
13549 }
13550
13551 if (r < 0) {
13552 return r;
13553 } else {
13554 return length;
13555 }
13556 }
13557
// Commit previously written blocks in [offset, offset+length) of `in`.
// The barrier-based implementation is compiled out below; this is
// currently a no-op that always reports success.
int Client::ll_commit_blocks(Inode *in,
                             uint64_t offset,
                             uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
                << offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13583
13584 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13585 {
13586 std::lock_guard lock(client_lock);
13587 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13588 "~" << len << dendl;
13589 tout(cct) << "ll_write" << std::endl;
13590 tout(cct) << (unsigned long)fh << std::endl;
13591 tout(cct) << off << std::endl;
13592 tout(cct) << len << std::endl;
13593
13594 if (unmounting)
13595 return -ENOTCONN;
13596
13597 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13598 len = std::min(len, (loff_t)INT_MAX);
13599 int r = _write(fh, off, len, data, NULL, 0);
13600 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13601 << dendl;
13602 return r;
13603 }
13604
13605 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13606 {
13607 std::lock_guard lock(client_lock);
13608 if (unmounting)
13609 return -ENOTCONN;
13610 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13611 }
13612
13613 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13614 {
13615 std::lock_guard lock(client_lock);
13616 if (unmounting)
13617 return -ENOTCONN;
13618 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13619 }
13620
13621 int Client::ll_flush(Fh *fh)
13622 {
13623 std::lock_guard lock(client_lock);
13624 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13625 tout(cct) << "ll_flush" << std::endl;
13626 tout(cct) << (unsigned long)fh << std::endl;
13627
13628 if (unmounting)
13629 return -ENOTCONN;
13630
13631 return _flush(fh);
13632 }
13633
13634 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13635 {
13636 std::lock_guard lock(client_lock);
13637 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13638 tout(cct) << "ll_fsync" << std::endl;
13639 tout(cct) << (unsigned long)fh << std::endl;
13640
13641 if (unmounting)
13642 return -ENOTCONN;
13643
13644 int r = _fsync(fh, syncdataonly);
13645 if (r) {
13646 // If we're returning an error, clear it from the FH
13647 fh->take_async_err();
13648 }
13649 return r;
13650 }
13651
13652 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13653 {
13654 std::lock_guard lock(client_lock);
13655 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13656 tout(cct) << "ll_sync_inode" << std::endl;
13657 tout(cct) << (unsigned long)in << std::endl;
13658
13659 if (unmounting)
13660 return -ENOTCONN;
13661
13662 return _fsync(in, syncdataonly);
13663 }
13664
13665 #ifdef FALLOC_FL_PUNCH_HOLE
13666
// fallocate(2)-style space manipulation on `fh`.  Only
// FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (which must come with
// KEEP_SIZE) are supported.  Hole punching zeroes the range either in
// the inline data (if small and we hold buffer caps) or via an OSD zero
// op; a plain allocate only ever grows the advertised file size.
// Caller must hold client_lock; the lock is dropped while waiting on
// OSD operations.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // punching a hole without keeping the size is not supported
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // allocating on a full pool would add data; punching a hole is still ok
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // growing the file must respect the byte quota
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // data is still inline and we may buffer writes: rebuild the
      // inline blob with the punched range zeroed in place
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);          // prefix before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;                   // clamp hole to inline size
        if (size > 0)
          bl.append_zero(size);                  // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // suffix after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // must go through the OSDs: first push any inline data out...
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      // ...then zero the range with a filer op and wait for it
      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop the client lock while the zero op is in flight
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocate: just extend the advertised size if needed
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // wait for the uninline started above; again without the lock held
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // -ECANCELED means the data was already uninlined by someone else
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13787 #else
13788
// Fallback when the platform lacks FALLOC_FL_PUNCH_HOLE: fallocate is
// not supported at all.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13793
13794 #endif
13795
13796
13797 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13798 {
13799 std::lock_guard lock(client_lock);
13800 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13801 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13802 tout(cct) << (unsigned long)fh << std::endl;
13803
13804 if (unmounting)
13805 return -ENOTCONN;
13806
13807 return _fallocate(fh, mode, offset, length);
13808 }
13809
13810 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13811 {
13812 std::lock_guard lock(client_lock);
13813 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13814
13815 if (unmounting)
13816 return -ENOTCONN;
13817
13818 Fh *fh = get_filehandle(fd);
13819 if (!fh)
13820 return -EBADF;
13821 #if defined(__linux__) && defined(O_PATH)
13822 if (fh->flags & O_PATH)
13823 return -EBADF;
13824 #endif
13825 return _fallocate(fh, mode, offset, length);
13826 }
13827
13828 int Client::ll_release(Fh *fh)
13829 {
13830 std::lock_guard lock(client_lock);
13831
13832 if (unmounting)
13833 return -ENOTCONN;
13834
13835 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13836 dendl;
13837 tout(cct) << __func__ << " (fh)" << std::endl;
13838 tout(cct) << (unsigned long)fh << std::endl;
13839
13840 if (ll_unclosed_fh_set.count(fh))
13841 ll_unclosed_fh_set.erase(fh);
13842 return _release_fh(fh);
13843 }
13844
13845 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13846 {
13847 std::lock_guard lock(client_lock);
13848
13849 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13850 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13851
13852 if (unmounting)
13853 return -ENOTCONN;
13854
13855 return _getlk(fh, fl, owner);
13856 }
13857
13858 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13859 {
13860 std::lock_guard lock(client_lock);
13861
13862 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13863 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13864
13865 if (unmounting)
13866 return -ENOTCONN;
13867
13868 return _setlk(fh, fl, owner, sleep);
13869 }
13870
13871 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13872 {
13873 std::lock_guard lock(client_lock);
13874
13875 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13876 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13877
13878 if (unmounting)
13879 return -ENOTCONN;
13880
13881 return _flock(fh, cmd, owner);
13882 }
13883
// Set the delegation timeout (seconds) used for delegations handed out by
// this client.  Returns 0 on success, -EINVAL if the timeout would not
// expire before the MDS session autoclose window.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13898
// Acquire or release a delegation on an open file handle.
// cmd == CEPH_DELEGATION_NONE drops any existing delegation; any other
// cmd is forwarded to Inode::set_deleg with the given callback and
// private data.  Returns 0 on success, -ENOTCONN if not mounted,
// -ENOMEM on allocation failure, or set_deleg's result (-EINVAL default).
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg may allocate; surface allocation failure as -ENOMEM
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13925
// Completion context queued on the interrupt finisher (see ll_interrupt)
// to abort an in-flight SETFILELOCK request.  Takes a reference on the
// request in the constructor so it stays valid until finish() runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // only file-lock requests are interruptible
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13941
// Request interruption of an in-flight MDS request; the opaque handle 'd'
// is the MetaRequest.  The actual work is deferred to the interrupt
// finisher thread rather than done inline here.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13949
13950 // =========================================
13951 // layout
13952
13953 // expose file layouts
13954
// Look up the file layout for the inode at 'relpath'.
// Fills *lp and returns 0 on success, a negative errno from the path
// walk on failure, or -ENOTCONN while unmounting.
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
13974
13975 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13976 {
13977 std::lock_guard lock(client_lock);
13978
13979 if (unmounting)
13980 return -ENOTCONN;
13981
13982 Fh *f = get_filehandle(fd);
13983 if (!f)
13984 return -EBADF;
13985 Inode *in = f->inode.get();
13986
13987 *lp = in->layout;
13988
13989 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13990 return 0;
13991 }
13992
13993 int64_t Client::get_default_pool_id()
13994 {
13995 std::lock_guard lock(client_lock);
13996
13997 if (unmounting)
13998 return -ENOTCONN;
13999
14000 /* first data pool is the default */
14001 return mdsmap->get_first_data_pool();
14002 }
14003
14004 // expose osdmap
14005
14006 int64_t Client::get_pool_id(const char *pool_name)
14007 {
14008 std::lock_guard lock(client_lock);
14009
14010 if (unmounting)
14011 return -ENOTCONN;
14012
14013 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14014 pool_name);
14015 }
14016
14017 string Client::get_pool_name(int64_t pool)
14018 {
14019 std::lock_guard lock(client_lock);
14020
14021 if (unmounting)
14022 return string();
14023
14024 return objecter->with_osdmap([pool](const OSDMap& o) {
14025 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14026 });
14027 }
14028
14029 int Client::get_pool_replication(int64_t pool)
14030 {
14031 std::lock_guard lock(client_lock);
14032
14033 if (unmounting)
14034 return -ENOTCONN;
14035
14036 return objecter->with_osdmap([pool](const OSDMap& o) {
14037 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
14038 });
14039 }
14040
// Find the acting OSDs for the object backing byte 'off' of open file fd.
// On success fills 'osds' and, when 'len' is non-null, *len with the
// number of bytes from 'off' to the end of its stripe unit.  Returns 0,
// -EBADF for a bad fd, -EINVAL if the PG has no acting OSDs, or
// -ENOTCONN while unmounting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map a 1-byte range at 'off' to exactly one object extent
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14086
14087 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14088 {
14089 std::lock_guard lock(client_lock);
14090
14091 if (unmounting)
14092 return -ENOTCONN;
14093
14094 if (id < 0)
14095 return -EINVAL;
14096 return objecter->with_osdmap([&](const OSDMap& o) {
14097 return o.crush->get_full_location_ordered(id, path);
14098 });
14099 }
14100
// Return the addresses of the acting OSDs for the object covering byte
// 'offset' of open file fd.  Returns 0 on success, -EBADF for a bad fd,
// -EINVAL if the PG has no acting OSDs, or -ENOTCONN while unmounting.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	// report each OSD's primary (front) address
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
14134
14135 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14136 {
14137 std::lock_guard lock(client_lock);
14138
14139 if (unmounting)
14140 return -ENOTCONN;
14141
14142 return objecter->with_osdmap([&](const OSDMap& o) {
14143 if (!o.exists(osd))
14144 return -ENOENT;
14145
14146 addr = o.get_addrs(osd).front();
14147 return 0;
14148 });
14149 }
14150
// Map byte range [offset, offset+length) of open file fd to its list of
// object extents.  Returns 0, -EBADF for a bad fd, or -ENOTCONN while
// unmounting.  Note the (length, offset) parameter order.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
14170
14171
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // cache the lookup per OSD map epoch; recompute only when the map changes
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
14188
14189
14190
14191
14192
14193
14194 // ===============================
14195
// Messenger hook: a connection we initiated has been established.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14200
// Messenger hook: the connection was reset.  Returning false indicates
// we did not take over handling of the connection here.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14206
14207 void Client::ms_handle_remote_reset(Connection *con)
14208 {
14209 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14210 std::lock_guard l(client_lock);
14211 switch (con->get_peer_type()) {
14212 case CEPH_ENTITY_TYPE_MDS:
14213 {
14214 // kludge to figure out which mds this is; fixme with a Connection* state
14215 mds_rank_t mds = MDS_RANK_NONE;
14216 MetaSession *s = NULL;
14217 for (auto &p : mds_sessions) {
14218 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14219 mds = p.first;
14220 s = &p.second;
14221 }
14222 }
14223 if (mds >= 0) {
14224 assert (s != NULL);
14225 switch (s->state) {
14226 case MetaSession::STATE_CLOSING:
14227 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14228 _closed_mds_session(s);
14229 break;
14230
14231 case MetaSession::STATE_OPENING:
14232 {
14233 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14234 list<Context*> waiters;
14235 waiters.swap(s->waiting_for_open);
14236 _closed_mds_session(s);
14237 MetaSession *news = _get_or_open_mds_session(mds);
14238 news->waiting_for_open.swap(waiters);
14239 }
14240 break;
14241
14242 case MetaSession::STATE_OPEN:
14243 {
14244 objecter->maybe_request_map(); /* to check if we are blacklisted */
14245 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
14246 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14247 _closed_mds_session(s);
14248 } else {
14249 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14250 s->state = MetaSession::STATE_STALE;
14251 }
14252 }
14253 break;
14254
14255 case MetaSession::STATE_NEW:
14256 case MetaSession::STATE_CLOSED:
14257 default:
14258 break;
14259 }
14260 }
14261 }
14262 break;
14263 }
14264 }
14265
// Messenger hook: the peer actively refused our connection attempt.
// Returning false indicates we did not take over handling here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14271
// Walk up the snap realm chain from 'in' and return the nearest cached
// ancestor inode with quota enabled; falls back to root_ancestor when no
// quota ancestor is found or an ancestor inode is not in the cache.
// NOTE(review): 'perms' is currently unused in this implementation.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14293
14294 /**
14295 * Traverse quota ancestors of the Inode, return true
14296 * if any of them passes the passed function
14297 */
14298 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14299 std::function<bool (const Inode &in)> test)
14300 {
14301 while (true) {
14302 ceph_assert(in != NULL);
14303 if (test(*in)) {
14304 return true;
14305 }
14306
14307 if (in == root_ancestor) {
14308 // We're done traversing, drop out
14309 return false;
14310 } else {
14311 // Continue up the tree
14312 in = get_quota_root(in, perms);
14313 }
14314 }
14315
14316 return false;
14317 }
14318
14319 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14320 {
14321 return check_quota_condition(in, perms,
14322 [](const Inode &in) {
14323 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14324 });
14325 }
14326
14327 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14328 const UserPerm& perms)
14329 {
14330 return check_quota_condition(in, perms,
14331 [&new_bytes](const Inode &in) {
14332 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14333 > in.quota.max_bytes;
14334 });
14335 }
14336
// True when the not-yet-reported dirty bytes on this inode are getting
// close to a byte quota on it or any quota ancestor ("close" meaning the
// remaining quota space divided by 16 is smaller than the unreported
// growth).  Used to trigger earlier size reporting while writing.
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  ceph_assert(in->size >= in->reported_size);
  const uint64_t size = in->size - in->reported_size;
  return check_quota_condition(in, perms,
      [&size](const Inode &in) {
	if (in.quota.max_bytes) {
	  if (in.rstat.rbytes >= in.quota.max_bytes) {
	    return true;
	  }

	  // compare unreported growth against 1/16 of the remaining space
	  const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
	  return (space >> 4) < size;
	} else {
	  return false;
	}
      });
}
14355
// State/permission bits cached per (pool, namespace) in pool_perms,
// used by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is currently in flight (others wait)
  POOL_READ = 4,      // client has read access to the pool
  POOL_WRITE = 8,     // client has write access to the pool
};
14362
// Verify this client's OSD-level access to the data pool backing inode
// 'in' (regular files only).  'need' is a CEPH_CAP_FILE_RD/WR mask.
// Results are cached per (pool id, namespace) in pool_perms; concurrent
// checkers wait on waiting_for_pool_perm while a probe is in flight.
// Returns 0 when access is sufficient, -EPERM when the needed access is
// denied, -EIO when the probe failed for some other reason.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // probe read access with a stat of the file's first object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // probe write access with an exclusive create (-EEXIST also proves it)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop the client lock while waiting for both OSD round trips
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14469
// Check 'want' access against the inode's cached POSIX access-ACL xattr.
// Returns posix_acl_permits()'s result when an ACL is present, or
// -EAGAIN when ACLs are disabled / absent (caller falls back to the
// regular mode-bit check).
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}
14481
// Rewrite the inode's access-ACL xattr to reflect a chmod to 'mode'.
// Returns 0 when ACLs are disabled or no access ACL exists, otherwise
// the result of refreshing the xattrs / rewriting the ACL.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // make sure cached xattrs are fresh before we modify the ACL
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy of the ACL buffer
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14507
// Compute the ACL xattrs for a new inode created under 'dir': apply the
// directory's default ACL (if any) to *mode and encode the resulting
// xattrs into xattrs_bl; when no default ACL applies, apply the umask
// callback (if registered) to *mode instead.  Returns the number of
// xattrs encoded (>= 0) or a negative errno.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // refresh the directory's cached xattrs before consulting its ACLs
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	// a non-trivial ACL becomes the child's access ACL
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14555
14556 void Client::set_filer_flags(int flags)
14557 {
14558 std::lock_guard l(client_lock);
14559 ceph_assert(flags == 0 ||
14560 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14561 objecter->add_global_op_flags(flags);
14562 }
14563
14564 void Client::clear_filer_flags(int flags)
14565 {
14566 std::lock_guard l(client_lock);
14567 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14568 objecter->clear_global_op_flag(flags);
14569 }
14570
14571 // called before mount
14572 void Client::set_uuid(const std::string& uuid)
14573 {
14574 std::lock_guard l(client_lock);
14575 assert(initialized);
14576 assert(!uuid.empty());
14577
14578 metadata["uuid"] = uuid;
14579 _close_sessions();
14580 }
14581
14582 // called before mount. 0 means infinite
14583 void Client::set_session_timeout(unsigned timeout)
14584 {
14585 std::lock_guard l(client_lock);
14586 assert(initialized);
14587
14588 metadata["timeout"] = stringify(timeout);
14589 }
14590
// called before mount
// Reclaim the MDS sessions of a dead client instance identified by
// 'uuid' (e.g. after an NFS-Ganesha failover).  Walks every in-map MDS
// rank, sends MClientReclaim and waits for replies; afterwards (unless
// CEPH_RECLAIM_RESET) verifies via the OSD map that the old session's
// addrs were actually blacklisted.  Returns 0 on success or a negative
// errno.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  // reclaiming our own uuid makes no sense
  {
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a valid mdsmap before we can walk the ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // note: 'mds' only advances once that rank's reclaim has completed
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // woken by handle_client_reclaim_reply()
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remembered until finish_reclaim() adopts it as our uuid
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14689
// Finish a reclaim started by start_reclaim().  If no reclaim is in
// progress, just reset per-session reclaim state; otherwise tell every
// MDS the reclaim is done and adopt the reclaimed uuid as our own.
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // take over the reclaimed session's uuid
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14708
// Handle the MDS reply to an MClientReclaim sent from start_reclaim().
// On success, records the target session's addrs and the OSD epoch we
// must reach; on failure, records the error in reclaim_errno.  Either
// way, wakes the waiter in start_reclaim().
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14733
14734 /**
14735 * This is included in cap release messages, to cause
14736 * the MDS to wait until this OSD map epoch. It is necessary
14737 * in corner cases where we cancel RADOS ops, so that
14738 * nobody else tries to do IO to the same objects in
14739 * the same epoch as the cancelled ops.
14740 */
14741 void Client::set_cap_epoch_barrier(epoch_t e)
14742 {
14743 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14744 cap_epoch_barrier = e;
14745 }
14746
// Config-observer interface: the option names we want change
// notifications for (delivered to handle_conf_change()).
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14759
// Config-observer interface: react to runtime changes of tracked options.
// NOTE(review): only client_cache_mid and client_acl_type are acted on
// here; the other tracked keys (cache size, delegation options) are
// presumably read on demand elsewhere — confirm before relying on live
// updates for them.
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}
14774
// boost::intrusive_ptr support: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14779
// boost::intrusive_ptr support: drop a reference, letting the owning
// client free the inode when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14784
14785 mds_rank_t Client::_get_random_up_mds() const
14786 {
14787 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14788
14789 std::set<mds_rank_t> up;
14790 mdsmap->get_up_mds_set(up);
14791
14792 if (up.empty())
14793 return MDS_RANK_NONE;
14794 std::set<mds_rank_t>::const_iterator p = up.begin();
14795 for (int n = rand() % up.size(); n; n--)
14796 ++p;
14797 return *p;
14798 }
14799
14800
// A StandaloneClient owns its own Objecter (constructed here and passed
// to the Client base), unlike embedded users that share one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, nullptr))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14807
// Tear down the Objecter this class created in its constructor.
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
14813
// Initialize a standalone client: bring up the objecter, register
// dispatchers, and start the monitor client.  On monclient failure the
// partially-initialized state is unwound and the error returned.
int StandaloneClient::init()
{
  _pre_init();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14843
// Shut down in reverse order of init(): client core first, then the
// objecter and monitor client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}