]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import 15.2.5
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
77
78 #define dout_subsys ceph_subsys_client
79
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
83
84 #include "Client.h"
85 #include "Inode.h"
86 #include "Dentry.h"
87 #include "Delegation.h"
88 #include "Dir.h"
89 #include "ClientSnapRealm.h"
90 #include "Fh.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
95
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
98
99 #include "include/cephfs/ceph_ll_client.h"
100
101 #if HAVE_GETGROUPLIST
102 #include <grp.h>
103 #include <pwd.h>
104 #include <unistd.h>
105 #endif
106
107 #undef dout_prefix
108 #define dout_prefix *_dout << "client." << whoami << " "
109
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112 // FreeBSD fails to define this
113 #ifndef O_DSYNC
114 #define O_DSYNC 0x0
115 #endif
116 // Darwin fails to define this
117 #ifndef O_RSYNC
118 #define O_RSYNC 0x0
119 #endif
120
121 #ifndef O_DIRECT
122 #define O_DIRECT 0x0
123 #endif
124
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
128 {
129 Client *client = static_cast<Client*>(p);
130 client->flush_set_callback(oset);
131 }
132
133
134 // -------------
135
136 Client::CommandHook::CommandHook(Client *client) :
137 m_client(client)
138 {
139 }
140
141 int Client::CommandHook::call(
142 std::string_view command,
143 const cmdmap_t& cmdmap,
144 Formatter *f,
145 std::ostream& errss,
146 bufferlist& out)
147 {
148 f->open_object_section("result");
149 {
150 std::lock_guard l{m_client->client_lock};
151 if (command == "mds_requests")
152 m_client->dump_mds_requests(f);
153 else if (command == "mds_sessions")
154 m_client->dump_mds_sessions(f);
155 else if (command == "dump_cache")
156 m_client->dump_cache(f);
157 else if (command == "kick_stale_sessions")
158 m_client->_kick_stale_sessions();
159 else if (command == "status")
160 m_client->dump_status(f);
161 else
162 ceph_abort_msg("bad command registered");
163 }
164 f->close_section();
165 return 0;
166 }
167
168
169 // -------------
170
// Fresh readdir cursor on inode 'in' with the caller's credentials.
// next_offset starts at 2: offsets below 2 appear reserved — readdir
// restarts at offset 2 (see insert_readdir_results) — TODO confirm.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
176
177 void Client::_reset_faked_inos()
178 {
179 ino_t start = 1024;
180 free_faked_inos.clear();
181 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
182 last_used_faked_ino = 0;
183 last_used_faked_root = 0;
184 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
185 }
186
// Allocate the next free faked inode number for 'in', record the
// reverse mapping, and remove the id from the free pool.  The cursor
// (last_used_faked_ino) scans upward through free_faked_inos and wraps
// around when it runs off the top.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran off the top of the free set: wrap back to just above the
    // reserved root range and search again
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell into a gap; jump to the start of the next free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside a free interval; hand out the next id
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
208
209 /*
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10bits(0x3ff) of the "root inodes".
215 */
216 void Client::_assign_faked_root(Inode *in)
217 {
218 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
219 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
220 last_used_faked_root = 0;
221 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
222 }
223 assert(it != free_faked_inos.end());
224 vinodeno_t inode_info = in->vino();
225 uint64_t inode_num = (uint64_t)inode_info.ino;
226 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
227 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it.get_start() + it.get_len() > last_used_faked_root);
229
230 in->faked_ino = last_used_faked_root;
231 free_faked_inos.erase(in->faked_ino);
232 faked_ino_map[in->faked_ino] = in->vino();
233 }
234
235 void Client::_release_faked_ino(Inode *in)
236 {
237 free_faked_inos.insert(in->faked_ino);
238 faked_ino_map.erase(in->faked_ino);
239 }
240
241 vinodeno_t Client::_map_faked_ino(ino_t ino)
242 {
243 vinodeno_t vino;
244 if (ino == 1)
245 vino = root->vino();
246 else if (faked_ino_map.count(ino))
247 vino = faked_ino_map[ino];
248 else
249 vino = vinodeno_t(0, CEPH_NOSNAP);
250 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
251 return vino;
252 }
253
254 vinodeno_t Client::map_faked_ino(ino_t ino)
255 {
256 std::lock_guard lock(client_lock);
257 return _map_faked_ino(ino);
258 }
259
260 // cons/des
261
// Construct an unmounted client: wire up the messenger/monitor/objecter
// plumbing, finishers, and the writeback object cache.  No MDS contact
// happens here.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // identity / permission-model knobs from configuration
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: hand out fd numbers starting at 10
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: writeback handler feeds the object cache, which
  // shares client_lock and reports commits via client_flush_set_callback
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
308
309
310 Client::~Client()
311 {
312 ceph_assert(ceph_mutex_is_not_locked(client_lock));
313
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 std::lock_guard l{client_lock};
318 tear_down_cache();
319 }
320
321 void Client::tear_down_cache()
322 {
323 // fd's
324 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
325 it != fd_map.end();
326 ++it) {
327 Fh *fh = it->second;
328 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
329 _release_fh(fh);
330 }
331 fd_map.clear();
332
333 while (!opened_dirs.empty()) {
334 dir_result_t *dirp = *opened_dirs.begin();
335 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
336 _closedir(dirp);
337 }
338
339 // caps!
340 // *** FIXME ***
341
342 // empty lru
343 trim_cache();
344 ceph_assert(lru.lru_get_size() == 0);
345
346 // close root ino
347 ceph_assert(inode_map.size() <= 1 + root_parents.size());
348 if (root && inode_map.size() == 1 + root_parents.size()) {
349 delete root;
350 root = 0;
351 root_ancestor = 0;
352 while (!root_parents.empty())
353 root_parents.erase(root_parents.begin());
354 inode_map.clear();
355 _reset_faked_inos();
356 }
357
358 ceph_assert(inode_map.empty());
359 }
360
361 inodeno_t Client::get_root_ino()
362 {
363 std::lock_guard l(client_lock);
364 if (use_faked_inos())
365 return root->faked_ino;
366 else
367 return root->ino;
368 }
369
370 Inode *Client::get_root()
371 {
372 std::lock_guard l(client_lock);
373 root->ll_get();
374 return root;
375 }
376
377
378 // debug crapola
379
380 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
381 {
382 filepath path;
383 in->make_long_path(path);
384 ldout(cct, 1) << "dump_inode: "
385 << (disconnected ? "DISCONNECTED ":"")
386 << "inode " << in->ino
387 << " " << path
388 << " ref " << in->get_num_ref()
389 << *in << dendl;
390
391 if (f) {
392 f->open_object_section("inode");
393 f->dump_stream("path") << path;
394 if (disconnected)
395 f->dump_int("disconnected", 1);
396 in->dump(f);
397 f->close_section();
398 }
399
400 did.insert(in);
401 if (in->dir) {
402 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
403 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
404 it != in->dir->dentries.end();
405 ++it) {
406 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
407 if (f) {
408 f->open_object_section("dentry");
409 it->second->dump(f);
410 f->close_section();
411 }
412 if (it->second->inode)
413 dump_inode(f, it->second->inode.get(), did, false);
414 }
415 }
416 }
417
418 void Client::dump_cache(Formatter *f)
419 {
420 set<Inode*> did;
421
422 ldout(cct, 1) << __func__ << dendl;
423
424 if (f)
425 f->open_array_section("cache");
426
427 if (root)
428 dump_inode(f, root, did, true);
429
430 // make a second pass to catch anything disconnected
431 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
432 it != inode_map.end();
433 ++it) {
434 if (did.count(it->second))
435 continue;
436 dump_inode(f, it->second, did, true);
437 }
438
439 if (f)
440 f->close_section();
441 }
442
443 void Client::dump_status(Formatter *f)
444 {
445 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
446
447 ldout(cct, 1) << __func__ << dendl;
448
449 const epoch_t osd_epoch
450 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
451
452 if (f) {
453 f->open_object_section("metadata");
454 for (const auto& kv : metadata)
455 f->dump_string(kv.first.c_str(), kv.second);
456 f->close_section();
457
458 f->dump_int("dentry_count", lru.lru_get_size());
459 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
460 f->dump_int("id", get_nodeid().v);
461 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
462 f->dump_object("inst", inst);
463 f->dump_object("addr", inst.addr);
464 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
465 f->dump_string("addr_str", inst.addr.get_legacy_str());
466 f->dump_int("inode_count", inode_map.size());
467 f->dump_int("mds_epoch", mdsmap->get_epoch());
468 f->dump_int("osd_epoch", osd_epoch);
469 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
470 f->dump_bool("blacklisted", blacklisted);
471 }
472 }
473
// Start the infrastructure init() depends on: the safe timer, the
// objecter finisher thread (which the Filer also uses), blacklist
// event delivery, and the object cache's threads.
void Client::_pre_init()
{
  timer.init();

  // finisher must be running before the Filer is built on top of it
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}
484
// Bring the client up: start timers/finishers (_pre_init), register as
// a message dispatcher, then register perf counters and admin socket
// commands (_finish_init).  Returns 0.
int Client::init()
{
  _pre_init();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}
496
497 void Client::_finish_init()
498 {
499 {
500 std::lock_guard l{client_lock};
501 // logger
502 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
503 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
504 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
505 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
506 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
507 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
508 logger.reset(plb.create_perf_counters());
509 cct->get_perfcounters_collection()->add(logger.get());
510 }
511
512 cct->_conf.add_observer(this);
513
514 AdminSocket* admin_socket = cct->get_admin_socket();
515 int ret = admin_socket->register_command("mds_requests",
516 &m_command_hook,
517 "show in-progress mds requests");
518 if (ret < 0) {
519 lderr(cct) << "error registering admin socket command: "
520 << cpp_strerror(-ret) << dendl;
521 }
522 ret = admin_socket->register_command("mds_sessions",
523 &m_command_hook,
524 "show mds session state");
525 if (ret < 0) {
526 lderr(cct) << "error registering admin socket command: "
527 << cpp_strerror(-ret) << dendl;
528 }
529 ret = admin_socket->register_command("dump_cache",
530 &m_command_hook,
531 "show in-memory metadata cache contents");
532 if (ret < 0) {
533 lderr(cct) << "error registering admin socket command: "
534 << cpp_strerror(-ret) << dendl;
535 }
536 ret = admin_socket->register_command("kick_stale_sessions",
537 &m_command_hook,
538 "kick sessions that were remote reset");
539 if (ret < 0) {
540 lderr(cct) << "error registering admin socket command: "
541 << cpp_strerror(-ret) << dendl;
542 }
543 ret = admin_socket->register_command("status",
544 &m_command_hook,
545 "show overall client status");
546 if (ret < 0) {
547 lderr(cct) << "error registering admin socket command: "
548 << cpp_strerror(-ret) << dendl;
549 }
550
551 std::lock_guard l{client_lock};
552 initialized = true;
553 }
554
// Tear the client down in the reverse of startup order: close MDS
// sessions, drop config/asok registrations, drain and stop each
// callback finisher that was started, stop the object cache, the
// timer, the objecter finisher, and finally remove the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // each finisher was only started if its callback was registered, so
  // only stop the ones whose callback is set; wait_for_empty() drains
  // queued work before stop()
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
614
615
616 // ===================
617 // metadata cache stuff
618
// Shrink the dentry LRU toward client_cache_size entries (all the way
// to zero while unmounting).  Optionally asks the kernel to drop its
// dcache too if we could not get under the limit, and frees the root
// inode once the cache is completely empty and unreferenced.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // iterate until a pass makes no progress (trim_dentry may change the
  // lru size by more than one entry)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  // still over the limit: our dentries are pinned by the kernel's
  // dcache, so ask it to let go
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
652
653 void Client::trim_cache_for_reconnect(MetaSession *s)
654 {
655 mds_rank_t mds = s->mds_num;
656 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
657
658 int trimmed = 0;
659 list<Dentry*> skipped;
660 while (lru.lru_get_size() > 0) {
661 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
662 if (!dn)
663 break;
664
665 if ((dn->inode && dn->inode->caps.count(mds)) ||
666 dn->dir->parent_inode->caps.count(mds)) {
667 trim_dentry(dn);
668 trimmed++;
669 } else
670 skipped.push_back(dn);
671 }
672
673 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
674 lru.lru_insert_mid(*p);
675
676 ldout(cct, 20) << __func__ << " mds." << mds
677 << " trimmed " << trimmed << " dentries" << dendl;
678
679 if (s->caps.size() > 0)
680 _invalidate_kernel_dcache();
681 }
682
683 void Client::trim_dentry(Dentry *dn)
684 {
685 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
686 << " in dir "
687 << std::hex << dn->dir->parent_inode->ino << std::dec
688 << dendl;
689 if (dn->inode) {
690 Inode *diri = dn->dir->parent_inode;
691 diri->dir_release_count++;
692 clear_dir_complete_and_ordered(diri, true);
693 }
694 unlink(dn, false, false); // drop dir, drop dentry
695 }
696
697
// Apply mds-reported size/truncation state to a cached inode.
// truncate_seq ordering decides who wins: a strictly newer seq (or the
// same seq with a larger size) replaces our size, drops cached file
// data past the new truncation point, and trims cached inline data.
// 'issued' (the caps we hold) is currently unused here.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change without the seq advancing; only files
  // track it (warn otherwise)
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
739
740 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
741 utime_t ctime, utime_t mtime, utime_t atime)
742 {
743 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
744 << " ctime " << ctime << " mtime " << mtime << dendl;
745
746 if (time_warp_seq > in->time_warp_seq)
747 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
748 << " is higher than local time_warp_seq "
749 << in->time_warp_seq << dendl;
750
751 int warn = false;
752 // be careful with size, mtime, atime
753 if (issued & (CEPH_CAP_FILE_EXCL|
754 CEPH_CAP_FILE_WR|
755 CEPH_CAP_FILE_BUFFER|
756 CEPH_CAP_AUTH_EXCL|
757 CEPH_CAP_XATTR_EXCL)) {
758 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
759 if (ctime > in->ctime)
760 in->ctime = ctime;
761 if (time_warp_seq > in->time_warp_seq) {
762 //the mds updated times, so take those!
763 in->mtime = mtime;
764 in->atime = atime;
765 in->time_warp_seq = time_warp_seq;
766 } else if (time_warp_seq == in->time_warp_seq) {
767 //take max times
768 if (mtime > in->mtime)
769 in->mtime = mtime;
770 if (atime > in->atime)
771 in->atime = atime;
772 } else if (issued & CEPH_CAP_FILE_EXCL) {
773 //ignore mds values as we have a higher seq
774 } else warn = true;
775 } else {
776 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
777 if (time_warp_seq >= in->time_warp_seq) {
778 in->ctime = ctime;
779 in->mtime = mtime;
780 in->atime = atime;
781 in->time_warp_seq = time_warp_seq;
782 } else warn = true;
783 }
784 if (warn) {
785 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
786 << time_warp_seq << " is lower than local time_warp_seq "
787 << in->time_warp_seq
788 << dendl;
789 }
790 }
791
792 void Client::_fragmap_remove_non_leaves(Inode *in)
793 {
794 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
795 if (!in->dirfragtree.is_leaf(p->first))
796 in->fragmap.erase(p++);
797 else
798 ++p;
799 }
800
801 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
802 {
803 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
804 if (p->second == mds)
805 in->fragmap.erase(p++);
806 else
807 ++p;
808 }
809
/*
 * Create or refresh the cached Inode for an InodeStat from an MDS
 * reply.  Fields are taken from the mds only when the reply is
 * strictly newer than our cached version, or when we do not hold caps
 * that make our local copy authoritative.  Also registers/updates the
 * cap granted with the reply.  Returns the (possibly newly created)
 * inode.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode we ever cache becomes the mount root (and cwd)
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // not yet mounted: record this inode as a parent of the current
      // root ancestor and make it the new ancestor
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we already hold (including dirty ones); anything in the reply
  // beyond that is newly issued
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  // auth metadata: trust the mds unless we hold AUTH_EXCL ourselves
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  // xattrs: take the mds copy unless we hold XATTR_EXCL with a
  // non-zero local version
  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inode: no live cap, just accumulate the granted bits
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
965
966
967 /*
968 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
969 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * dir:        parent directory's Dir object
 * dname:      dentry name within dir
 * dlease:     mds-supplied dentry lease, applied via update_dentry_lease()
 * in:         inode the dentry should point at
 * old_dentry: optional previous dentry for this inode that should no
 *             longer point at it; it is unlinked first
 *
 * Returns the (existing or newly linked) Dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // dentry already points at the right inode; just refresh LRU
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale dentry pointing at a different inode: detach it, keeping
      // the Dentry object for relinking below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // pin 'in' so it cannot be freed while we relink it
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// leaving the old directory invalidates its ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1015
1016 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1017 {
1018 utime_t dttl = from;
1019 dttl += (float)dlease->duration_ms / 1000.0;
1020
1021 ceph_assert(dn);
1022
1023 if (dlease->mask & CEPH_LEASE_VALID) {
1024 if (dttl > dn->lease_ttl) {
1025 ldout(cct, 10) << "got dentry lease on " << dn->name
1026 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1027 dn->lease_ttl = dttl;
1028 dn->lease_mds = session->mds_num;
1029 dn->lease_seq = dlease->seq;
1030 dn->lease_gen = session->cap_gen;
1031 }
1032 }
1033 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1034 }
1035
1036
1037 /*
1038 * update MDS location cache for a single inode
1039 */
/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: remember (or forget, when auth < 0) which mds owns this frag
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  // the mds reported a frag that isn't a leaf of our cached fragtree:
  // force it to be one and drop now-invalid fragmap entries
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1057
1058 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1059 {
1060 if (diri->flags & I_COMPLETE) {
1061 if (complete) {
1062 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1063 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1064 } else {
1065 if (diri->flags & I_DIR_ORDERED) {
1066 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1067 diri->flags &= ~I_DIR_ORDERED;
1068 }
1069 }
1070 if (diri->dir)
1071 diri->dir->readdir_cache.clear();
1072 }
1073 }
1074
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra buffer (DirStat, count, flags, then numdn
 * (dname, LeaseStat, InodeStat) triples), links/updates a Dentry+Inode for
 * each entry, assigns readdir offsets, and fills both the per-dirp result
 * buffer and (when still valid) the shared per-Dir readdir cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // with CEPHFS_FEATURE_REPLY_ENCODING the payload is versioned-encoded,
  // signalled by the all-ones feature mask; otherwise fall back to the
  // connection's negotiated feature bits
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?  lssnap results are cached under the .snap pseudo-directory
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // the frag we asked for, and where within it this chunk resumes
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the start of a frag (0/1 are . and ..), so an empty
    // resume name implies we are at the beginning
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have answered for a different (e.g. refragmented) frag
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // starting a fresh scan from the very beginning of the directory:
    // snapshot the dir's generation counters so we can later tell whether
    // the readdir cache we build stays valid
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        // in hash order, the per-name counter restarts whenever the hash
        // value changes
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only while our generation snapshot still
      // matches the directory (i.e. no intervening modification)
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where the next chunk resumes
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1236
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * The trace optionally carries (dir inode, dir stat, dentry name, dentry
 * lease) followed by the target inode's stat.  Updates or creates the
 * corresponding cached Inode/Dentry objects and returns the target Inode
 * (NULL for traceless or already-unsafe replies).
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already updated the cache; the safe reply's
    // trace must be empty
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot tell what changed, so conservatively
    // invalidate the affected directory's completeness
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // all-ones means "versioned encoding" (CEPHFS_FEATURE_REPLY_ENCODING);
  // otherwise decode against the connection's negotiated features
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // trace layout: [dir inode, dir stat, dentry name, dentry lease]? [target inode]?
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // sanity check: if we asked for xattrs the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry.  Drop any stale
      // positive dentry and (if the MDS granted a lease) cache the
      // negative one.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1412
1413 // -------
1414
/*
 * Pick the MDS rank to send a request to.  Preference order: an explicitly
 * requested resend_mds, then the dirfrag map / caps of the request's inode
 * (or dentry's inode/parent), then a random active MDS.  If the choice came
 * from a dirfrag hash, *phash_diri is set to the directory inode used.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit target (forwarded or retried request) wins
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component within the request's base inode
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
               << " on " << req->path[0]
               << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash its name within the parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
               << " on " << de->name
               << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes aren't authoritatively placed; walk up to the nearest
    // non-snap ancestor and target its MDS instead
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
             << " hash=" << hash << dendl;

    // map hash -> dirfrag -> mds using the cached fragment map
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // fall back to the MDS we hold caps from (auth cap preferred when the
    // request must go to the auth MDS)
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1517
1518
1519 void Client::connect_mds_targets(mds_rank_t mds)
1520 {
1521 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1522 ceph_assert(mds_sessions.count(mds));
1523 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1524 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1525 q != info.export_targets.end();
1526 ++q) {
1527 if (mds_sessions.count(*q) == 0 &&
1528 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1529 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1530 << " export target mds." << *q << dendl;
1531 _open_mds_session(*q);
1532 }
1533 }
1534 }
1535
1536 void Client::dump_mds_sessions(Formatter *f)
1537 {
1538 f->dump_int("id", get_nodeid().v);
1539 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1540 f->dump_object("inst", inst);
1541 f->dump_stream("inst_str") << inst;
1542 f->dump_stream("addr_str") << inst.addr;
1543 f->open_array_section("sessions");
1544 for (const auto &p : mds_sessions) {
1545 f->open_object_section("session");
1546 p.second.dump(f);
1547 f->close_section();
1548 }
1549 f->close_section();
1550 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1551 }
1552 void Client::dump_mds_requests(Formatter *f)
1553 {
1554 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1555 p != mds_requests.end();
1556 ++p) {
1557 f->open_object_section("request");
1558 p->second->dump(f);
1559 f->close_section();
1560 }
1561 }
1562
/*
 * After a reply, make sure *ptarget points at the inode the caller asked
 * to create/operate on, and set *pcreated if this request performed the
 * create.  For traceless replies the target is recovered with a follow-up
 * lookup (by name) or getattr; if the recovered inode does not match the
 * created ino reported by the MDS, -EINTR is returned.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: insert_trace already resolved the target
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // traceless reply: try the created ino in our inode map first
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        // no dentry either; refresh the base inode's attributes instead
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1647
1648
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * NOTE(review): appears to require client_lock to be held by the caller
 * (the condvar wait below adopts it) — confirm against call sites.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests are excluded from oldest-tid tracking
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, wait; loop again on
  // forward/kick, exit on reply or abort
  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the chosen rank no longer exists; drop the stale mapping and retry
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
        request->abort(-EPERM);
        break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // temporarily adopt client_lock into a unique_lock so the condvar can
    // release/reacquire it; release() afterwards so we keep holding it
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||           // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply arrived
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1813
1814 void Client::unregister_request(MetaRequest *req)
1815 {
1816 mds_requests.erase(req->tid);
1817 if (req->tid == oldest_tid) {
1818 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1819 while (true) {
1820 if (p == mds_requests.end()) {
1821 oldest_tid = 0;
1822 break;
1823 }
1824 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1825 oldest_tid = p->first;
1826 break;
1827 }
1828 ++p;
1829 }
1830 }
1831 put_request(req);
1832 }
1833
1834 void Client::put_request(MetaRequest *request)
1835 {
1836 if (request->_put()) {
1837 int op = -1;
1838 if (request->success)
1839 op = request->get_op();
1840 InodeRef other_in;
1841 request->take_other_inode(&other_in);
1842 delete request;
1843
1844 if (other_in &&
1845 (op == CEPH_MDS_OP_RMDIR ||
1846 op == CEPH_MDS_OP_RENAME ||
1847 op == CEPH_MDS_OP_RMSNAP)) {
1848 _try_to_trim_inode(other_in.get(), false);
1849 }
1850 }
1851 }
1852
/*
 * Possibly append a cap release for this inode to req->cap_releases.
 *
 * Drops the `drop` cap bits held from `mds` (unless any `unless` bit is
 * issued, or the bits are dirty/in use); with `force`, a release record is
 * appended even when no bits are actually dropped.  Returns nonzero iff a
 * release record was appended.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop bits that are dirty or currently in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // build the wire-format release record from the (possibly updated) cap
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
1898
1899 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1900 mds_rank_t mds, int drop, int unless)
1901 {
1902 ldout(cct, 20) << __func__ << " enter(dn:"
1903 << dn << ")" << dendl;
1904 int released = 0;
1905 if (dn->dir)
1906 released = encode_inode_release(dn->dir->parent_inode, req,
1907 mds, drop, unless, 1);
1908 if (released && dn->lease_mds == mds) {
1909 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1910 auto& rel = req->cap_releases.back();
1911 rel.item.dname_len = dn->name.length();
1912 rel.item.dname_seq = dn->lease_seq;
1913 rel.dname = dn->name;
1914 }
1915 ldout(cct, 25) << __func__ << " exit(dn:"
1916 << dn << ")" << dendl;
1917 }
1918
1919
1920 /*
1921 * This requires the MClientRequest *request member to be set.
1922 * It will error out horribly without one.
1923 * Additionally, if you set any *drop member, you'd better have
1924 * set the corresponding dentry!
1925 */
1926 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1927 {
1928 ldout(cct, 20) << __func__ << " enter (req: "
1929 << req << ", mds: " << mds << ")" << dendl;
1930 if (req->inode_drop && req->inode())
1931 encode_inode_release(req->inode(), req,
1932 mds, req->inode_drop,
1933 req->inode_unless);
1934
1935 if (req->old_inode_drop && req->old_inode())
1936 encode_inode_release(req->old_inode(), req,
1937 mds, req->old_inode_drop,
1938 req->old_inode_unless);
1939 if (req->other_inode_drop && req->other_inode())
1940 encode_inode_release(req->other_inode(), req,
1941 mds, req->other_inode_drop,
1942 req->other_inode_unless);
1943
1944 if (req->dentry_drop && req->dentry())
1945 encode_dentry_release(req->dentry(), req,
1946 mds, req->dentry_drop,
1947 req->dentry_unless);
1948
1949 if (req->old_dentry_drop && req->old_dentry())
1950 encode_dentry_release(req->old_dentry(), req,
1951 mds, req->old_dentry_drop,
1952 req->old_dentry_unless);
1953 ldout(cct, 25) << __func__ << " exit (req: "
1954 << req << ", mds " << mds <<dendl;
1955 }
1956
1957 bool Client::have_open_session(mds_rank_t mds)
1958 {
1959 const auto &it = mds_sessions.find(mds);
1960 return it != mds_sessions.end() &&
1961 (it->second.state == MetaSession::STATE_OPEN ||
1962 it->second.state == MetaSession::STATE_STALE);
1963 }
1964
1965 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1966 {
1967 const auto &it = mds_sessions.find(mds);
1968 if (it == mds_sessions.end() || it->second.con != con) {
1969 return NULL;
1970 } else {
1971 return &it->second;
1972 }
1973 }
1974
1975 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1976 {
1977 auto it = mds_sessions.find(mds);
1978 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1979 }
1980
1981 /**
1982 * Populate a map of strings with client-identifying metadata,
1983 * such as the hostname. Call this once at initialization.
1984 */
1985 void Client::populate_metadata(const std::string &mount_root)
1986 {
1987 // Hostname
1988 struct utsname u;
1989 int r = uname(&u);
1990 if (r >= 0) {
1991 metadata["hostname"] = u.nodename;
1992 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1993 } else {
1994 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1995 }
1996
1997 metadata["pid"] = stringify(getpid());
1998
1999 // Ceph entity id (the '0' in "client.0")
2000 metadata["entity_id"] = cct->_conf->name.get_id();
2001
2002 // Our mount position
2003 if (!mount_root.empty()) {
2004 metadata["root"] = mount_root;
2005 }
2006
2007 // Ceph version
2008 metadata["ceph_version"] = pretty_version_to_str();
2009 metadata["ceph_sha1"] = git_version_to_str();
2010
2011 // Apply any metadata from the user's configured overrides
2012 std::vector<std::string> tokens;
2013 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2014 for (const auto &i : tokens) {
2015 auto eqpos = i.find("=");
2016 // Throw out anything that isn't of the form "<str>=<str>"
2017 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2018 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2019 continue;
2020 }
2021 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2022 }
2023 }
2024
2025 /**
2026 * Optionally add or override client metadata fields.
2027 */
2028 void Client::update_metadata(std::string const &k, std::string const &v)
2029 {
2030 std::lock_guard l(client_lock);
2031 ceph_assert(initialized);
2032
2033 auto it = metadata.find(k);
2034 if (it != metadata.end()) {
2035 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2036 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2037 }
2038
2039 metadata[k] = v;
2040 }
2041
/*
 * Create a new MetaSession for mds (must not already exist), connect to it
 * and send the session-open request.  Returns the new session, whose state
 * remains OPENING until the MDS replies.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // construct the MetaSession in place in the map (piecewise so the
  // session is never copied/moved)
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // send our metadata and supported-feature bits with the open request
  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2058
2059 void Client::_close_mds_session(MetaSession *s)
2060 {
2061 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2062 s->state = MetaSession::STATE_CLOSING;
2063 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2064 }
2065
/*
 * Tear down a session after it has closed (or been rejected): mark the
 * state, drop the connection, wake any waiters, release the session's caps
 * (failing them with `err`) and kick its pending requests.  REJECTED
 * sessions are kept in the map (so the rejection is remembered); CLOSED
 * ones are erased.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // a rejection that raced with our own close request still counts as closed
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // note: erasing invalidates s, so this must be the last use of it
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2082
// Dispatch an incoming MClientSession message for an existing session.
// Implements the client side of the session state machine: open/close
// acknowledgements, cap renewal, staleness, cap recall, message flush,
// force-read-only and rejection.  Messages from an MDS we have no
// session with are logged and dropped.
2083 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2084 {
2085 mds_rank_t from = mds_rank_t(m->get_source().num());
2086 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2087
2088 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2089 if (!session) {
2090 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2091 return;
2092 }
2093
2094 switch (m->get_op()) {
// MDS accepted our open: verify it supports every feature we require
// before moving to STATE_OPEN; otherwise close the session immediately.
2095 case CEPH_SESSION_OPEN:
2096 {
2097 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2098 missing_features -= m->supported_features;
2099 if (!missing_features.empty()) {
2100 lderr(cct) << "mds." << from << " lacks required features '"
2101 << missing_features << "', closing session " << dendl;
2102 _close_mds_session(session);
2103 _closed_mds_session(session, -EPERM, true);
2104 break;
2105 }
2106 session->mds_features = std::move(m->supported_features);
2107
2108 renew_caps(session);
2109 session->state = MetaSession::STATE_OPEN;
2110 if (unmounting)
2111 mount_cond.notify_all();
2112 else
2113 connect_mds_targets(from);
2114 signal_context_list(session->waiting_for_open);
2115 break;
2116 }
2117
2118 case CEPH_SESSION_CLOSE:
2119 _closed_mds_session(session);
2120 break;
2121
// Renewal ack: extend cap_ttl only if it answers our latest renewal
// request; if the caps had already gone stale in the meantime, wake
// waiters now that they are valid again.
2122 case CEPH_SESSION_RENEWCAPS:
2123 if (session->cap_renew_seq == m->get_seq()) {
2124 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2125 session->cap_ttl =
2126 session->last_cap_renew_request + mdsmap->get_session_timeout();
2127 if (was_stale)
2128 wake_up_session_caps(session, false);
2129 }
2130 break;
2131
2132 case CEPH_SESSION_STALE:
2133 // invalidate session caps/leases
2134 session->cap_gen++;
2135 session->cap_ttl = ceph_clock_now();
2136 session->cap_ttl -= 1;
2137 renew_caps(session);
2138 break;
2139
2140 case CEPH_SESSION_RECALL_STATE:
2141 trim_caps(session, m->get_max_caps());
2142 break;
2143
2144 case CEPH_SESSION_FLUSHMSG:
2145 /* flush cap release */
// NOTE: this inner 'm' intentionally shadows the message ref; it drains
// the session's pending cap-release message (if any) before we ack.
2146 if (auto& m = session->release; m) {
2147 session->con->send_message2(std::move(m));
2148 }
2149 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2150 break;
2151
2152 case CEPH_SESSION_FORCE_RO:
2153 force_session_readonly(session);
2154 break;
2155
// MDS refused the session; report its error_string (when provided) and
// tear down locally, keeping the session visible in STATE_REJECTED.
2156 case CEPH_SESSION_REJECT:
2157 {
2158 std::string_view error_str;
2159 auto it = m->metadata.find("error_string");
2160 if (it != m->metadata.end())
2161 error_str = it->second;
2162 else
2163 error_str = "unknown error";
2164 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2165
2166 _closed_mds_session(session, -EPERM, true);
2167 }
2168 break;
2169
2170 default:
2171 ceph_abort();
2172 }
2173 }
2174
2175 bool Client::_any_stale_sessions() const
2176 {
2177 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2178
2179 for (const auto &p : mds_sessions) {
2180 if (p.second.state == MetaSession::STATE_STALE) {
2181 return true;
2182 }
2183 }
2184
2185 return false;
2186 }
2187
2188 void Client::_kick_stale_sessions()
2189 {
2190 ldout(cct, 1) << __func__ << dendl;
2191
2192 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2193 MetaSession &s = it->second;
2194 if (s.state == MetaSession::STATE_REJECTED) {
2195 mds_sessions.erase(it++);
2196 continue;
2197 }
2198 ++it;
2199 if (s.state == MetaSession::STATE_STALE)
2200 _closed_mds_session(&s);
2201 }
2202 }
2203
// (Re)build the wire message for 'request' and send it to 'session'.
// Replayed ops (ones that already got an unsafe reply) are flagged so the
// MDS can match them against its journal; fresh ops carry our pending cap
// releases unless drop_cap_releases is set (used while cap reconnect has
// not been sent yet).  Also records which mds/mseq the request went to,
// which later ESTALE handling relies on.
2204 void Client::send_request(MetaRequest *request, MetaSession *session,
2205 bool drop_cap_releases)
2206 {
2207 // make the request
2208 mds_rank_t mds = session->mds_num;
2209 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2210 << " for mds." << mds << dendl;
2211 auto r = build_client_request(request);
2212 if (request->dentry()) {
2213 r->set_dentry_wanted();
2214 }
2215 if (request->got_unsafe) {
2216 r->set_replayed_op();
2217 if (request->target)
2218 r->head.ino = request->target->ino;
2219 } else {
2220 encode_cap_releases(request, mds);
2221 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2222 request->cap_releases.clear();
2223 else
2224 r->releases.swap(request->cap_releases);
2225 }
2226 r->set_mdsmap_epoch(mdsmap->get_epoch());
// setxattr can change the file layout, so the MDS needs to know which
// osdmap epoch the client based the layout on.
2227 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2228 objecter->with_osdmap([r](const OSDMap& o) {
2229 r->set_osdmap_epoch(o.get_epoch());
2230 });
2231 }
2232
// Only stamp the first transmission; retries keep the original send time.
2233 if (request->mds == -1) {
2234 request->sent_stamp = ceph_clock_now();
2235 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2236 }
2237 request->mds = mds;
2238
// Remember the migration seq of the cap we hold from this MDS so an
// ESTALE reply can tell whether the cap has since migrated.
2239 Inode *in = request->inode();
2240 if (in) {
2241 auto it = in->caps.find(mds);
2242 if (it != in->caps.end()) {
2243 request->sent_on_mseq = it->second.mseq;
2244 }
2245 }
2246
2247 session->requests.push_back(&request->item);
2248
2249 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2250 session->con->send_message2(std::move(r));
2251 }
2252
// Translate a MetaRequest into an MClientRequest wire message: copy the
// raw request head, fill in the filepath from the request's inode or
// dentry when the caller did not set one, attach data/gids, and bump the
// retry counter (the message carries the pre-increment attempt number).
2253 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2254 {
2255 auto req = make_message<MClientRequest>(request->get_op());
2256 req->set_tid(request->tid);
2257 req->set_stamp(request->op_stamp);
2258 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2259
2260 // if the filepath's haven't been set, set them!
// Fallback order: inode path, then the dentry's inode, then the dentry's
// parent dir plus the dentry name; otherwise warn and send an empty path.
2261 if (request->path.empty()) {
2262 Inode *in = request->inode();
2263 Dentry *de = request->dentry();
2264 if (in)
2265 in->make_nosnap_relative_path(request->path);
2266 else if (de) {
2267 if (de->inode)
2268 de->inode->make_nosnap_relative_path(request->path);
2269 else if (de->dir) {
2270 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2271 request->path.push_dentry(de->name);
2272 }
2273 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2274 << " No path, inode, or appropriately-endowed dentry given!"
2275 << dendl;
2276 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2277 << " No path, inode, or dentry given!"
2278 << dendl;
2279 }
2280 req->set_filepath(request->get_filepath());
2281 req->set_filepath2(request->get_filepath2());
2282 req->set_data(request->data);
2283 req->set_retry_attempt(request->retry_attempt++);
2284 req->head.num_fwd = request->num_fwd;
2285 const gid_t *_gids;
2286 int gid_count = request->perms.get_gids(&_gids);
2287 req->set_gid_list(gid_count, _gids);
2288 return req;
2289 }
2290
2291
2292
2293 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2294 {
2295 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2296 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2297 if (!session) {
2298 return;
2299 }
2300 ceph_tid_t tid = fwd->get_tid();
2301
2302 if (mds_requests.count(tid) == 0) {
2303 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2304 return;
2305 }
2306
2307 MetaRequest *request = mds_requests[tid];
2308 ceph_assert(request);
2309
2310 // reset retry counter
2311 request->retry_attempt = 0;
2312
2313 // request not forwarded, or dest mds has no session.
2314 // resend.
2315 ldout(cct, 10) << __func__ << " tid " << tid
2316 << " fwd " << fwd->get_num_fwd()
2317 << " to mds." << fwd->get_dest_mds()
2318 << ", resending to " << fwd->get_dest_mds()
2319 << dendl;
2320
2321 request->mds = -1;
2322 request->item.remove_myself();
2323 request->num_fwd = fwd->get_num_fwd();
2324 request->resend_mds = fwd->get_dest_mds();
2325 request->caller_cond->notify_all();
2326 }
2327
2328 bool Client::is_dir_operation(MetaRequest *req)
2329 {
2330 int op = req->get_op();
2331 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2332 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2333 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2334 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2335 return true;
2336 return false;
2337 }
2338
2339 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2340 {
2341 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2342 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2343 if (!session) {
2344 return;
2345 }
2346
2347 ceph_tid_t tid = reply->get_tid();
2348 bool is_safe = reply->is_safe();
2349
2350 if (mds_requests.count(tid) == 0) {
2351 lderr(cct) << __func__ << " no pending request on tid " << tid
2352 << " safe is:" << is_safe << dendl;
2353 return;
2354 }
2355 MetaRequest *request = mds_requests.at(tid);
2356
2357 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2358 << " tid " << tid << dendl;
2359
2360 if (request->got_unsafe && !is_safe) {
2361 //duplicate response
2362 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2363 << mds_num << " safe:" << is_safe << dendl;
2364 return;
2365 }
2366
2367 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2368 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2369 << " from mds." << request->mds << dendl;
2370 request->send_to_auth = true;
2371 request->resend_mds = choose_target_mds(request);
2372 Inode *in = request->inode();
2373 std::map<mds_rank_t, Cap>::const_iterator it;
2374 if (request->resend_mds >= 0 &&
2375 request->resend_mds == request->mds &&
2376 (in == NULL ||
2377 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2378 request->sent_on_mseq == it->second.mseq)) {
2379 ldout(cct, 20) << "have to return ESTALE" << dendl;
2380 } else {
2381 request->caller_cond->notify_all();
2382 return;
2383 }
2384 }
2385
2386 ceph_assert(!request->reply);
2387 request->reply = reply;
2388 insert_trace(request, session);
2389
2390 // Handle unsafe reply
2391 if (!is_safe) {
2392 request->got_unsafe = true;
2393 session->unsafe_requests.push_back(&request->unsafe_item);
2394 if (is_dir_operation(request)) {
2395 Inode *dir = request->inode();
2396 ceph_assert(dir);
2397 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2398 }
2399 if (request->target) {
2400 InodeRef &in = request->target;
2401 in->unsafe_ops.push_back(&request->unsafe_target_item);
2402 }
2403 }
2404
2405 // Only signal the caller once (on the first reply):
2406 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2407 if (!is_safe || !request->got_unsafe) {
2408 ceph::condition_variable cond;
2409 request->dispatch_cond = &cond;
2410
2411 // wake up waiter
2412 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2413 request->caller_cond->notify_all();
2414
2415 // wake for kick back
2416 std::unique_lock l{client_lock, std::adopt_lock};
2417 cond.wait(l, [tid, request, &cond, this] {
2418 if (request->dispatch_cond) {
2419 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2420 << tid << " " << &cond << dendl;
2421 }
2422 return !request->dispatch_cond;
2423 });
2424 l.release();
2425 }
2426
2427 if (is_safe) {
2428 // the filesystem change is committed to disk
2429 // we're done, clean up
2430 if (request->got_unsafe) {
2431 request->unsafe_item.remove_myself();
2432 request->unsafe_dir_item.remove_myself();
2433 request->unsafe_target_item.remove_myself();
2434 signal_cond_list(request->waitfor_safe);
2435 }
2436 request->item.remove_myself();
2437 unregister_request(request);
2438 }
2439 if (unmounting)
2440 mount_cond.notify_all();
2441 }
2442
2443 void Client::_handle_full_flag(int64_t pool)
2444 {
2445 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2446 << "on " << pool << dendl;
2447 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2448 // to do this rather than blocking, because otherwise when we fill up we
2449 // potentially lock caps forever on files with dirty pages, and we need
2450 // to be able to release those caps to the MDS so that it can delete files
2451 // and free up space.
2452 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2453
2454 // For all inodes with layouts in this pool and a pending flush write op
2455 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2456 // from ObjectCacher so that it doesn't re-issue the write in response to
2457 // the ENOSPC error.
2458 // Fortunately since we're cancelling everything in a given pool, we don't
2459 // need to know which ops belong to which ObjectSet, we can just blow all
2460 // the un-flushed cached data away and mark any dirty inodes' async_err
2461 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2462 // affecting this pool, and all the objectsets we're purging were also
2463 // in this pool.
2464 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2465 i != inode_map.end(); ++i)
2466 {
2467 Inode *inode = i->second;
2468 if (inode->oset.dirty_or_tx
2469 && (pool == -1 || inode->layout.pool_id == pool)) {
2470 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2471 << " has dirty objects, purging and setting ENOSPC" << dendl;
2472 objectcacher->purge_set(&inode->oset);
2473 inode->set_async_err(-ENOSPC);
2474 }
2475 }
2476
2477 if (cancelled_epoch != (epoch_t)-1) {
2478 set_cap_epoch_barrier(cancelled_epoch);
2479 }
2480 }
2481
// Process a new OSDMap: detect whether this client was just blacklisted
// (entries are TYPE_ANY from nautilus on, TYPE_LEGACY before), abort MDS
// sessions and cancel in-flight writes if so, clear the flag once the map
// no longer lists us, and propagate global/per-pool FULL flags by
// cancelling outstanding writes against the affected pools.
2482 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2483 {
2484 std::set<entity_addr_t> new_blacklists;
2485 objecter->consume_blacklist_events(&new_blacklists);
2486
2487 const auto myaddrs = messenger->get_myaddrs();
2488 bool new_blacklist = false;
2489 bool prenautilus = objecter->with_osdmap(
2490 [&](const OSDMap& o) {
2491 return o.require_osd_release < ceph_release_t::nautilus;
2492 });
// Compare each of our addresses against the new blacklist entries under
// both address-type conventions.
2493 if (!blacklisted) {
2494 for (auto a : myaddrs.v) {
2495 // blacklist entries are always TYPE_ANY for nautilus+
2496 a.set_type(entity_addr_t::TYPE_ANY);
2497 if (new_blacklists.count(a)) {
2498 new_blacklist = true;
2499 break;
2500 }
2501 if (prenautilus) {
2502 // ...except pre-nautilus, they were TYPE_LEGACY
2503 a.set_type(entity_addr_t::TYPE_LEGACY);
2504 if (new_blacklists.count(a)) {
2505 new_blacklist = true;
2506 break;
2507 }
2508 }
2509 }
2510 }
2511 if (new_blacklist) {
2512 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2513 return o.get_epoch();
2514 });
2515 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2516 blacklisted = true;
2517
2518 _abort_mds_sessions(-EBLACKLISTED);
2519
2520 // Since we know all our OSD ops will fail, cancel them all preemtively,
2521 // so that on an unhealthy cluster we can umount promptly even if e.g.
2522 // some PGs were inaccessible.
2523 objecter->op_cancel_writes(-EBLACKLISTED);
2524
2525 } else if (blacklisted) {
2526 // Handle case where we were blacklisted but no longer are
2527 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2528 return o.is_blacklisted(myaddrs);});
2529 }
2530
2531 // Always subscribe to next osdmap for blacklisted client
2532 // until this client is not blacklisted.
2533 if (blacklisted) {
2534 objecter->maybe_request_map();
2535 }
2536
2537 if (objecter->osdmap_full_flag()) {
2538 _handle_full_flag(-1);
2539 } else {
2540 // Accumulate local list of full pools so that I can drop
2541 // the objecter lock before re-entering objecter in
2542 // cancel_writes
2543 std::vector<int64_t> full_pools;
2544
2545 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2546 for (const auto& kv : o.get_pools()) {
2547 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2548 full_pools.push_back(kv.first);
2549 }
2550 }
2551 });
2552
2553 for (auto p : full_pools)
2554 _handle_full_flag(p);
2555
2556 // Subscribe to subsequent maps to watch for the full flag going
2557 // away. For the global full flag objecter does this for us, but
2558 // it pays no attention to the per-pool full flag so in this branch
2559 // we do it ourselves.
2560 if (!full_pools.empty()) {
2561 objecter->maybe_request_map();
2562 }
2563 }
2564 }
2565
2566
2567 // ------------------------
2568 // incoming messages
2569
2570
// Central messenger dispatcher: routes each incoming message to its
// handler under client_lock.  Returns false for message types we do not
// consume so another dispatcher may claim them; messages arriving before
// initialization are silently discarded (returning true).  While
// unmounting, every dispatched message is followed by a cache-trim pass
// that pokes unmount() when progress was made.
2571 bool Client::ms_dispatch2(const MessageRef &m)
2572 {
2573 std::lock_guard l(client_lock);
2574 if (!initialized) {
2575 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2576 return true;
2577 }
2578
2579 switch (m->get_type()) {
2580 // mounting and mds sessions
2581 case CEPH_MSG_MDS_MAP:
2582 handle_mds_map(ref_cast<MMDSMap>(m));
2583 break;
2584 case CEPH_MSG_FS_MAP:
2585 handle_fs_map(ref_cast<MFSMap>(m));
2586 break;
2587 case CEPH_MSG_FS_MAP_USER:
2588 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2589 break;
2590 case CEPH_MSG_CLIENT_SESSION:
2591 handle_client_session(ref_cast<MClientSession>(m));
2592 break;
2593
2594 case CEPH_MSG_OSD_MAP:
2595 handle_osd_map(ref_cast<MOSDMap>(m));
2596 break;
2597
2598 // requests
2599 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2600 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2601 break;
2602 case CEPH_MSG_CLIENT_REPLY:
2603 handle_client_reply(ref_cast<MClientReply>(m));
2604 break;
2605
2606 // reclaim reply
2607 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2608 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2609 break;
2610
2611 case CEPH_MSG_CLIENT_SNAP:
2612 handle_snap(ref_cast<MClientSnap>(m));
2613 break;
2614 case CEPH_MSG_CLIENT_CAPS:
2615 handle_caps(ref_cast<MClientCaps>(m));
2616 break;
2617 case CEPH_MSG_CLIENT_LEASE:
2618 handle_lease(ref_cast<MClientLease>(m));
2619 break;
// Command replies are only ours when they come from an MDS; otherwise
// let another dispatcher (e.g. the objecter) take the message.
2620 case MSG_COMMAND_REPLY:
2621 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2622 handle_command_reply(ref_cast<MCommandReply>(m));
2623 } else {
2624 return false;
2625 }
2626 break;
2627 case CEPH_MSG_CLIENT_QUOTA:
2628 handle_quota(ref_cast<MClientQuota>(m));
2629 break;
2630
2631 default:
2632 return false;
2633 }
2634
2635 // unmounting?
2636 if (unmounting) {
2637 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2638 << "+" << inode_map.size() << dendl;
2639 long unsigned size = lru.lru_get_size() + inode_map.size();
2640 trim_cache();
// NOTE(review): trim_cache() should only ever shrink the cache, so
// 'size < new-size' looks inverted (the "cache shrank" branch appears
// unreachable).  This matches the upstream import, but confirm against
// current upstream before relying on the wake-up here.
2641 if (size < lru.lru_get_size() + inode_map.size()) {
2642 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2643 mount_cond.notify_all();
2644 } else {
2645 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2646 << "+" << inode_map.size() << dendl;
2647 }
2648 }
2649
2650 return true;
2651 }
2652
2653 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2654 {
2655 fsmap.reset(new FSMap(m->get_fsmap()));
2656
2657 signal_cond_list(waiting_for_fsmap);
2658
2659 monclient->sub_got("fsmap", fsmap->get_epoch());
2660 }
2661
2662 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2663 {
2664 fsmap_user.reset(new FSMapUser);
2665 *fsmap_user = m->get_fsmap();
2666
2667 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2668 signal_cond_list(waiting_for_fsmap);
2669 }
2670
// Install a newer MDSMap (older/equal epochs are ignored), cancel admin
// commands aimed at MDS daemons that vanished or went laggy, then walk
// every session comparing old vs new per-rank state: mark connections
// down, resend a reconnect when an MDS enters RECONNECT, kick requests
// and caps when it becomes ACTIVE, and close sessions for removed ranks.
2671 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2672 {
2673 mds_gid_t old_inc, new_inc;
2674 if (m->get_epoch() <= mdsmap->get_epoch()) {
2675 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2676 << " is identical to or older than our "
2677 << mdsmap->get_epoch() << dendl;
2678 return;
2679 }
2680
2681 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2682
// Keep the previous map in 'oldmap' so per-rank state transitions can be
// compared below.
2683 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2684 oldmap.swap(mdsmap);
2685
2686 mdsmap->decode(m->get_encoded());
2687
2688 // Cancel any commands for missing or laggy GIDs
2689 std::list<ceph_tid_t> cancel_ops;
2690 auto &commands = command_table.get_commands();
2691 for (const auto &i : commands) {
2692 auto &op = i.second;
2693 const mds_gid_t op_mds_gid = op.mds_gid;
2694 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2695 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2696 cancel_ops.push_back(i.first);
2697 if (op.outs) {
2698 std::ostringstream ss;
2699 ss << "MDS " << op_mds_gid << " went away";
2700 *(op.outs) = ss.str();
2701 }
2702 op.con->mark_down();
2703 if (op.on_finish) {
2704 op.on_finish->complete(-ETIMEDOUT);
2705 }
2706 }
2707 }
2708
2709 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2710 i != cancel_ops.end(); ++i) {
2711 command_table.erase(*i);
2712 }
2713
2714 // reset session
// Iterator is advanced before the body because _closed_mds_session()
// may erase the current session from mds_sessions.
2715 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2716 mds_rank_t mds = p->first;
2717 MetaSession *session = &p->second;
2718 ++p;
2719
2720 int oldstate = oldmap->get_state(mds);
2721 int newstate = mdsmap->get_state(mds);
2722 if (!mdsmap->is_up(mds)) {
2723 session->con->mark_down();
2724 } else if (mdsmap->get_addrs(mds) != session->addrs) {
// Address changed: a new daemon (or restart) took over this rank.  An
// incarnation bump means all previous state is void, so treat the old
// state as NULL.
2725 old_inc = oldmap->get_incarnation(mds);
2726 new_inc = mdsmap->get_incarnation(mds);
2727 if (old_inc != new_inc) {
2728 ldout(cct, 1) << "mds incarnation changed from "
2729 << old_inc << " to " << new_inc << dendl;
2730 oldstate = MDSMap::STATE_NULL;
2731 }
2732 session->con->mark_down();
2733 session->addrs = mdsmap->get_addrs(mds);
2734 // When new MDS starts to take over, notify kernel to trim unused entries
2735 // in its dcache/icache. Hopefully, the kernel will release some unused
2736 // inodes before the new MDS enters reconnect state.
2737 trim_cache_for_reconnect(session);
2738 } else if (oldstate == newstate)
2739 continue; // no change
2740
2741 session->mds_state = newstate;
2742 if (newstate == MDSMap::STATE_RECONNECT) {
2743 session->con = messenger->connect_to_mds(session->addrs);
2744 send_reconnect(session);
2745 } else if (newstate > MDSMap::STATE_RECONNECT) {
// If the MDS skipped past RECONNECT while we were not looking, we missed
// our window to reconnect; the session is unrecoverable.
2746 if (oldstate < MDSMap::STATE_RECONNECT) {
2747 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2748 _closed_mds_session(session);
2749 continue;
2750 }
2751 if (newstate >= MDSMap::STATE_ACTIVE) {
2752 if (oldstate < MDSMap::STATE_ACTIVE) {
2753 // kick new requests
2754 kick_requests(session);
2755 kick_flushing_caps(session);
2756 signal_context_list(session->waiting_for_open);
2757 wake_up_session_caps(session, true);
2758 }
2759 connect_mds_targets(mds);
2760 }
2761 } else if (newstate == MDSMap::STATE_NULL &&
2762 mds >= mdsmap->get_max_mds()) {
2763 _closed_mds_session(session);
2764 }
2765 }
2766
2767 // kick any waiting threads
2768 signal_cond_list(waiting_for_mdsmap);
2769
2770 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2771 }
2772
// Rebuild session state on an MDS that entered RECONNECT: trim the cache,
// reset session/cap sequence numbers, resend unsafe and old requests,
// early-kick flushing caps, then describe every cap we hold from this MDS
// (path, wanted/issued bits, file locks, snaprealm) in one or more
// MClientReconnect messages.  Splitting across multiple messages is only
// done when the MDS advertises CEPHFS_FEATURE_MULTI_RECONNECT.
2773 void Client::send_reconnect(MetaSession *session)
2774 {
2775 mds_rank_t mds = session->mds_num;
2776 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2777
2778 // trim unused caps to reduce MDS's cache rejoin time
2779 trim_cache_for_reconnect(session);
2780
2781 session->readonly = false;
2782
2783 session->release.reset();
2784
2785 // reset my cap seq number
2786 session->seq = 0;
2787 //connect to the mds' offload targets
2788 connect_mds_targets(mds);
2789 //make sure unsafe requests get saved
2790 resend_unsafe_requests(session);
2791
2792 early_kick_flushing_caps(session);
2793
2794 auto m = make_message<MClientReconnect>();
2795 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2796
2797 // i have an open session.
2798 ceph::unordered_set<inodeno_t> did_snaprealm;
2799 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2800 p != inode_map.end();
2801 ++p) {
2802 Inode *in = p->second;
2803 auto it = in->caps.find(mds);
2804 if (it != in->caps.end()) {
// When multi-message reconnect is allowed, flush the current message
// before it grows past ~half of INT_MAX and start a fresh one.
2805 if (allow_multi &&
2806 m->get_approx_size() >=
2807 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2808 m->mark_more();
2809 session->con->send_message2(std::move(m));
2810
2811 m = make_message<MClientReconnect>();
2812 }
2813
2814 Cap &cap = it->second;
2815 ldout(cct, 10) << " caps on " << p->first
2816 << " " << ccap_string(cap.issued)
2817 << " wants " << ccap_string(in->caps_wanted())
2818 << dendl;
2819 filepath path;
2820 in->make_long_path(path);
2821 ldout(cct, 10) << " path " << path << dendl;
2822
2823 bufferlist flockbl;
2824 _encode_filelocks(in, flockbl);
2825
// The new MDS incarnation starts numbering from scratch.
2826 cap.seq = 0; // reset seq.
2827 cap.issue_seq = 0; // reset seq.
2828 cap.mseq = 0; // reset seq.
2829 // cap gen should catch up with session cap_gen
// A stale cap generation means the issued bits are no longer trustworthy;
// fall back to just CEPH_CAP_PIN.
2830 if (cap.gen < session->cap_gen) {
2831 cap.gen = session->cap_gen;
2832 cap.issued = cap.implemented = CEPH_CAP_PIN;
2833 } else {
2834 cap.issued = cap.implemented;
2835 }
2836 snapid_t snap_follows = 0;
2837 if (!in->cap_snaps.empty())
2838 snap_follows = in->cap_snaps.begin()->first;
2839
2840 m->add_cap(p->first.ino,
2841 cap.cap_id,
2842 path.get_ino(), path.get_path(), // ino
2843 in->caps_wanted(), // wanted
2844 cap.issued, // issued
2845 in->snaprealm->ino,
2846 snap_follows,
2847 flockbl);
2848
// Describe each snaprealm at most once per reconnect.
2849 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2850 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2851 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2852 did_snaprealm.insert(in->snaprealm->ino);
2853 }
2854 }
2855 }
2856
2857 if (!allow_multi)
2858 m->set_encoding_version(0); // use connection features to choose encoding
2859 session->con->send_message2(std::move(m));
2860
2861 mount_cond.notify_all();
2862
2863 if (session->reclaim_state == MetaSession::RECLAIMING)
2864 signal_cond_list(waiting_for_reclaim);
2865 }
2866
2867
2868 void Client::kick_requests(MetaSession *session)
2869 {
2870 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2871 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2872 p != mds_requests.end();
2873 ++p) {
2874 MetaRequest *req = p->second;
2875 if (req->got_unsafe)
2876 continue;
2877 if (req->aborted()) {
2878 if (req->caller_cond) {
2879 req->kick = true;
2880 req->caller_cond->notify_all();
2881 }
2882 continue;
2883 }
2884 if (req->retry_attempt > 0)
2885 continue; // new requests only
2886 if (req->mds == session->mds_num) {
2887 send_request(p->second, session);
2888 }
2889 }
2890 }
2891
2892 void Client::resend_unsafe_requests(MetaSession *session)
2893 {
2894 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2895 !iter.end();
2896 ++iter)
2897 send_request(*iter, session);
2898
2899 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2900 // process completed requests in clientreplay stage.
2901 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2902 p != mds_requests.end();
2903 ++p) {
2904 MetaRequest *req = p->second;
2905 if (req->got_unsafe)
2906 continue;
2907 if (req->aborted())
2908 continue;
2909 if (req->retry_attempt == 0)
2910 continue; // old requests only
2911 if (req->mds == session->mds_num)
2912 send_request(req, session, true);
2913 }
2914 }
2915
2916 void Client::wait_unsafe_requests()
2917 {
2918 list<MetaRequest*> last_unsafe_reqs;
2919 for (const auto &p : mds_sessions) {
2920 const MetaSession &s = p.second;
2921 if (!s.unsafe_requests.empty()) {
2922 MetaRequest *req = s.unsafe_requests.back();
2923 req->get();
2924 last_unsafe_reqs.push_back(req);
2925 }
2926 }
2927
2928 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2929 p != last_unsafe_reqs.end();
2930 ++p) {
2931 MetaRequest *req = *p;
2932 if (req->unsafe_item.is_on_list())
2933 wait_on_list(req->waitfor_safe);
2934 put_request(req);
2935 }
2936 }
2937
// A session is going away without further replies: wake every caller with
// a request outstanding on it (via req->kick) and forcibly retire the
// unsafe requests, marking the affected inodes with -EIO since their
// updates may never have been committed.  Both of the session's request
// lists must be empty afterwards (asserted).
2938 void Client::kick_requests_closed(MetaSession *session)
2939 {
2940 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
// Advance the iterator before the body: unregister_request() below can
// erase the current map entry.
2941 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2942 p != mds_requests.end(); ) {
2943 MetaRequest *req = p->second;
2944 ++p;
2945 if (req->mds == session->mds_num) {
2946 if (req->caller_cond) {
2947 req->kick = true;
2948 req->caller_cond->notify_all();
2949 }
2950 req->item.remove_myself();
2951 if (req->got_unsafe) {
2952 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2953 req->unsafe_item.remove_myself();
// Surface the potential data loss on the parent directory inode ...
2954 if (is_dir_operation(req)) {
2955 Inode *dir = req->inode();
2956 assert(dir);
2957 dir->set_async_err(-EIO);
2958 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2959 << dir->ino << " " << req->get_tid() << dendl;
2960 req->unsafe_dir_item.remove_myself();
2961 }
// ... and on the target inode of the operation, if any.
2962 if (req->target) {
2963 InodeRef &in = req->target;
2964 in->set_async_err(-EIO);
2965 lderr(cct) << "kick_requests_closed drop req of inode : "
2966 << in->ino << " " << req->get_tid() << dendl;
2967 req->unsafe_target_item.remove_myself();
2968 }
2969 signal_cond_list(req->waitfor_safe);
2970 unregister_request(req);
2971 }
2972 }
2973 }
2974 ceph_assert(session->requests.empty());
2975 ceph_assert(session->unsafe_requests.empty());
2976 }
2977
2978
2979
2980
2981 /************
2982 * leases
2983 */
2984
2985 void Client::got_mds_push(MetaSession *s)
2986 {
2987 s->seq++;
2988 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2989 if (s->state == MetaSession::STATE_CLOSING) {
2990 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2991 }
2992 }
2993
// Handle a dentry-lease revocation from an MDS (REVOKE is the only lease
// action the MDS sends us; asserted).  If we still know the inode and the
// named dentry, drop its lease locally; in every case, acknowledge with a
// CEPH_MDS_LEASE_RELEASE carrying the same parameters back.
2994 void Client::handle_lease(const MConstRef<MClientLease>& m)
2995 {
2996 ldout(cct, 10) << __func__ << " " << *m << dendl;
2997
2998 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2999
3000 mds_rank_t mds = mds_rank_t(m->get_source().num());
3001 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3002 if (!session) {
3003 return;
3004 }
3005
3006 got_mds_push(session);
3007
3008 ceph_seq_t seq = m->get_seq();
3009
3010 Inode *in;
3011 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
// Unknown inode or dentry: nothing to invalidate locally, but we still
// must acknowledge the revoke (fall through to 'revoke').
3012 if (inode_map.count(vino) == 0) {
3013 ldout(cct, 10) << " don't have vino " << vino << dendl;
3014 goto revoke;
3015 }
3016 in = inode_map[vino];
3017
3018 if (m->get_mask() & CEPH_LEASE_VALID) {
3019 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3020 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3021 goto revoke;
3022 }
3023 Dentry *dn = in->dir->dentries[m->dname];
3024 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3025 dn->lease_mds = -1;
3026 }
3027
3028 revoke:
3029 {
3030 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3031 m->get_mask(), m->get_ino(),
3032 m->get_first(), m->get_last(), m->dname);
3033 m->get_connection()->send_message2(std::move(reply));
3034 }
3035 }
3036
3037 void Client::put_inode(Inode *in, int n)
3038 {
3039 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3040 int left = in->_put(n);
3041 if (left == 0) {
3042 // release any caps
3043 remove_all_caps(in);
3044
3045 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3046 bool unclean = objectcacher->release_set(&in->oset);
3047 ceph_assert(!unclean);
3048 inode_map.erase(in->vino());
3049 if (use_faked_inos())
3050 _release_faked_ino(in);
3051
3052 if (in == root) {
3053 root = 0;
3054 root_ancestor = 0;
3055 while (!root_parents.empty())
3056 root_parents.erase(root_parents.begin());
3057 }
3058
3059 delete in;
3060 }
3061 }
3062
3063 void Client::close_dir(Dir *dir)
3064 {
3065 Inode *in = dir->parent_inode;
3066 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3067 ceph_assert(dir->is_empty());
3068 ceph_assert(in->dir == dir);
3069 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3070 if (!in->dentries.empty())
3071 in->get_first_parent()->put(); // unpin dentry
3072
3073 delete in->dir;
3074 in->dir = 0;
3075 put_inode(in); // unpin inode
3076 }
3077
/**
 * Attach inode 'in' to a dentry named 'name' under directory 'dir'.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * A directory inode may have only one parent dentry: when 'in' is a
 * directory already linked elsewhere, the old dentry is unlinked first
 * (tmp_ref keeps the inode alive across that unlink).
 * Returns the dentry used (newly allocated when dn was NULL).
 */
3083 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3084 {
3085 if (!dn) {
3086 // create a new Dentry
3087 dn = new Dentry(dir, name);
3088
3089 lru.lru_insert_mid(dn); // mid or top?
3090
3091 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3092 << " dn " << dn << " (new dn)" << dendl;
3093 } else {
// Pre-created dentry must not already point at an inode.
3094 ceph_assert(!dn->inode);
3095 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3096 << " dn " << dn << " (old dn)" << dendl;
3097 }
3098
3099 if (in) { // link to inode
3100 InodeRef tmp_ref;
3101 // only one parent for directories!
3102 if (in->is_dir() && !in->dentries.empty()) {
3103 tmp_ref = in; // prevent unlink below from freeing the inode.
3104 Dentry *olddn = in->get_first_parent();
3105 ceph_assert(olddn->dir != dir || olddn->name != name);
3106 Inode *old_diri = olddn->dir->parent_inode;
// The old parent's contents changed: invalidate its readdir caches.
3107 old_diri->dir_release_count++;
3108 clear_dir_complete_and_ordered(old_diri, true);
3109 unlink(olddn, true, true); // keep dir, dentry
3110 }
3111
3112 dn->link(in);
3113 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3114 }
3115
3116 return dn;
3117 }
3118
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Detach a dentry from its inode and (optionally) from its Dir.
  // Hold a ref so the inode can't be freed while we log/unlink below.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep a null dentry, but its MDS lease is no longer valid
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // close the containing Dir if this was its last entry
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3148
3149 /**
3150 * For asynchronous flushes, check for errors from the IO and
3151 * update the inode if necessary
3152 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // keeps the inode alive until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // must run under client_lock; set_async_err() below mutates the inode
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
			    << " 0x" << std::hex << inode->ino << std::dec
			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // remember the error so a later fsync/close can report it
      inode->set_async_err(r);
    }
  }
};
3170
3171
3172 /****
3173 * caps
3174 */
3175
3176 void Client::get_cap_ref(Inode *in, int cap)
3177 {
3178 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3179 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3180 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3181 in->get();
3182 }
3183 if ((cap & CEPH_CAP_FILE_CACHE) &&
3184 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3185 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3186 in->get();
3187 }
3188 in->get_cap_ref(cap);
3189 }
3190
void Client::put_cap_ref(Inode *in, int cap)
{
  // Drop cap references; when the last ref of a given cap bit goes away,
  // finish pending snapshot work and possibly release caps / inode refs.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // cap bits whose refs dropped but which are no longer even issued
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// last writer is gone: the pending cap_snap can now be finalized
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// all buffered data flushed; snapshots no longer hold dirty data
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      // drop the inode refs taken in get_cap_ref()
      put_inode(in, put_nref);
  }
}
3224
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  // Acquire cap references for a file operation: loop until we hold all
  // 'need' caps (plus whatever subset of 'want' is available), or fail.
  // On success *phave holds the acquired caps and refs are taken.
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // open modes on this inode no longer cover what the op needs
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    // stale write handle after a remount/blacklist event
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
	// write would extend past max_size; ask the MDS for a larger one
	if ((endoff >= (loff_t)in->max_size ||
	     endoff > (loff_t)(in->size << 1)) &&
	    endoff > (loff_t)in->wanted_max_size) {
	  ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	  in->wanted_max_size = endoff;
	}
	if (in->wanted_max_size > in->max_size &&
	    in->wanted_max_size > in->requested_max_size)
	  check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// can't start a new write while a cap_snap is being written out
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	// only take wanted caps that are not currently being revoked
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // caps were dropped (e.g. session reset); re-request them from the MDS
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3328
3329 int Client::get_caps_used(Inode *in)
3330 {
3331 unsigned used = in->caps_used();
3332 if (!(used & CEPH_CAP_FILE_CACHE) &&
3333 !objectcacher->set_is_empty(&in->oset))
3334 used |= CEPH_CAP_FILE_CACHE;
3335 return used;
3336 }
3337
3338 void Client::cap_delay_requeue(Inode *in)
3339 {
3340 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3341 in->hold_caps_until = ceph_clock_now();
3342 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3343 delayed_list.push_back(&in->delay_cap_item);
3344 }
3345
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      int flags, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  // Build and send a CEPH_CAP_OP_UPDATE for one cap: acknowledge revokes,
  // flush dirty metadata (flush/flush_tid) and report current inode state.
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never retain caps currently being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // when flushing dirty metadata, tell the MDS which snap seq it follows
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
				     in->ino,
				     0,
				     cap->cap_id, cap->seq,
				     cap->implemented,
				     want,
				     flush,
				     cap->mseq,
				     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // xattrs only go on the wire when the xattr EXCL cap is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // let the MDS know if a cap_snap still awaits flushing
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      // ask the auth MDS to extend max_size for further writes
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3463
3464 static bool is_max_size_approaching(Inode *in)
3465 {
3466 /* mds will adjust max size according to the reported size */
3467 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3468 return false;
3469 if (in->size >= in->max_size)
3470 return true;
3471 /* half of previous max_size increment has been used */
3472 if (in->max_size > in->reported_size &&
3473 (in->size << 1) >= in->max_size + in->reported_size)
3474 return true;
3475 return false;
3476 }
3477
3478 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3479 {
3480 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3481 return used;
3482 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3483 return used;
3484
3485 if (issued & CEPH_CAP_FILE_LAZYIO) {
3486 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3487 used &= ~CEPH_CAP_FILE_CACHE;
3488 used |= CEPH_CAP_FILE_LAZYIO;
3489 }
3490 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3491 used &= ~CEPH_CAP_FILE_BUFFER;
3492 used |= CEPH_CAP_FILE_LAZYIO;
3493 }
3494 } else {
3495 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3496 used &= ~CEPH_CAP_FILE_CACHE;
3497 used |= CEPH_CAP_FILE_LAZYIO;
3498 }
3499 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3500 used &= ~CEPH_CAP_FILE_BUFFER;
3501 used |= CEPH_CAP_FILE_LAZYIO;
3502 }
3503 }
3504 return used;
3505 }
3506
3507 /**
3508 * check_caps
3509 *
3510 * Examine currently used and wanted versus held caps. Release, flush or ack
3511 * revoked caps to the MDS as appropriate.
3512 *
3513 * @param in the inode to check
3514 * @param flags flags to apply to cap check
3515 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // start from what we must keep: wanted + in-use + pin
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to drop cached data if CACHE/LAZYIO is being revoked and we hold
  // no buffered writes
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  // examine each cap (one per MDS) and decide whether to send an update
  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    // usage covered by the auth cap doesn't pin non-auth caps
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    // batch releases unless the caller asked for an immediate check
    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-send any flushes/snap flushes that were interrupted
      if (in->flags & I_KICK_FLUSH) {
	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		       << " to mds." << mds << dendl;
	kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.flush_tid == 0)
	flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      // dirty metadata rides along on the cap update to the auth MDS
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
	msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
	     flushing, flush_tid);
  }
}
3654
3655
3656 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3657 {
3658 int used = get_caps_used(in);
3659 int dirty = in->caps_dirty();
3660 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3661
3662 if (in->cap_snaps.size() &&
3663 in->cap_snaps.rbegin()->second.writing) {
3664 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3665 return;
3666 } else if (in->caps_dirty() ||
3667 (used & CEPH_CAP_FILE_WR) ||
3668 (dirty & CEPH_CAP_ANY_WR)) {
3669 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3670 ceph_assert(capsnapem.second); /* element inserted */
3671 CapSnap &capsnap = capsnapem.first->second;
3672 capsnap.context = old_snapc;
3673 capsnap.issued = in->caps_issued();
3674 capsnap.dirty = in->caps_dirty();
3675
3676 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3677
3678 capsnap.uid = in->uid;
3679 capsnap.gid = in->gid;
3680 capsnap.mode = in->mode;
3681 capsnap.btime = in->btime;
3682 capsnap.xattrs = in->xattrs;
3683 capsnap.xattr_version = in->xattr_version;
3684 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3685 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3686
3687 if (used & CEPH_CAP_FILE_WR) {
3688 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3689 capsnap.writing = 1;
3690 } else {
3691 finish_cap_snap(in, capsnap, used);
3692 }
3693 } else {
3694 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3695 }
3696 }
3697
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  // Capture the final inode state into the capsnap once writers are done,
  // then flush it to the MDS (unless buffered data must drain first).
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; _flushed_cap_snap() flushes once they drain
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3728
3729 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3730 {
3731 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3732 in->cap_snaps.at(seq).dirty_data = 0;
3733 flush_snaps(in);
3734 }
3735
3736 void Client::send_flush_snap(Inode *in, MetaSession *session,
3737 snapid_t follows, CapSnap& capsnap)
3738 {
3739 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3740 in->ino, in->snaprealm->ino, 0,
3741 in->auth_cap->mseq, cap_epoch_barrier);
3742 m->caller_uid = capsnap.cap_dirtier_uid;
3743 m->caller_gid = capsnap.cap_dirtier_gid;
3744
3745 m->set_client_tid(capsnap.flush_tid);
3746 m->head.snap_follows = follows;
3747
3748 m->head.caps = capsnap.issued;
3749 m->head.dirty = capsnap.dirty;
3750
3751 m->head.uid = capsnap.uid;
3752 m->head.gid = capsnap.gid;
3753 m->head.mode = capsnap.mode;
3754 m->btime = capsnap.btime;
3755
3756 m->size = capsnap.size;
3757
3758 m->head.xattr_version = capsnap.xattr_version;
3759 encode(capsnap.xattrs, m->xattrbl);
3760
3761 m->ctime = capsnap.ctime;
3762 m->btime = capsnap.btime;
3763 m->mtime = capsnap.mtime;
3764 m->atime = capsnap.atime;
3765 m->time_warp_seq = capsnap.time_warp_seq;
3766 m->change_attr = capsnap.change_attr;
3767
3768 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3769 m->inline_version = in->inline_version;
3770 m->inline_data = in->inline_data;
3771 }
3772
3773 ceph_assert(!session->flushing_caps_tids.empty());
3774 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3775
3776 session->con->send_message2(std::move(m));
3777 }
3778
void Client::flush_snaps(Inode *in)
{
  // Send any unflushed, ready cap_snaps for this inode to the auth MDS,
  // in snap order; stop at the first one that is not yet ready.
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // snaps must flush in order; a snap with dirty data or an active
    // writer blocks everything after it
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush tid with both the session and the inode so the
    // FLUSHSNAP ack can be matched up later
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3813
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  // Block the calling thread until signal_cond_list() wakes this condvar.
  // The caller already holds client_lock: adopt it so the condvar can
  // atomically release it while sleeping, then release() afterwards so the
  // unique_lock destructor does not unlock it a second time on return.
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
3823
3824 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3825 {
3826 for (auto cond : ls) {
3827 cond->notify_all();
3828 }
3829 }
3830
void Client::wait_on_context_list(list<Context*>& ls)
{
  // Queue a C_Cond on the list and sleep until signal_context_list()
  // completes it (the completion sets 'done' and notifies 'cond').
  ceph::condition_variable cond;
  bool done = false;
  int r;  // written by C_Cond with the completion result; not used here
  ls.push_back(new C_Cond(cond, &done, &r));
  // caller holds client_lock: adopt it for the wait, then release() so it
  // is not unlocked again on scope exit
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3841
3842 void Client::signal_context_list(list<Context*>& ls)
3843 {
3844 while (!ls.empty()) {
3845 ls.front()->complete(0);
3846 ls.pop_front();
3847 }
3848 }
3849
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  // After a session event, poke every inode holding caps from this session
  // so blocked get_caps() callers re-evaluate their state.
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      // session was re-established: forget outstanding max_size requests
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3869
3870
3871 // flush dirty data (from objectcache)
3872
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; no Inode ptr is kept
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // record the externally visible ino (the faked one, if configured)
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
3892
3893 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3894 {
3895 if (unmounting)
3896 return;
3897 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3898 ino_invalidate_cb(callback_handle, ino, off, len);
3899 }
3900
3901 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3902
3903 if (ino_invalidate_cb)
3904 // we queue the invalidate, which calls the callback and decrements the ref
3905 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3906 }
3907
3908 void Client::_invalidate_inode_cache(Inode *in)
3909 {
3910 ldout(cct, 10) << __func__ << " " << *in << dendl;
3911
3912 // invalidate our userspace inode cache
3913 if (cct->_conf->client_oc) {
3914 objectcacher->release_set(&in->oset);
3915 if (!objectcacher->set_is_empty(&in->oset))
3916 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3917 }
3918
3919 _schedule_invalidate_callback(in, 0, 0);
3920 }
3921
3922 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3923 {
3924 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3925
3926 // invalidate our userspace inode cache
3927 if (cct->_conf->client_oc) {
3928 vector<ObjectExtent> ls;
3929 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3930 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3931 }
3932
3933 _schedule_invalidate_callback(in, off, len);
3934 }
3935
3936 bool Client::_release(Inode *in)
3937 {
3938 ldout(cct, 20) << "_release " << *in << dendl;
3939 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3940 _invalidate_inode_cache(in);
3941 return true;
3942 }
3943 return false;
3944 }
3945
3946 bool Client::_flush(Inode *in, Context *onfinish)
3947 {
3948 ldout(cct, 10) << "_flush " << *in << dendl;
3949
3950 if (!in->oset.dirty_or_tx) {
3951 ldout(cct, 10) << " nothing to flush" << dendl;
3952 onfinish->complete(0);
3953 return true;
3954 }
3955
3956 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3957 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3958 objectcacher->purge_set(&in->oset);
3959 if (onfinish) {
3960 onfinish->complete(-ENOSPC);
3961 }
3962 return true;
3963 }
3964
3965 return objectcacher->flush_set(&in->oset, onfinish);
3966 }
3967
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  // Synchronously flush a byte range of buffered data to the OSDs.
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    // drop client_lock while blocked so the flush completion (which needs
    // the lock) can make progress
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
3986
3987 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3988 {
3989 // std::lock_guard l(client_lock);
3990 ceph_assert(ceph_mutex_is_locked(client_lock)); // will be called via dispatch() -> objecter -> ...
3991 Inode *in = static_cast<Inode *>(oset->parent);
3992 ceph_assert(in);
3993 _flushed(in);
3994 }
3995
3996 void Client::_flushed(Inode *in)
3997 {
3998 ldout(cct, 10) << "_flushed " << *in << dendl;
3999
4000 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4001 }
4002
4003
4004
4005 // checks common to add_update_cap, handle_cap_grant
4006 void Client::check_cap_issue(Inode *in, unsigned issued)
4007 {
4008 unsigned had = in->caps_issued();
4009
4010 if ((issued & CEPH_CAP_FILE_CACHE) &&
4011 !(had & CEPH_CAP_FILE_CACHE))
4012 in->cache_gen++;
4013
4014 if ((issued & CEPH_CAP_FILE_SHARED) &&
4015 !(had & CEPH_CAP_FILE_SHARED)) {
4016 in->shared_gen++;
4017
4018 if (in->is_dir())
4019 clear_dir_complete_and_ordered(in, true);
4020 }
4021 }
4022
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  // Record a newly granted (or updated) cap from mds_session on this inode,
  // keeping snaprealm membership, the auth cap pointer and waiters in sync.
  if (!in->is_any_caps()) {
    // first cap on this inode: join its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // cap for this MDS already existed; update it in place
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
	ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	// migrate pending cap flushes to the new auth MDS' session
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // newer migration seq replaces wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // wake anyone waiting for the newly granted cap bits
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4119
4120 void Client::remove_cap(Cap *cap, bool queue_release)
4121 {
4122 auto &in = cap->inode;
4123 MetaSession *session = cap->session;
4124 mds_rank_t mds = cap->session->mds_num;
4125
4126 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4127
4128 if (queue_release) {
4129 session->enqueue_cap_release(
4130 in.ino,
4131 cap->cap_id,
4132 cap->issue_seq,
4133 cap->mseq,
4134 cap_epoch_barrier);
4135 }
4136
4137 if (in.auth_cap == cap) {
4138 if (in.flushing_cap_item.is_on_list()) {
4139 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4140 in.flushing_cap_item.remove_myself();
4141 }
4142 in.auth_cap = NULL;
4143 }
4144 size_t n = in.caps.erase(mds);
4145 ceph_assert(n == 1);
4146 cap = nullptr;
4147
4148 if (!in.is_any_caps()) {
4149 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4150 in.snaprealm_item.remove_myself();
4151 put_snap_realm(in.snaprealm);
4152 in.snaprealm = 0;
4153 }
4154 }
4155
4156 void Client::remove_all_caps(Inode *in)
4157 {
4158 while (!in->caps.empty())
4159 remove_cap(&in->caps.begin()->second, true);
4160 }
4161
/*
 * Tear down every cap held through session s (session reset/close paths).
 * err describes why the session died (e.g. -EBLACKLISTED) and controls how
 * cached-but-unwritable data is handled.
 */
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  // Each iteration removes the head of s->caps, so the loop terminates
  // when the session's cap list drains.
  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // pin the inode across remove_cap()
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // Losing the auth cap: dirty/flushing state can no longer be
      // written back through this session.
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    // NOTE: cap is invalid after this call (erased from in->caps).
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // Discard the dirty/flushing state; it can never reach the MDS now.
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the inode ref that was held on behalf of the dirty caps
      put_inode(in.get());
    }
    // If we implemented CACHE/BUFFER but those bits are no longer issued,
    // the object cacher's contents for this inode are stale.
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -EBLACKLISTED) {
        // blacklisted: dirty data can never be written back; surface the
        // error on the inode and throw the cached data away
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4211
4212 int Client::_do_remount(bool retry_on_error)
4213 {
4214 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4215
4216 errno = 0;
4217 int r = remount_cb(callback_handle);
4218 if (r == 0) {
4219 retries_on_invalidate = 0;
4220 } else {
4221 int e = errno;
4222 client_t whoami = get_nodeid();
4223 if (r == -1) {
4224 lderr(cct) <<
4225 "failed to remount (to trim kernel dentries): "
4226 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4227 } else {
4228 lderr(cct) <<
4229 "failed to remount (to trim kernel dentries): "
4230 "return code = " << r << dendl;
4231 }
4232 bool should_abort =
4233 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4234 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4235 !(retry_on_error && (++retries_on_invalidate < max_retries));
4236 if (should_abort && !unmounting) {
4237 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4238 ceph_abort();
4239 }
4240 }
4241 return r;
4242 }
4243
4244 class C_Client_Remount : public Context {
4245 private:
4246 Client *client;
4247 public:
4248 explicit C_Client_Remount(Client *c) : client(c) {}
4249 void finish(int r) override {
4250 ceph_assert(r == 0);
4251 client->_do_remount(true);
4252 }
4253 };
4254
4255 void Client::_invalidate_kernel_dcache()
4256 {
4257 if (unmounting)
4258 return;
4259 if (can_invalidate_dentries) {
4260 if (dentry_invalidate_cb && root->dir) {
4261 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4262 p != root->dir->dentries.end();
4263 ++p) {
4264 if (p->second->inode)
4265 _schedule_invalidate_dentry_callback(p->second, false);
4266 }
4267 }
4268 } else if (remount_cb) {
4269 // Hacky:
4270 // when remounting a file system, linux kernel trims all unused dentries in the fs
4271 remount_finisher.queue(new C_Client_Remount(this));
4272 }
4273 }
4274
/*
 * Drop negative (null) child dentries of a directory inode, and recurse
 * into its snapdir if one is open.  Only acts when *every* dentry in the
 * dir is negative; a mixed dir is still useful and is left alone.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance first: unlink() may erase dn from the map, which would
      // invalidate the current iterator
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    // the dir may now be empty; release its memory
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  // the snapdir keeps its own dentry set; trim it too
  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4299
4300 class C_Client_CacheRelease : public Context {
4301 private:
4302 Client *client;
4303 vinodeno_t ino;
4304 public:
4305 C_Client_CacheRelease(Client *c, Inode *in) :
4306 client(c) {
4307 if (client->use_faked_inos())
4308 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4309 else
4310 ino = in->vino();
4311 }
4312 void finish(int r) override {
4313 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4314 client->_async_inode_release(ino);
4315 }
4316 };
4317
4318 void Client::_async_inode_release(vinodeno_t ino)
4319 {
4320 if (unmounting)
4321 return;
4322 ldout(cct, 10) << __func__ << " " << ino << dendl;
4323 ino_release_cb(callback_handle, ino);
4324 }
4325
4326 void Client::_schedule_ino_release_callback(Inode *in) {
4327
4328 if (ino_release_cb)
4329 // we queue the invalidate, which calls the callback and decrements the ref
4330 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4331 }
4332
/*
 * Try to shrink the number of caps held via session s down to max.
 * Non-auth caps that nothing is using are dropped outright; otherwise we
 * trim dentries so the inode (and eventually its cap) becomes reclaimable.
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // pin the inode while we work on it

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: everything we use through it is also
      // covered by the auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        // comma expression: remove_cap() frees cap, then null the pointer
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      // auth (or only) cap: make the inode expireable by trimming dentries
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before dn may be unlinked
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // every dentry was expireable -> the cap should become droppable
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
        _schedule_ino_release_callback(in.get());
      }
    }
  }
  // Now that traversal is done it is safe to actually drop the dentries.
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4401
4402 void Client::force_session_readonly(MetaSession *s)
4403 {
4404 s->readonly = true;
4405 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4406 auto &in = (*p)->inode;
4407 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4408 signal_cond_list(in.waitfor_caps);
4409 }
4410 }
4411
4412 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4413 {
4414 MetaSession *session = in->auth_cap->session;
4415
4416 int flushing = in->dirty_caps;
4417 ceph_assert(flushing);
4418
4419 ceph_tid_t flush_tid = ++last_flush_tid;
4420 in->flushing_cap_tids[flush_tid] = flushing;
4421
4422 if (!in->flushing_caps) {
4423 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4424 num_flushing_caps++;
4425 } else {
4426 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4427 }
4428
4429 in->flushing_caps |= flushing;
4430 in->mark_caps_clean();
4431
4432 if (!in->flushing_cap_item.is_on_list())
4433 session->flushing_caps.push_back(&in->flushing_cap_item);
4434 session->flushing_caps_tids.insert(flush_tid);
4435
4436 *ptid = flush_tid;
4437 return flushing;
4438 }
4439
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
{
  // Re-home all of this inode's in-flight flush bookkeeping from old_s to
  // new_s (used when the auth MDS changes mid-flush).

  // pending capsnap flushes
  for (auto &entry : in->cap_snaps) {
    const CapSnap &capsnap = entry.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }

  // regular cap flush tids
  for (const auto &[tid, caps] : in->flushing_cap_tids) {
    old_s->flushing_caps_tids.erase(tid);
    new_s->flushing_caps_tids.insert(tid);
  }

  // xlist push_back implicitly unlinks the item from old_s's list
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
4457
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first pass: inodes whose cap check was delayed
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    // advance before pop_front() unlinks the current entry
    ++p;
    delayed_list.pop_front();
    // only the very last check_caps of the whole pass is synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4490
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  // Block until every cap flush on this inode with tid <= want has been
  // acked by the MDS (waiters are woken from handle_cap_flush_ack).
  for (;;) {
    if (!in->flushing_caps)
      break;
    auto oldest = in->flushing_cap_tids.begin();
    ceph_assert(oldest != in->flushing_cap_tids.end());
    if (oldest->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(oldest->second) << " want " << want
                   << " last " << oldest->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4504
void Client::wait_sync_caps(ceph_tid_t want)
{
  // Wait until no session has an outstanding flush tid <= want.  After
  // every wakeup we rescan all sessions from scratch, since the session
  // map may have changed while we slept.
  bool rescan = true;
  while (rescan) {
    rescan = false;
    ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                   << num_flushing_caps << " total flushing)" << dendl;
    for (auto &p : mds_sessions) {
      MetaSession *s = &p.second;
      if (s->flushing_caps_tids.empty())
        continue;
      ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
      if (oldest_tid <= want) {
        ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                       << " (want " << want << ")" << dendl;
        // client_lock is already held: adopt it for the condvar wait, then
        // release ownership (without unlocking) once we wake up.
        std::unique_lock l{client_lock, std::adopt_lock};
        sync_cond.wait(l);
        l.release();
        rescan = true;
        break;
      }
    }
  }
}
4525
/*
 * Re-send every pending cap flush (and capsnap flush) for this inode to
 * its auth session, in tid order.  Used after session reconnect so the MDS
 * sees flushes it may have lost.
 */
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // Find the tid of the most recent pending capsnap flush; entries with a
  // zero cap mask in flushing_cap_tids are snap flushes.
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // Cap flushes older than the last snap flush are flagged so the MDS
      // keeps snap/cap flush ordering intact.
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // zero mask == capsnap flush; cap_snaps iterates in matching tid order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4559
4560 void Client::kick_flushing_caps(MetaSession *session)
4561 {
4562 mds_rank_t mds = session->mds_num;
4563 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4564
4565 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4566 Inode *in = *p;
4567 if (in->flags & I_KICK_FLUSH) {
4568 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4569 kick_flushing_caps(in, session);
4570 }
4571 }
4572 }
4573
4574 void Client::early_kick_flushing_caps(MetaSession *session)
4575 {
4576 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4577 Inode *in = *p;
4578 Cap *cap = in->auth_cap;
4579 ceph_assert(cap);
4580
4581 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4582 // stage. This guarantees that MDS processes the cap flush message before issuing
4583 // the flushing caps to other client.
4584 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4585 in->flags |= I_KICK_FLUSH;
4586 continue;
4587 }
4588
4589 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4590 << " to mds." << session->mds_num << dendl;
4591 // send_reconnect() also will reset these sequence numbers. make sure
4592 // sequence numbers in cap flush message match later reconnect message.
4593 cap->seq = 0;
4594 cap->issue_seq = 0;
4595 cap->mseq = 0;
4596 cap->issued = cap->implemented;
4597
4598 kick_flushing_caps(in, session);
4599 }
4600 }
4601
void SnapRealm::build_snap_context()
{
  // Rebuild cached_snap_context from our own snaps plus those inherited
  // through the parent chain.
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // snaps inherited from prior parents
  for (const auto &snap : prior_parent_snaps)
    snaps.insert(snap);

  // current parent's snaps, but only those that postdate our attachment
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (const auto &snap : psnapc.snaps) {
      if (snap >= parent_since)
        snaps.insert(snap);
    }
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (const auto &snap : my_snaps)
    snaps.insert(snap);

  // publish: snaps are stored in descending order
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.clear();
  cached_snap_context.snaps.reserve(snaps.size());
  for (auto it = snaps.rbegin(); it != snaps.rend(); ++it)
    cached_snap_context.snaps.push_back(*it);
}
4632
void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
{
  // BFS over the realm hierarchy, invalidating every cached snap context.
  list<SnapRealm*> queue;
  queue.push_back(realm);

  while (!queue.empty()) {
    SnapRealm *cur = queue.front();
    queue.pop_front();

    ldout(cct, 10) << __func__ << " " << *cur << dendl;
    cur->invalidate_cache();

    for (auto *child : cur->pchildren)
      queue.push_back(child);
  }
}
4651
4652 SnapRealm *Client::get_snap_realm(inodeno_t r)
4653 {
4654 SnapRealm *realm = snap_realms[r];
4655 if (!realm)
4656 snap_realms[r] = realm = new SnapRealm(r);
4657 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4658 realm->nref++;
4659 return realm;
4660 }
4661
4662 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4663 {
4664 if (snap_realms.count(r) == 0) {
4665 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4666 return NULL;
4667 }
4668 SnapRealm *realm = snap_realms[r];
4669 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4670 realm->nref++;
4671 return realm;
4672 }
4673
4674 void Client::put_snap_realm(SnapRealm *realm)
4675 {
4676 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4677 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4678 if (--realm->nref == 0) {
4679 snap_realms.erase(realm->ino);
4680 if (realm->pparent) {
4681 realm->pparent->pchildren.erase(realm);
4682 put_snap_realm(realm->pparent);
4683 }
4684 delete realm;
4685 }
4686 }
4687
4688 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4689 {
4690 if (realm->parent != parent) {
4691 ldout(cct, 10) << __func__ << " " << *realm
4692 << " " << realm->parent << " -> " << parent << dendl;
4693 realm->parent = parent;
4694 if (realm->pparent) {
4695 realm->pparent->pchildren.erase(realm);
4696 put_snap_realm(realm->pparent);
4697 }
4698 realm->pparent = get_snap_realm(parent);
4699 realm->pparent->pchildren.insert(realm);
4700 return true;
4701 }
4702 return false;
4703 }
4704
4705 static bool has_new_snaps(const SnapContext& old_snapc,
4706 const SnapContext& new_snapc)
4707 {
4708 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4709 }
4710
4711
/*
 * Apply a snap trace (a sequence of encoded SnapRealmInfo) from the MDS.
 *
 * @param bl the encoded trace
 * @param realm_ret if non-null, receives the first realm in the trace with
 *        a reference held (caller must put_snap_realm); may be NULL when
 *        the trace is empty.
 * @param flush if true, queue cap-snap writeback for realms whose snap
 *        list grew (using the pre-update snap context).
 *
 * Fix: if the trace is empty, first_realm stays NULL; the original
 * unconditionally called put_snap_realm(first_realm) in the !realm_ret
 * case, dereferencing a null pointer inside its log statement.
 */
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  // realms whose snap list is about to change, with their *old* snap
  // context (needed to decide what to write back)
  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
                     << dendl;

      if (flush) {
        // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
        // flush me + children
        list<SnapRealm*> q;
        q.push_back(realm);
        while (!q.empty()) {
          SnapRealm *realm = q.front();
          q.pop_front();

          for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
               p != realm->pchildren.end();
               ++p)
            q.push_back(*p);

          if (dirty_realms.count(realm) == 0) {
            realm->nref++;  // ref dropped after the writeback pass below
            dirty_realms[realm] = realm->get_snap_context();
          }
        }
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
                     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;  // keep the ref for the caller (or drop below)
    else
      put_snap_realm(realm);
  }

  // queue writeback for realms that actually gained snaps
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
        Inode *in = *r;
        ++r;
        queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else if (first_realm)  // may be NULL when the trace was empty
    put_snap_realm(first_realm);
}
4804
/*
 * Handle an MClientSnap message.  For a SPLIT, inodes and child realms are
 * moved out of the parent realm into the newly split realm; the snap trace
 * is then applied, and moved inodes get cap-snap writeback queued for any
 * snaps they have not yet seen.
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inodes to re-parent into the split realm, with their old snap context
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // a later event already moved this inode into a newer realm;
        // don't yank it back
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; on DESTROY there is nothing left worth flushing
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // link moved inodes into the split realm (taking a realm ref each)
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4875
4876 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4877 {
4878 mds_rank_t mds = mds_rank_t(m->get_source().num());
4879 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4880 if (!session) {
4881 return;
4882 }
4883
4884 got_mds_push(session);
4885
4886 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4887
4888 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4889 if (inode_map.count(vino)) {
4890 Inode *in = NULL;
4891 in = inode_map[vino];
4892
4893 if (in) {
4894 in->quota = m->quota;
4895 in->rstat = m->rstat;
4896 }
4897 }
4898 }
4899
4900 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4901 {
4902 mds_rank_t mds = mds_rank_t(m->get_source().num());
4903 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4904 if (!session) {
4905 return;
4906 }
4907
4908 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4909 // Pause RADOS operations until we see the required epoch
4910 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4911 }
4912
4913 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4914 // Record the barrier so that we will transmit it to MDS when releasing
4915 set_cap_epoch_barrier(m->osd_epoch_barrier);
4916 }
4917
4918 got_mds_push(session);
4919
4920 Inode *in;
4921 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4922 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4923 in = it->second;
4924 } else {
4925 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4926 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4927 session->enqueue_cap_release(
4928 m->get_ino(),
4929 m->get_cap_id(),
4930 m->get_seq(),
4931 m->get_mseq(),
4932 cap_epoch_barrier);
4933 } else {
4934 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4935 }
4936
4937 // in case the mds is waiting on e.g. a revocation
4938 flush_cap_releases();
4939 return;
4940 }
4941
4942 switch (m->get_op()) {
4943 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4944 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4945 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4946 }
4947
4948 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4949 Cap &cap = in->caps.at(mds);
4950
4951 switch (m->get_op()) {
4952 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4953 case CEPH_CAP_OP_IMPORT:
4954 case CEPH_CAP_OP_REVOKE:
4955 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4956 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4957 }
4958 } else {
4959 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4960 return;
4961 }
4962 }
4963
/*
 * Handle CEPH_CAP_OP_IMPORT: another MDS has exported this inode's cap to
 * us.  Install/refresh our cap for this session, drop the stale cap that
 * still points at the exporting MDS, and kick any pending flushes now that
 * we (may) hold the auth cap.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  // remember the exporting MDS' cap (if we still hold one) so it can be
  // removed once the import is applied
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
                 issued, wanted, m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exporter's cap; queue a release only if it asked for one
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // a max_size we had requested from the old auth MDS is void now
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
        in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5006
/*
 * Handle CEPH_CAP_OP_EXPORT: this MDS is handing our cap off to a peer.
 * If we already hold a cap from the peer we fold the exported state into
 * it; otherwise we install a placeholder cap for the peer so nothing is
 * lost before its IMPORT arrives.  The exported cap is then removed.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // ignore stale EXPORTs for a cap we no longer hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        // cap is moving to a specific peer MDS
        const auto peer_mds = mds_rank_t(m->peer.mds);
        MetaSession *tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          Cap &tcap = it->second;
          // merge into the peer's existing cap, but only if the peer's
          // record is older than what this export describes
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            // peer.seq - 1: the peer's own IMPORT will bump it to peer.seq
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession);
          }
        } else {
          // no cap from the peer yet: create one carrying the exported state
          add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the cap is simply being dropped
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5051
5052 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5053 {
5054 mds_rank_t mds = session->mds_num;
5055 ceph_assert(in->caps.count(mds));
5056
5057 ldout(cct, 10) << __func__ << " on ino " << *in
5058 << " size " << in->size << " -> " << m->get_size()
5059 << dendl;
5060
5061 int issued;
5062 in->caps_issued(&issued);
5063 issued |= in->caps_dirty();
5064 update_inode_file_size(in, issued, m->get_size(),
5065 m->get_truncate_seq(), m->get_truncate_size());
5066 }
5067
// Handle a CEPH_CAP_OP_FLUSH_ACK from the MDS: the cap flush identified by
// the message's client tid — and, implicitly, every older pending flush tid
// on this inode — is complete.  Erase the acked tids from both the session
// and the inode, clear the corresponding flushing cap bits, and wake anyone
// waiting on caps or on a sync.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();     // cap bits the MDS says were flushed
  int cleaned = 0;                // cap bits we can clear from flushing_caps
  int flushed = 0;                // number of flush tids retired by this ack

  // NOTE(review): assumes flushing_cap_tids is non-empty whenever a
  // FLUSH_ACK arrives — verify callers guarantee this before the deref.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
                  << " got unexpected flush ack tid " << flush_ack_tid
                  << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (!it->second) {
      // cap snap — entries with zero cap bits are capsnap flushes, retired
      // by handle_cap_flushsnap_ack instead; skip them here.
      ++it;
      continue;
    }
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // this tid (and anything older) is covered by the ack: retire it
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // newer flushes are still in flight; their bits are not yet clean
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // if nothing older than this ack is still flushing on the session,
    // a blocked sync may now be able to complete
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.notify_all();
  }

  if (!dirty) {
    // no dirty caps remain: forget who last dirtied this inode
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->flushing_cap_tids.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // presumably drops the ref held while caps were dirty/flushing —
      // see the matching get in the dirty-caps path; TODO confirm.
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5135
5136
// Handle a CEPH_CAP_OP_FLUSHSNAP_ACK: the CapSnap following snapid
// m->get_snap_follows() has been persisted by the MDS.  Retire its flush
// tid from the session and inode, erase the CapSnap, and wake waiters.
// An ack for an unknown CapSnap is logged as a likely duplicate and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack is for a different (older/newer) flush of this snap; ignore
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we erase state that may pin it
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // nothing older still flushing on this session -> a sync may complete
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5169
5170 class C_Client_DentryInvalidate : public Context {
5171 private:
5172 Client *client;
5173 vinodeno_t dirino;
5174 vinodeno_t ino;
5175 string name;
5176 public:
5177 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5178 client(c), name(dn->name) {
5179 if (client->use_faked_inos()) {
5180 dirino.ino = dn->dir->parent_inode->faked_ino;
5181 if (del)
5182 ino.ino = dn->inode->faked_ino;
5183 } else {
5184 dirino = dn->dir->parent_inode->vino();
5185 if (del)
5186 ino = dn->inode->vino();
5187 }
5188 if (!del)
5189 ino.ino = inodeno_t();
5190 }
5191 void finish(int r) override {
5192 // _async_dentry_invalidate is responsible for its own locking
5193 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5194 client->_async_dentry_invalidate(dirino, ino, name);
5195 }
5196 };
5197
// Invoke the registered dentry-invalidation callback.  Runs on the
// async_dentry_invalidator finisher thread (queued via
// C_Client_DentryInvalidate); becomes a no-op once unmounting has begun.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5206
// Queue an async dcache-invalidation upcall for dentry `dn`.  Only queued
// when an invalidation callback is registered and the inode still has
// ll (low-level API) references; `del` makes the upcall also identify the
// dentry's target inode (see C_Client_DentryInvalidate).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
5212
// Try to shed cached state that pins inode `in`: expire child dentries
// (recursing through snapshot subtrees), close an emptied Dir, trim an open
// ".snap" dir, and finally unlink the inode's own dentries.  When
// `sched_inval` is set, dentries still referenced by the ll layer also get
// dcache-invalidation upcalls scheduled.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase dn from the map
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // account for the reference the (now closed) Dir held on `in`
    }
  }

  // an open snapdir also pins the inode; trim it too
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // still referenced: drop the dentries that point AT this inode
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() erases dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5259
// Handle a cap GRANT/REVOKE (or IMPORT) from an MDS: refresh the cap's
// seq/gen, apply whatever inode metadata the MDS is authoritative for,
// then reconcile our issued/implemented bits against the newly granted
// set — flushing buffers or releasing cached data when caps are revoked —
// and finally wake waiters and possibly trim a deleted inode.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // session->cap_gen advances when the session goes stale; a cap carrying
  // an older gen was implicitly revoked and must be rebuilt from CEPH_CAP_PIN.
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // Only accept MDS-supplied metadata for a lock when we do not hold the
  // corresponding EXCL cap ourselves (our local dirty state would be newer).
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    // link count hit zero while we hold LINK caps: the file was unlinked
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size — only the auth cap carries an authoritative max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // revoking cached reads: drop the page cache for this inode
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5415
5416 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5417 {
5418 if (perms.uid() == 0)
5419 return 0;
5420
5421 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5422 int ret = _posix_acl_permission(in, perms, want);
5423 if (ret != -EAGAIN)
5424 return ret;
5425 }
5426
5427 // check permissions before doing anything else
5428 if (!in->check_mode(perms, want))
5429 return -EACCES;
5430 return 0;
5431 }
5432
5433 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5434 const UserPerm& perms)
5435 {
5436 int r = _getattr_for_perm(in, perms);
5437 if (r < 0)
5438 goto out;
5439
5440 r = 0;
5441 if (strncmp(name, "system.", 7) == 0) {
5442 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5443 r = -EPERM;
5444 } else {
5445 r = inode_permission(in, perms, want);
5446 }
5447 out:
5448 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5449 return r;
5450 }
5451
5452 ostream& operator<<(ostream &out, const UserPerm& perm) {
5453 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5454 return out;
5455 }
5456
// Check whether `perms` may apply the setattr described by (stx, mask) to
// inode `in`, mirroring chown/chmod/utimes permission rules.  Returns 0 if
// permitted, -EPERM/-EACCES (or a getattr error) otherwise.  NOTE: may
// clear S_ISGID in stx->stx_mode as a side effect (see the MODE branch).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // default verdict for the ownership checks below; r stays -EPERM on any
  // `goto out` until the final `r = 0`
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only set the
    // uid to its current value
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root: must own the file, and the new gid must be one of the
    // caller's groups (or be unchanged)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // like chmod(2): silently drop setgid when the caller is not in the
    // file's (resulting) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // setting explicit timestamps requires ownership; setting "now"
      // (utimes(NULL)-style) only requires write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5513
5514 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5515 {
5516 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5517 unsigned want = 0;
5518
5519 if ((flags & O_ACCMODE) == O_WRONLY)
5520 want = MAY_WRITE;
5521 else if ((flags & O_ACCMODE) == O_RDWR)
5522 want = MAY_READ | MAY_WRITE;
5523 else if ((flags & O_ACCMODE) == O_RDONLY)
5524 want = MAY_READ;
5525 if (flags & O_TRUNC)
5526 want |= MAY_WRITE;
5527
5528 int r = 0;
5529 switch (in->mode & S_IFMT) {
5530 case S_IFLNK:
5531 r = -ELOOP;
5532 goto out;
5533 case S_IFDIR:
5534 if (want & MAY_WRITE) {
5535 r = -EISDIR;
5536 goto out;
5537 }
5538 break;
5539 }
5540
5541 r = _getattr_for_perm(in, perms);
5542 if (r < 0)
5543 goto out;
5544
5545 r = inode_permission(in, perms, want);
5546 out:
5547 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5548 return r;
5549 }
5550
5551 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5552 {
5553 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5554 int r = _getattr_for_perm(dir, perms);
5555 if (r < 0)
5556 goto out;
5557
5558 r = inode_permission(dir, perms, MAY_EXEC);
5559 out:
5560 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5561 return r;
5562 }
5563
5564 int Client::may_create(Inode *dir, const UserPerm& perms)
5565 {
5566 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5567 int r = _getattr_for_perm(dir, perms);
5568 if (r < 0)
5569 goto out;
5570
5571 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5572 out:
5573 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5574 return r;
5575 }
5576
5577 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5578 {
5579 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5580 int r = _getattr_for_perm(dir, perms);
5581 if (r < 0)
5582 goto out;
5583
5584 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5585 if (r < 0)
5586 goto out;
5587
5588 /* 'name == NULL' means rmsnap */
5589 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5590 InodeRef otherin;
5591 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5592 if (r < 0)
5593 goto out;
5594 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5595 r = -EPERM;
5596 }
5597 out:
5598 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5599 return r;
5600 }
5601
5602 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5603 {
5604 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5605 int r = _getattr_for_perm(in, perms);
5606 if (r < 0)
5607 goto out;
5608
5609 if (perms.uid() == 0 || perms.uid() == in->uid) {
5610 r = 0;
5611 goto out;
5612 }
5613
5614 r = -EPERM;
5615 if (!S_ISREG(in->mode))
5616 goto out;
5617
5618 if (in->mode & S_ISUID)
5619 goto out;
5620
5621 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5622 goto out;
5623
5624 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5625 out:
5626 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5627 return r;
5628 }
5629
5630 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5631 {
5632 int mask = CEPH_STAT_CAP_MODE;
5633 bool force = false;
5634 if (acl_type != NO_ACL) {
5635 mask |= CEPH_STAT_CAP_XATTR;
5636 force = in->xattr_version == 0;
5637 }
5638 return _getattr(in, mask, perms, force);
5639 }
5640
// Build the (inode number, snapshot id) pair identifying `in`.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5646
5647 /**
5648 * Resolve an MDS spec to a list of MDS daemon GIDs.
5649 *
5650 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5651 * It may be '*' in which case it matches all GIDs.
5652 *
5653 * If no error is returned, the `targets` vector will be populated with at least
5654 * one MDS.
5655 */
5656 int Client::resolve_mds(
5657 const std::string &mds_spec,
5658 std::vector<mds_gid_t> *targets)
5659 {
5660 ceph_assert(fsmap);
5661 ceph_assert(targets != nullptr);
5662
5663 mds_role_t role;
5664 std::stringstream ss;
5665 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5666 if (role_r == 0) {
5667 // We got a role, resolve it to a GID
5668 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5669 << role << "'" << dendl;
5670 targets->push_back(
5671 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5672 return 0;
5673 }
5674
5675 std::string strtol_err;
5676 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5677 if (strtol_err.empty()) {
5678 // It is a possible GID
5679 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5680 if (fsmap->gid_exists(mds_gid)) {
5681 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5682 targets->push_back(mds_gid);
5683 } else {
5684 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5685 << dendl;
5686 return -ENOENT;
5687 }
5688 } else if (mds_spec == "*") {
5689 // It is a wildcard: use all MDSs
5690 const auto mds_info = fsmap->get_mds_info();
5691
5692 if (mds_info.empty()) {
5693 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5694 return -ENOENT;
5695 }
5696
5697 for (const auto i : mds_info) {
5698 targets->push_back(i.first);
5699 }
5700 } else {
5701 // It did not parse as an integer, it is not a wildcard, it must be a name
5702 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5703 if (mds_gid == 0) {
5704 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5705
5706 lderr(cct) << "FSMap: " << *fsmap << dendl;
5707
5708 return -ENOENT;
5709 } else {
5710 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5711 << "' to GID " << mds_gid << dendl;
5712 targets->push_back(mds_gid);
5713 }
5714 }
5715
5716 return 0;
5717 }
5718
5719
5720 /**
5721 * Authenticate with mon and establish global ID
5722 */
5723 int Client::authenticate()
5724 {
5725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5726
5727 if (monclient->is_authenticated()) {
5728 return 0;
5729 }
5730
5731 client_lock.unlock();
5732 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5733 client_lock.lock();
5734 if (r < 0) {
5735 return r;
5736 }
5737
5738 whoami = monclient->get_global_id();
5739 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5740
5741 return 0;
5742 }
5743
// Fetch the latest FSMap (full map, or FSMapUser when `user` is true) from
// the monitors, blocking until our cached copy is at least as new as the
// cluster's latest version.  client_lock is dropped around the monitor
// round-trips.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // release the client lock while we wait for the mon reply
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // one-shot subscription until our FSMapUser catches up to fsmap_latest
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same, for the full FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5787
5788 /**
5789 *
5790 * @mds_spec one of ID, rank, GID, "*"
5791 *
5792 */
5793 int Client::mds_command(
5794 const std::string &mds_spec,
5795 const vector<string>& cmd,
5796 const bufferlist& inbl,
5797 bufferlist *outbl,
5798 string *outs,
5799 Context *onfinish)
5800 {
5801 std::lock_guard lock(client_lock);
5802
5803 if (!initialized)
5804 return -ENOTCONN;
5805
5806 int r;
5807 r = authenticate();
5808 if (r < 0) {
5809 return r;
5810 }
5811
5812 r = fetch_fsmap(false);
5813 if (r < 0) {
5814 return r;
5815 }
5816
5817 // Look up MDS target(s) of the command
5818 std::vector<mds_gid_t> targets;
5819 r = resolve_mds(mds_spec, &targets);
5820 if (r < 0) {
5821 return r;
5822 }
5823
5824 // If daemons are laggy, we won't send them commands. If all
5825 // are laggy then we fail.
5826 std::vector<mds_gid_t> non_laggy;
5827 for (const auto gid : targets) {
5828 const auto info = fsmap->get_info_gid(gid);
5829 if (!info.laggy()) {
5830 non_laggy.push_back(gid);
5831 }
5832 }
5833 if (non_laggy.size() == 0) {
5834 *outs = "All targeted MDS daemons are laggy";
5835 return -ENOENT;
5836 }
5837
5838 if (metadata.empty()) {
5839 // We are called on an unmounted client, so metadata
5840 // won't be initialized yet.
5841 populate_metadata("");
5842 }
5843
5844 // Send commands to targets
5845 C_GatherBuilder gather(cct, onfinish);
5846 for (const auto target_gid : non_laggy) {
5847 const auto info = fsmap->get_info_gid(target_gid);
5848
5849 // Open a connection to the target MDS
5850 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5851
5852 // Generate MDSCommandOp state
5853 auto &op = command_table.start_command();
5854
5855 op.on_finish = gather.new_sub();
5856 op.cmd = cmd;
5857 op.outbl = outbl;
5858 op.outs = outs;
5859 op.inbl = inbl;
5860 op.mds_gid = target_gid;
5861 op.con = conn;
5862
5863 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5864 << " tid=" << op.tid << cmd << dendl;
5865
5866 // Construct and send MCommand
5867 auto m = op.get_message(monclient->get_fsid());
5868 conn->send_message2(std::move(m));
5869 }
5870 gather.activate();
5871
5872 return 0;
5873 }
5874
5875 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5876 {
5877 ceph_tid_t const tid = m->get_tid();
5878
5879 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5880
5881 if (!command_table.exists(tid)) {
5882 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5883 return;
5884 }
5885
5886 auto &op = command_table.get_command(tid);
5887 if (op.outbl) {
5888 *op.outbl = m->get_data();
5889 }
5890 if (op.outs) {
5891 *op.outs = m->rs;
5892 }
5893
5894 if (op.on_finish) {
5895 op.on_finish->complete(m->r);
5896 }
5897
5898 command_table.erase(tid);
5899 }
5900
5901 // -------------------
5902 // MOUNT
5903
5904 int Client::subscribe_mdsmap(const std::string &fs_name)
5905 {
5906 int r = authenticate();
5907 if (r < 0) {
5908 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5909 return r;
5910 }
5911
5912 std::string resolved_fs_name;
5913 if (fs_name.empty()) {
5914 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5915 if (resolved_fs_name.empty())
5916 // Try the backwards compatibility fs name option
5917 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5918 } else {
5919 resolved_fs_name = fs_name;
5920 }
5921
5922 std::string want = "mdsmap";
5923 if (!resolved_fs_name.empty()) {
5924 r = fetch_fsmap(true);
5925 if (r < 0)
5926 return r;
5927 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5928 if (fscid == FS_CLUSTER_ID_NONE) {
5929 return -ENOENT;
5930 }
5931
5932 std::ostringstream oss;
5933 oss << want << "." << fscid;
5934 want = oss.str();
5935 }
5936 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5937
5938 monclient->sub_want(want, 0, 0);
5939 monclient->renew_subs();
5940
5941 return 0;
5942 }
5943
// Mount the filesystem: subscribe to the MDS map, optionally wait until the
// MDS cluster is available, then getattr the mount point and each of its
// ancestors up to the root (tolerating EACCES above the mount point — see
// the quota note below).  On success the root inode is pinned and `mounted`
// is set.  Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or definitively unavailable)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait. MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // getattr the mount point, then pop one dentry per iteration so every
  // ancestor up to "/" gets stat'd as well
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
6037
6038 // UNMOUNT
6039
// Close every MDS session, waiting up to client_shutdown_timeout for each
// close to be acknowledged.  Already-rejected sessions are dropped up
// front; sessions that never answer are force-closed with -ETIMEDOUT.
void Client::_close_sessions()
{
  // rejected sessions will never ack a close — just drop them
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second.state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // adopt the already-held client_lock so the condition waits can drop it
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // a timeout of 0 means wait indefinitely
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	// NOTE(review): `session` is a COPY of the map entry, apparently so
	// it stays valid while _closed_mds_session erases the original from
	// mds_sessions — confirm that discarding state changes made to the
	// copy is intentional.
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(&session, -ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    // give the still-locked mutex back without unlocking it
    l.release();
  }
}
6078
6079 void Client::flush_mdlog_sync()
6080 {
6081 if (mds_requests.empty())
6082 return;
6083 for (auto &p : mds_sessions) {
6084 flush_mdlog(&p.second);
6085 }
6086 }
6087
6088 void Client::flush_mdlog(MetaSession *session)
6089 {
6090 // Only send this to Luminous or newer MDS daemons, older daemons
6091 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6092 const uint64_t features = session->con->get_features();
6093 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6094 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6095 session->con->send_message2(std::move(m));
6096 }
6097 }
6098
6099
// Fail every outstanding MDS request with `err` and force-close all
// sessions.  Used when the client is torn down without a clean shutdown
// (abort / blacklist).
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    // mark the request failed and wake its waiting caller, if any
    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session, err);
  }
}
6127
// Tear down the mount.  With abort=true (or while blacklisted) pending
// state is discarded — sessions aborted, object cache purged, dirty caps
// dropped; otherwise everything is flushed to the MDSs/OSDs first.  Blocks
// until requests drain, open files/dirs are closed, the caches empty out
// and all MDS sessions close.  Called with client_lock held.
void Client::_unmount(bool abort)
{
  // adopt the caller-held client_lock so the condition waits can drop it
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for all outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for handles opened through the low-level (ll) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe (not yet acked) sync writes to complete
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
		    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	// cannot write back — just throw the cached data away
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // no way to flush: drop dirty caps on the floor
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for every inode reference to be released (caps, in-flight I/O, ...)
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // hand the still-locked mutex back to the caller
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6265
6266 void Client::unmount()
6267 {
6268 std::lock_guard lock(client_lock);
6269 _unmount(false);
6270 }
6271
6272 void Client::abort_conn()
6273 {
6274 std::lock_guard lock(client_lock);
6275 _unmount(true);
6276 }
6277
6278 void Client::flush_cap_releases()
6279 {
6280 // send any cap releases
6281 for (auto &p : mds_sessions) {
6282 auto &session = p.second;
6283 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6284 p.first)) {
6285 if (cct->_conf->client_inject_release_failure) {
6286 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6287 } else {
6288 session.con->send_message2(std::move(session.release));
6289 }
6290 session.release.reset();
6291 }
6292 }
6293 }
6294
// Periodic housekeeping, re-armed via the timer every
// client_tick_interval seconds.  Times out mount-phase MDS requests,
// renews caps and flushes queued cap releases, checks delayed caps,
// trims the cache, and retries the connection after blacklisting.
// Runs with client_lock held (taken by the Timer callback).
void Client::tick()
{
  // test hook: stall one tick, then clear the injected delay so it only
  // fires once
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm the next tick before doing any work
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new LambdaContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
	tick();
      }));
  utime_t now = ceph_clock_now();

  // before the mount completes, abort the oldest pending request once it
  // has waited longer than client_mount_timeout, and wake all waiters
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    // renew when a third of the session timeout has elapsed
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: stop at the first inode whose hold time has not yet
  // expired (the loop pops from the front, so earlier entries go first)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);

  // if blacklisted, optionally retry a fresh connection every 30 minutes
  if (blacklisted && mounted &&
      last_auto_reconnect + 30 * 60 < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blacklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6360
6361 void Client::renew_caps()
6362 {
6363 ldout(cct, 10) << "renew_caps()" << dendl;
6364 last_cap_renew = ceph_clock_now();
6365
6366 for (auto &p : mds_sessions) {
6367 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6368 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6369 renew_caps(&p.second);
6370 }
6371 }
6372
6373 void Client::renew_caps(MetaSession *session)
6374 {
6375 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6376 session->last_cap_renew_request = ceph_clock_now();
6377 uint64_t seq = ++session->cap_renew_seq;
6378 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6379 }
6380
6381
6382 // ===============================================================
6383 // high level (POSIXy) interface
6384
6385 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6386 InodeRef *target, const UserPerm& perms)
6387 {
6388 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6389 MetaRequest *req = new MetaRequest(op);
6390 filepath path;
6391 dir->make_nosnap_relative_path(path);
6392 path.push_dentry(name);
6393 req->set_filepath(path);
6394 req->set_inode(dir);
6395 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6396 mask |= DEBUG_GETATTR_CAPS;
6397 req->head.args.getattr.mask = mask;
6398
6399 ldout(cct, 10) << __func__ << " on " << path << dendl;
6400
6401 int r = make_request(req, perms, target);
6402 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6403 return r;
6404 }
6405
// Resolve a single path component `dname` under `dir` into *target.
// Handles ".", ".." and the configured snapdir name specially; otherwise
// tries the dentry cache (trusted via a valid dentry lease or via shared
// caps on the directory) before falling back to an MDS round trip in
// _do_lookup().
//
// @param mask  caps that must be issued on the child inode for a cached
//              dentry to be used without consulting the MDS
// @return 0 on success; -ENOTDIR, -ENAMETOOLONG, -ENOENT, or an MDS error
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // parent not linked in our cache; ask a random in-cluster MDS
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// parent lookup failed; fall back to the directory itself
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapdir name maps to a synthetic snapshot directory inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode (when present) carries the
    // caps the caller asked for
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// null dentry + complete directory contents => name cannot exist
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss (or untrusted cache): go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // cached negative dentry
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6522
6523 int Client::get_or_create(Inode *dir, const char* name,
6524 Dentry **pdn, bool expect_null)
6525 {
6526 // lookup
6527 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6528 dir->open_dir();
6529 if (dir->dir->dentries.count(name)) {
6530 Dentry *dn = dir->dir->dentries[name];
6531
6532 // is dn lease valid?
6533 utime_t now = ceph_clock_now();
6534 if (dn->inode &&
6535 dn->lease_mds >= 0 &&
6536 dn->lease_ttl > now &&
6537 mds_sessions.count(dn->lease_mds)) {
6538 MetaSession &s = mds_sessions.at(dn->lease_mds);
6539 if (s.cap_ttl > now &&
6540 s.cap_gen == dn->lease_gen) {
6541 if (expect_null)
6542 return -EEXIST;
6543 }
6544 }
6545 *pdn = dn;
6546 } else {
6547 // otherwise link up a new one
6548 *pdn = link(dir->dir, name, NULL, NULL);
6549 }
6550
6551 // success
6552 return 0;
6553 }
6554
// Walk `origpath` component by component, starting from root (absolute
// path) or cwd (relative), leaving the final inode in *end.
//
// @param followsym  whether to follow a symlink in the *final* component;
//                   symlinks in intermediate (directory) positions are
//                   always followed
// @param mask       extra caps requested on the last component's lookup
// @return 0 on success; -ELOOP after MAXSYMLINKS expansions; -ENOENT or
//         another error from _lookup()/may_lookup()
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute target restarts the walk from the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6636
6637
6638 // namespace ops
6639
6640 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6641 {
6642 std::lock_guard lock(client_lock);
6643 tout(cct) << "link" << std::endl;
6644 tout(cct) << relexisting << std::endl;
6645 tout(cct) << relpath << std::endl;
6646
6647 if (unmounting)
6648 return -ENOTCONN;
6649
6650 filepath existing(relexisting);
6651
6652 InodeRef in, dir;
6653 int r = path_walk(existing, &in, perm, true);
6654 if (r < 0)
6655 return r;
6656 if (std::string(relpath) == "/") {
6657 r = -EEXIST;
6658 return r;
6659 }
6660 filepath path(relpath);
6661 string name = path.last_dentry();
6662 path.pop_dentry();
6663
6664 r = path_walk(path, &dir, perm, true);
6665 if (r < 0)
6666 return r;
6667 if (cct->_conf->client_permissions) {
6668 if (S_ISDIR(in->mode)) {
6669 r = -EPERM;
6670 return r;
6671 }
6672 r = may_hardlink(in.get(), perm);
6673 if (r < 0)
6674 return r;
6675 r = may_create(dir.get(), perm);
6676 if (r < 0)
6677 return r;
6678 }
6679 r = _link(in.get(), dir.get(), name.c_str(), perm);
6680 return r;
6681 }
6682
6683 int Client::unlink(const char *relpath, const UserPerm& perm)
6684 {
6685 std::lock_guard lock(client_lock);
6686 tout(cct) << __func__ << std::endl;
6687 tout(cct) << relpath << std::endl;
6688
6689 if (unmounting)
6690 return -ENOTCONN;
6691
6692 if (std::string(relpath) == "/")
6693 return -EISDIR;
6694
6695 filepath path(relpath);
6696 string name = path.last_dentry();
6697 path.pop_dentry();
6698 InodeRef dir;
6699 int r = path_walk(path, &dir, perm);
6700 if (r < 0)
6701 return r;
6702 if (cct->_conf->client_permissions) {
6703 r = may_delete(dir.get(), name.c_str(), perm);
6704 if (r < 0)
6705 return r;
6706 }
6707 return _unlink(dir.get(), name.c_str(), perm);
6708 }
6709
6710 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6711 {
6712 std::lock_guard lock(client_lock);
6713 tout(cct) << __func__ << std::endl;
6714 tout(cct) << relfrom << std::endl;
6715 tout(cct) << relto << std::endl;
6716
6717 if (unmounting)
6718 return -ENOTCONN;
6719
6720 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6721 return -EBUSY;
6722
6723 filepath from(relfrom);
6724 filepath to(relto);
6725 string fromname = from.last_dentry();
6726 from.pop_dentry();
6727 string toname = to.last_dentry();
6728 to.pop_dentry();
6729
6730 InodeRef fromdir, todir;
6731 int r = path_walk(from, &fromdir, perm);
6732 if (r < 0)
6733 goto out;
6734 r = path_walk(to, &todir, perm);
6735 if (r < 0)
6736 goto out;
6737
6738 if (cct->_conf->client_permissions) {
6739 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6740 if (r < 0)
6741 return r;
6742 r = may_delete(todir.get(), toname.c_str(), perm);
6743 if (r < 0 && r != -ENOENT)
6744 return r;
6745 }
6746 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6747 out:
6748 return r;
6749 }
6750
6751 // dirs
6752
6753 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6754 {
6755 std::lock_guard lock(client_lock);
6756 tout(cct) << __func__ << std::endl;
6757 tout(cct) << relpath << std::endl;
6758 tout(cct) << mode << std::endl;
6759 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6760
6761 if (unmounting)
6762 return -ENOTCONN;
6763
6764 if (std::string(relpath) == "/")
6765 return -EEXIST;
6766
6767 filepath path(relpath);
6768 string name = path.last_dentry();
6769 path.pop_dentry();
6770 InodeRef dir;
6771 int r = path_walk(path, &dir, perm);
6772 if (r < 0)
6773 return r;
6774 if (cct->_conf->client_permissions) {
6775 r = may_create(dir.get(), perm);
6776 if (r < 0)
6777 return r;
6778 }
6779 return _mkdir(dir.get(), name.c_str(), mode, perm);
6780 }
6781
// Create every missing directory along `relpath` (like `mkdir -p`).
// Phase 1 walks the already-existing prefix component by component;
// phase 2 creates each remaining component with _mkdir(), tolerating a
// racing creator for intermediate components.
// NOTE(review): the walk starts at cwd in both phases -- absolute paths
// appear to rely on filepath's component handling; confirm against callers.
//
// @return 0 on success (also when the full path already exists), or a
//         negative errno from lookup/permission/mkdir
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // any error other than "component missing" (including success for the
  // whole path) ends the call here
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing creator made this intermediate dir: look it up and continue
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6836
6837 int Client::rmdir(const char *relpath, const UserPerm& perms)
6838 {
6839 std::lock_guard lock(client_lock);
6840 tout(cct) << __func__ << std::endl;
6841 tout(cct) << relpath << std::endl;
6842
6843 if (unmounting)
6844 return -ENOTCONN;
6845
6846 if (std::string(relpath) == "/")
6847 return -EBUSY;
6848
6849 filepath path(relpath);
6850 string name = path.last_dentry();
6851 path.pop_dentry();
6852 InodeRef dir;
6853 int r = path_walk(path, &dir, perms);
6854 if (r < 0)
6855 return r;
6856 if (cct->_conf->client_permissions) {
6857 int r = may_delete(dir.get(), name.c_str(), perms);
6858 if (r < 0)
6859 return r;
6860 }
6861 return _rmdir(dir.get(), name.c_str(), perms);
6862 }
6863
6864 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6865 {
6866 std::lock_guard lock(client_lock);
6867 tout(cct) << __func__ << std::endl;
6868 tout(cct) << relpath << std::endl;
6869 tout(cct) << mode << std::endl;
6870 tout(cct) << rdev << std::endl;
6871
6872 if (unmounting)
6873 return -ENOTCONN;
6874
6875 if (std::string(relpath) == "/")
6876 return -EEXIST;
6877
6878 filepath path(relpath);
6879 string name = path.last_dentry();
6880 path.pop_dentry();
6881 InodeRef dir;
6882 int r = path_walk(path, &dir, perms);
6883 if (r < 0)
6884 return r;
6885 if (cct->_conf->client_permissions) {
6886 int r = may_create(dir.get(), perms);
6887 if (r < 0)
6888 return r;
6889 }
6890 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6891 }
6892
6893 // symlinks
6894
6895 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6896 {
6897 std::lock_guard lock(client_lock);
6898 tout(cct) << __func__ << std::endl;
6899 tout(cct) << target << std::endl;
6900 tout(cct) << relpath << std::endl;
6901
6902 if (unmounting)
6903 return -ENOTCONN;
6904
6905 if (std::string(relpath) == "/")
6906 return -EEXIST;
6907
6908 filepath path(relpath);
6909 string name = path.last_dentry();
6910 path.pop_dentry();
6911 InodeRef dir;
6912 int r = path_walk(path, &dir, perms);
6913 if (r < 0)
6914 return r;
6915 if (cct->_conf->client_permissions) {
6916 int r = may_create(dir.get(), perms);
6917 if (r < 0)
6918 return r;
6919 }
6920 return _symlink(dir.get(), name.c_str(), target, perms);
6921 }
6922
6923 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6924 {
6925 std::lock_guard lock(client_lock);
6926 tout(cct) << __func__ << std::endl;
6927 tout(cct) << relpath << std::endl;
6928
6929 if (unmounting)
6930 return -ENOTCONN;
6931
6932 filepath path(relpath);
6933 InodeRef in;
6934 int r = path_walk(path, &in, perms, false);
6935 if (r < 0)
6936 return r;
6937
6938 return _readlink(in.get(), buf, size);
6939 }
6940
6941 int Client::_readlink(Inode *in, char *buf, size_t size)
6942 {
6943 if (!in->is_symlink())
6944 return -EINVAL;
6945
6946 // copy into buf (at most size bytes)
6947 int r = in->symlink.length();
6948 if (r > (int)size)
6949 r = size;
6950 memcpy(buf, in->symlink.c_str(), r);
6951 return r;
6952 }
6953
6954
6955 // inode stuff
6956
6957 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6958 {
6959 bool yes = in->caps_issued_mask(mask, true);
6960
6961 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6962 if (yes && !force)
6963 return 0;
6964
6965 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6966 filepath path;
6967 in->make_nosnap_relative_path(path);
6968 req->set_filepath(path);
6969 req->set_inode(in);
6970 req->head.args.getattr.mask = mask;
6971
6972 int res = make_request(req, perms);
6973 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6974 return res;
6975 }
6976
// Apply the attribute changes in `stx` selected by `mask` to `in`.
// Changes covered by locally-held exclusive caps (Ax for ownership/mode/
// btime, Fx for mtime/atime) are applied in the client cache and the caps
// marked dirty; whatever remains in `mask` afterwards is sent to the MDS
// as a SETATTR request.  A sync request is also forced when the caller's
// uid/gid differs from the current cap dirtier, so each update carries its
// own credentials.
//
// @param inp  optional out: resulting inode from the MDS reply
// @return 0 on success; -EROFS for snapshot inodes, -EDQUOT when a size
//         increase would exceed quota, -EFBIG for an oversized truncate,
//         or an MDS error
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must fit within the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; with none, fall through to a
    // sync CTIME request
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with Ax we can clear setuid/setgid bits ourselves
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with Fx, timestamps can be updated locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // everything handled locally; no MDS round trip needed
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // build a SETATTR request carrying whatever could not be done locally
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // inode_drop releases the listed caps when the request is sent, so other
  // clients see the new attributes
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7167
7168 /* Note that we only care about attrs that setattr cares about */
7169 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7170 {
7171 stx->stx_size = st->st_size;
7172 stx->stx_mode = st->st_mode;
7173 stx->stx_uid = st->st_uid;
7174 stx->stx_gid = st->st_gid;
7175 #ifdef __APPLE__
7176 stx->stx_mtime = st->st_mtimespec;
7177 stx->stx_atime = st->st_atimespec;
7178 #else
7179 stx->stx_mtime = st->st_mtim;
7180 stx->stx_atime = st->st_atim;
7181 #endif
7182 }
7183
7184 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7185 const UserPerm& perms, InodeRef *inp)
7186 {
7187 int ret = _do_setattr(in, stx, mask, perms, inp);
7188 if (ret < 0)
7189 return ret;
7190 if (mask & CEPH_SETATTR_MODE)
7191 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7192 return ret;
7193 }
7194
7195 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7196 const UserPerm& perms)
7197 {
7198 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7199 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7200 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7201 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7202 if (cct->_conf->client_permissions) {
7203 int r = may_setattr(in.get(), stx, mask, perms);
7204 if (r < 0)
7205 return r;
7206 }
7207 return __setattrx(in.get(), stx, mask, perms);
7208 }
7209
7210 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7211 const UserPerm& perms)
7212 {
7213 struct ceph_statx stx;
7214
7215 stat_to_statx(attr, &stx);
7216 mask &= ~CEPH_SETATTR_BTIME;
7217
7218 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7219 mask &= ~CEPH_SETATTR_UID;
7220 }
7221 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7222 mask &= ~CEPH_SETATTR_GID;
7223 }
7224
7225 return _setattrx(in, &stx, mask, perms);
7226 }
7227
7228 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7229 const UserPerm& perms)
7230 {
7231 std::lock_guard lock(client_lock);
7232 tout(cct) << __func__ << std::endl;
7233 tout(cct) << relpath << std::endl;
7234 tout(cct) << mask << std::endl;
7235
7236 if (unmounting)
7237 return -ENOTCONN;
7238
7239 filepath path(relpath);
7240 InodeRef in;
7241 int r = path_walk(path, &in, perms);
7242 if (r < 0)
7243 return r;
7244 return _setattr(in, attr, mask, perms);
7245 }
7246
7247 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7248 const UserPerm& perms, int flags)
7249 {
7250 std::lock_guard lock(client_lock);
7251 tout(cct) << __func__ << std::endl;
7252 tout(cct) << relpath << std::endl;
7253 tout(cct) << mask << std::endl;
7254
7255 if (unmounting)
7256 return -ENOTCONN;
7257
7258 filepath path(relpath);
7259 InodeRef in;
7260 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7261 if (r < 0)
7262 return r;
7263 return _setattrx(in, stx, mask, perms);
7264 }
7265
7266 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7267 {
7268 std::lock_guard lock(client_lock);
7269 tout(cct) << __func__ << std::endl;
7270 tout(cct) << fd << std::endl;
7271 tout(cct) << mask << std::endl;
7272
7273 if (unmounting)
7274 return -ENOTCONN;
7275
7276 Fh *f = get_filehandle(fd);
7277 if (!f)
7278 return -EBADF;
7279 #if defined(__linux__) && defined(O_PATH)
7280 if (f->flags & O_PATH)
7281 return -EBADF;
7282 #endif
7283 return _setattr(f->inode, attr, mask, perms);
7284 }
7285
// ceph_statx flavour of fsetattr(); same fd validation rules
// (unknown fd or O_PATH fd -> -EBADF).
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
7305
// POSIX stat(): resolve relpath (following symlinks), refresh the inode
// via _getattr() with the requested cap `mask`, then fill *stbuf from
// the cached inode state. Optionally also returns the directory's
// frag_info in *dirstat.
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7331
7332 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7333 {
7334 unsigned mask = 0;
7335
7336 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7337 if (flags & AT_NO_ATTR_SYNC)
7338 goto out;
7339
7340 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7341 mask |= CEPH_CAP_PIN;
7342 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7343 mask |= CEPH_CAP_AUTH_SHARED;
7344 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7345 mask |= CEPH_CAP_LINK_SHARED;
7346 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7347 mask |= CEPH_CAP_FILE_SHARED;
7348 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7349 mask |= CEPH_CAP_XATTR_SHARED;
7350 out:
7351 return mask;
7352 }
7353
// statx(): like stat() but with fine-grained field selection. `want` is
// a set of CEPH_STATX_* bits, converted by statx_to_mask() into the cap
// mask to refresh; `flags` may carry AT_SYMLINK_NOFOLLOW and
// AT_NO_ATTR_SYNC. stx->stx_mask reports which fields were filled.
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
7385
// POSIX lstat(): identical to stat() except the final path component is
// not dereferenced if it is a symlink (path_walk with followsym=false).
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7412
// Fill a struct stat from the client's cached inode state. Issues no
// MDS requests; callers refresh the inode first (e.g. _getattr()) to
// whatever freshness they need. Optionally copies out the directory
// frag/nest stats. Returns the caps currently issued on the inode.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;  // stable fake ino for NFS-style reexport
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;       // snapid doubles as the device number
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directories report a synthesized link count; a dir can only ever
    // have zero links (unlinked) or one (its parent dentry).
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();  // nlink > 1 on a dir is an invariant violation
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report the later of ctime/mtime as ctime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Dir "size" is either the recursive byte count or the entry count,
    // per the client_dirsize_rbytes config option.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7474
// Fill a ceph_statx from cached inode state. `mask` is the CEPH_CAP_*
// mask of caps known valid; only fields covered by valid caps are
// filled, and stx->stx_mask records which CEPH_STATX_* fields were set.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;  // full mode, not just the type bits
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // Same synthesized dir link count as fill_stat(): dirs may only
      // have nlink 0 (unlinked) or 1 (linked from parent).
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Dir size: recursive bytes or entry count, per config.
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report the later of ctime/mtime as ctime.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7557
// Mark a dentry as recently used in the client's dentry LRU.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7562
// POSIX chmod(): resolve relpath (following symlinks) and update the
// mode. Only attr.st_mode is initialized; the CEPH_SETATTR_MODE mask
// tells _setattr() to ignore all other fields.
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7582
// POSIX fchmod(): chmod via an open fd. -EBADF for an unknown fd or
// (on Linux) one opened with O_PATH.
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
7604
// lchmod(): like chmod() but does not dereference a final symlink.
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7625
// POSIX chown(): resolve relpath (following symlinks) and set owner
// and/or group. Both CEPH_SETATTR_UID|GID are passed; uid/gid values of
// (uid_t)-1 / (gid_t)-1 are masked out inside _setattr() (see the -1
// checks in the _setattr wrapper above), matching chown(2) semantics.
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
7648
// POSIX fchown(): chown via an open fd. A uid/gid of -1 means "leave
// unchanged" and is excluded from the setattr mask here explicitly.
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
7675
// lchown(): like chown() but does not dereference a final symlink.
// As in fchown(), a uid/gid of -1 means "leave unchanged".
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
7702
// Helper: store atime and mtime into a struct stat through the
// portable stat_set_* accessors.
static void attr_set_atime_and_mtime(struct stat *attr,
				     const utime_t &atime,
				     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7712
7713 // for [l]utime() invoke the timeval variant as the timespec
7714 // variant are not yet implemented. for futime[s](), invoke
7715 // the timespec variant.
7716 int Client::utime(const char *relpath, struct utimbuf *buf,
7717 const UserPerm& perms)
7718 {
7719 struct timeval tv[2];
7720 tv[0].tv_sec = buf->actime;
7721 tv[0].tv_usec = 0;
7722 tv[1].tv_sec = buf->modtime;
7723 tv[1].tv_usec = 0;
7724
7725 return utimes(relpath, tv, perms);
7726 }
7727
7728 int Client::lutime(const char *relpath, struct utimbuf *buf,
7729 const UserPerm& perms)
7730 {
7731 struct timeval tv[2];
7732 tv[0].tv_sec = buf->actime;
7733 tv[0].tv_usec = 0;
7734 tv[1].tv_sec = buf->modtime;
7735 tv[1].tv_usec = 0;
7736
7737 return lutimes(relpath, tv, perms);
7738 }
7739
7740 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7741 {
7742 struct timespec ts[2];
7743 ts[0].tv_sec = buf->actime;
7744 ts[0].tv_nsec = 0;
7745 ts[1].tv_sec = buf->modtime;
7746 ts[1].tv_nsec = 0;
7747
7748 return futimens(fd, ts, perms);
7749 }
7750
// POSIX utimes(): set atime/mtime on the inode at relpath, following
// symlinks. times[0] is atime, times[1] is mtime.
int Client::utimes(const char *relpath, struct timeval times[2],
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7777
// lutimes(): like utimes() but does not dereference a final symlink.
int Client::lutimes(const char *relpath, struct timeval times[2],
		    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, false);  // don't follow symlinks
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7804
7805 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7806 {
7807 struct timespec ts[2];
7808 ts[0].tv_sec = times[0].tv_sec;
7809 ts[0].tv_nsec = times[0].tv_usec * 1000;
7810 ts[1].tv_sec = times[1].tv_sec;
7811 ts[1].tv_nsec = times[1].tv_usec * 1000;
7812
7813 return futimens(fd, ts, perms);
7814 }
7815
// futimens(): set atime/mtime (nanosecond precision) through an open
// fd. -EBADF for an unknown fd or (on Linux) one opened with O_PATH.
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7843
// flock(): BSD-style advisory lock on an open fd. `owner` identifies
// the lock owner passed through to _flock().
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7861
// opendir(): resolve relpath (following symlinks), optionally enforce
// read permission (when client_permissions is set), and allocate a
// dir_result_t in *dirpp via _opendir().
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7887
// Allocate a new dir_result_t for `in` and register it in opened_dirs.
// Returns -ENOTDIR (leaving *dirpp untouched) if `in` is not a
// directory; 0 on success.
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
7897
7898
// closedir(): destroy an open dir handle. Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7909
// Release everything held by a dir_result_t: drop the inode reference,
// free the readdir buffer, unregister from opened_dirs, and delete it.
// `dirp` is invalid after this call.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7921
// rewinddir(): drop the buffered readdir state and reset the stream to
// the beginning. No-op while unmounting.
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7934
// telldir(): return the current readdir stream position, suitable for a
// later seekdir(). (Reads only the offset; no lock is taken here.)
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7941
// seekdir(): reposition the readdir stream to `offset` (a value from
// telldir()). Invalidates the buffered frag when the target cannot be
// served from the current buffer, and disables readdir-cache fill
// bookkeeping as appropriate for the seek direction.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered streams can only keep the buffer on forward seeks
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered: drop the buffer on rewind, on a different frag, or
    // when seeking backwards within the buffered frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7975
7976
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Fill a struct dirent; names longer than 255 bytes are silently
// truncated. d_off is only available on platforms that have it.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';  // ensure NUL termination even when truncated
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);  // convert S_IF* mode bits to DT_* type
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7999
// Advance the readdir stream past the currently buffered frag. If this
// was the rightmost frag we're done (mark end-of-dir); otherwise move
// the stream offset to the start of the next frag.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);  // our frag may have been split/merged
  }
}
8025
// Re-map the stream's current frag through the inode's dirfragtree in
// case the directory was fragmented differently since we last looked;
// restart that frag from the beginning if the mapping changed.
// No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8042
// Discard the buffered dirents for this stream; the next read will
// re-fetch from the MDS (or the readdir cache).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8048
// Fetch one frag worth of dirents from the MDS into dirp->buffer via a
// READDIR (or LSSNAP for the snapdir) request. On -EAGAIN the dirfrag
// mapping is re-chosen and the fetch retried (tail recursion). On any
// other error the stream is marked at-end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue this frag after the last entry we returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // frag mapping changed under us; re-choose and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
	     << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8103
// Comparator for std::lower_bound over a readdir cache: orders cached
// dentries by their readdir file position.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8109
// Serve readdir entries from the client-side dentry cache, invoking
// `cb` once per entry. Requires client_lock held; the lock is DROPPED
// around each callback invocation, so cached state is revalidated after
// every _getattr() and every callback. Returns 0 at end of directory,
// -EAGAIN if the cache became unusable mid-iteration (caller falls back
// to fetching from the MDS), a negative error, or a positive value if
// the callback asked to stop early. If `getref` is set, each inode
// passed to the callback carries an extra ll ref the callback owns.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first entry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;   // cache invalidated; fall back to MDS readdir
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
	     << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;   // callback requested early stop
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8199
// Core readdir driver. Synthesizes "." and ".." (stream offsets 0 and
// 1), then serves entries either from the client dentry cache
// (_readdir_cache_cb) or by fetching frags from the MDS
// (_readdir_get_frag), invoking `cb` for each entry. client_lock is
// dropped around every callback invocation. Returns 0 at end of
// directory, a negative error, or a positive value when the callback
// asks to stop early. When the rightmost frag completes without the
// directory changing underneath us, the inode is marked I_COMPLETE
// (and I_DIR_ORDERED when ordering also held) so future readdirs can
// be served from cache.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    // synthesize "."
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    // synthesize ".." (the dir itself if it has no parent dentry)
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable; fall through to MDS readdir
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;  // freshly fetched; attrs are already current
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Finished the rightmost frag. If the directory didn't change while
    // we iterated, mark the cache complete (and ordered if possible).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached; loop always returns
  return 0;
}
8392
8393
8394 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8395 {
8396 return readdirplus_r(d, de, 0, 0, 0, NULL);
8397 }
8398
8399 /*
8400 * readdirplus_r
8401 *
8402 * returns
8403 * 1 if we got a dirent
8404 * 0 for end of directory
8405 * <0 on error
8406 */
8407
// Context for _readdir_single_dirent_cb(): captures exactly one directory
// entry out of a readdir_r_cb() iteration.
struct single_readdir {
  struct dirent *de;       // caller-owned dirent to fill in
  struct ceph_statx *stx;  // optional statx to fill in; may be NULL
  Inode *inode;            // inode of the captured entry (set by the callback)
  bool full;               // true once one entry has been captured
};
8414
8415 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8416 struct ceph_statx *stx, off_t off,
8417 Inode *in)
8418 {
8419 single_readdir *c = static_cast<single_readdir *>(p);
8420
8421 if (c->full)
8422 return -1; // already filled this dirent
8423
8424 *c->de = *de;
8425 if (c->stx)
8426 *c->stx = *stx;
8427 c->inode = in;
8428 c->full = true;
8429 return 1;
8430 }
8431
8432 struct dirent *Client::readdir(dir_result_t *d)
8433 {
8434 int ret;
8435 static struct dirent de;
8436 single_readdir sr;
8437 sr.de = &de;
8438 sr.stx = NULL;
8439 sr.inode = NULL;
8440 sr.full = false;
8441
8442 // our callback fills the dirent and sets sr.full=true on first
8443 // call, and returns -1 the second time around.
8444 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8445 if (ret < -1) {
8446 errno = -ret; // this sucks.
8447 return (dirent *) NULL;
8448 }
8449 if (sr.full) {
8450 return &de;
8451 }
8452 return (dirent *) NULL;
8453 }
8454
8455 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8456 struct ceph_statx *stx, unsigned want,
8457 unsigned flags, Inode **out)
8458 {
8459 single_readdir sr;
8460 sr.de = de;
8461 sr.stx = stx;
8462 sr.inode = NULL;
8463 sr.full = false;
8464
8465 // our callback fills the dirent and sets sr.full=true on first
8466 // call, and returns -1 the second time around.
8467 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8468 if (r < -1)
8469 return r;
8470 if (out)
8471 *out = sr.inode;
8472 if (sr.full)
8473 return 1;
8474 return 0;
8475 }
8476
8477
8478 /* getdents */
// Context for _readdir_getdent_cb(): packs directory entries into a flat
// caller-supplied buffer, either as whole dirents or as bare names.
struct getdents_result {
  char *buf;     // output buffer provided by the caller
  int buflen;    // total capacity of buf in bytes
  int pos;       // bytes written into buf so far
  bool fullent;  // true: pack whole struct dirent; false: pack NUL-terminated names
};
8485
8486 static int _readdir_getdent_cb(void *p, struct dirent *de,
8487 struct ceph_statx *stx, off_t off, Inode *in)
8488 {
8489 struct getdents_result *c = static_cast<getdents_result *>(p);
8490
8491 int dlen;
8492 if (c->fullent)
8493 dlen = sizeof(*de);
8494 else
8495 dlen = strlen(de->d_name) + 1;
8496
8497 if (c->pos + dlen > c->buflen)
8498 return -1; // doesn't fit
8499
8500 if (c->fullent) {
8501 memcpy(c->buf + c->pos, de, sizeof(*de));
8502 } else {
8503 memcpy(c->buf + c->pos, de->d_name, dlen);
8504 }
8505 c->pos += dlen;
8506 return 0;
8507 }
8508
8509 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8510 {
8511 getdents_result gr;
8512 gr.buf = buf;
8513 gr.buflen = buflen;
8514 gr.fullent = fullent;
8515 gr.pos = 0;
8516
8517 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8518
8519 if (r < 0) { // some error
8520 if (r == -1) { // buffer ran out of space
8521 if (gr.pos) { // but we got some entries already!
8522 return gr.pos;
8523 } // or we need a larger buffer
8524 return -ERANGE;
8525 } else { // actual error, return it
8526 return r;
8527 }
8528 }
8529 return gr.pos;
8530 }
8531
8532
8533 /* getdir */
// Context for _getdir_cb(): collects entry names for Client::getdir().
struct getdir_result {
  list<string> *contents;  // accumulates entry names in readdir order
  int num;                 // number of entries appended
};
8538
8539 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8540 {
8541 getdir_result *r = static_cast<getdir_result *>(p);
8542
8543 r->contents->push_back(de->d_name);
8544 r->num++;
8545 return 0;
8546 }
8547
8548 int Client::getdir(const char *relpath, list<string>& contents,
8549 const UserPerm& perms)
8550 {
8551 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8552 {
8553 std::lock_guard lock(client_lock);
8554 tout(cct) << "getdir" << std::endl;
8555 tout(cct) << relpath << std::endl;
8556 }
8557
8558 dir_result_t *d;
8559 int r = opendir(relpath, &d, perms);
8560 if (r < 0)
8561 return r;
8562
8563 getdir_result gr;
8564 gr.contents = &contents;
8565 gr.num = 0;
8566 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8567
8568 closedir(d);
8569
8570 if (r < 0)
8571 return r;
8572 return gr.num;
8573 }
8574
8575
8576 /****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  // Open (and optionally create) the file at relpath with POSIX open-flag
  // semantics (O_CREAT/O_EXCL/O_NOFOLLOW/O_PATH/...).  On success returns a
  // client-local integer file descriptor; on failure a negative errno.  The
  // striping parameters are only applied when a new file is created.
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // O_CREAT|O_EXCL demands the path did not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // refuse to open a symlink itself with O_NOFOLLOW (unless O_PATH allows it)
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: walk to the parent directory,
    // check create permission, then create the child
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8661
8662 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8663 {
8664 /* Use default file striping parameters */
8665 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8666 }
8667
8668 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8669 const UserPerm& perms)
8670 {
8671 std::lock_guard lock(client_lock);
8672 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8673
8674 if (unmounting)
8675 return -ENOTCONN;
8676
8677 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8678 filepath path(ino);
8679 req->set_filepath(path);
8680
8681 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8682 char f[30];
8683 sprintf(f, "%u", h);
8684 filepath path2(dirino);
8685 path2.push_dentry(string(f));
8686 req->set_filepath2(path2);
8687
8688 int r = make_request(req, perms, NULL, NULL,
8689 rand() % mdsmap->get_num_in_mds());
8690 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8691 return r;
8692 }
8693
8694
8695 /**
8696 * Load inode into local cache.
8697 *
8698 * If inode pointer is non-NULL, and take a reference on
8699 * the resulting Inode object in one operation, so that caller
8700 * can safely assume inode will still be there after return.
8701 */
8702 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8703 {
8704 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8705
8706 if (unmounting)
8707 return -ENOTCONN;
8708
8709 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8710 filepath path(ino);
8711 req->set_filepath(path);
8712
8713 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8714 if (r == 0 && inode != NULL) {
8715 vinodeno_t vino(ino, CEPH_NOSNAP);
8716 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8717 ceph_assert(p != inode_map.end());
8718 *inode = p->second;
8719 _ll_get(*inode);
8720 }
8721 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8722 return r;
8723 }
8724
// Public, locked entry point for _lookup_ino(); see the comment above.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8730
8731 /**
8732 * Find the parent inode of `ino` and insert it into
8733 * our cache. Conditionally also set `parent` to a referenced
8734 * Inode* if caller provides non-NULL value.
8735 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  // Issue a LOOKUPPARENT so the MDS resolves (and we cache) the parent of
  // `ino`.  On success, *parent (if provided) receives a referenced inode.
  // NOTE(review): unlike _lookup_ino()/_lookup_name(), this helper has no
  // `unmounting` -> -ENOTCONN guard -- confirm whether that is intentional
  // (e.g. it is needed on unmount-time paths) or an oversight.
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      // hand the caller a referenced parent inode
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8759
8760 /**
8761 * Populate the parent dentry for `ino`, provided it is
8762 * a child of `parent`.
8763 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  // Issue a LOOKUPNAME so the MDS tells us the dentry linking `ino` into
  // directory `parent`; the reply populates that dentry locally.
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any active MDS can answer; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8781
// Public, locked entry point for _lookup_name(); see the comment above.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8787
8788 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8789 {
8790 ceph_assert(in);
8791 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
8792
8793 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8794
8795 if (in->snapid != CEPH_NOSNAP) {
8796 in->snap_cap_refs++;
8797 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8798 << ccap_string(in->caps_issued()) << dendl;
8799 }
8800
8801 const auto& conf = cct->_conf;
8802 f->readahead.set_trigger_requests(1);
8803 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8804 uint64_t max_readahead = Readahead::NO_LIMIT;
8805 if (conf->client_readahead_max_bytes) {
8806 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8807 }
8808 if (conf->client_readahead_max_periods) {
8809 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8810 }
8811 f->readahead.set_max_readahead_size(max_readahead);
8812 vector<uint64_t> alignments;
8813 alignments.push_back(in->layout.get_period());
8814 alignments.push_back(in->layout.stripe_unit);
8815 f->readahead.set_alignments(alignments);
8816
8817 return f;
8818 }
8819
int Client::_release_fh(Fh *f)
{
  // Tear down an open-file handle: drop the open ref / snap counter on the
  // inode, release file locks held through the handle, and surface any
  // async write error that accumulated on it.  Returns 0 or a negative
  // errno (the deferred async error, if any).
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // this handle can no longer hold a delegation
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens with snap_cap_refs (see _create_fh)
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8854
8855 void Client::_put_fh(Fh *f)
8856 {
8857 int left = f->put();
8858 if (!left) {
8859 delete f;
8860 }
8861 }
8862
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // Core open path: take an open ref in the computed cap mode, talk to the
  // MDS if we do not already hold the caps that mode wants, and hand back a
  // new Fh through *fhp on success.  Returns 0 or a negative errno.
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    // snapshots are read-only
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs;
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    // otherwise send an explicit OPEN to the MDS

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary Fh used only to drive get_caps(); never installed anywhere
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
	// NOTE(review): put_open_ref is called here AND again in the common
	// failure branch below when result < 0 -- verify this is not a
	// double decrement of the open ref.
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken at the top
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8944
int Client::_renew_caps(Inode *in)
{
  // Re-acquire file caps for `in`.  If we still hold some caps and either
  // want no write caps or still have an auth cap, a plain cap update is
  // enough; otherwise replay an OPEN request matching the currently wanted
  // modes.
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted cap bits back into open flags for the OPEN replay
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8982
8983 int Client::close(int fd)
8984 {
8985 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8986 std::lock_guard lock(client_lock);
8987 tout(cct) << "close" << std::endl;
8988 tout(cct) << fd << std::endl;
8989
8990 if (unmounting)
8991 return -ENOTCONN;
8992
8993 Fh *fh = get_filehandle(fd);
8994 if (!fh)
8995 return -EBADF;
8996 int err = _release_fh(fh);
8997 fd_map.erase(fd);
8998 put_fd(fd);
8999 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9000 return err;
9001 }
9002
9003
9004 // ------------
9005 // read, write
9006
9007 loff_t Client::lseek(int fd, loff_t offset, int whence)
9008 {
9009 std::lock_guard lock(client_lock);
9010 tout(cct) << "lseek" << std::endl;
9011 tout(cct) << fd << std::endl;
9012 tout(cct) << offset << std::endl;
9013 tout(cct) << whence << std::endl;
9014
9015 if (unmounting)
9016 return -ENOTCONN;
9017
9018 Fh *f = get_filehandle(fd);
9019 if (!f)
9020 return -EBADF;
9021 #if defined(__linux__) && defined(O_PATH)
9022 if (f->flags & O_PATH)
9023 return -EBADF;
9024 #endif
9025 return _lseek(f, offset, whence);
9026 }
9027
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  // Compute and store the new file position for `f`.  Whence values that
  // depend on the current file size (SEEK_END, SEEK_DATA, SEEK_HOLE)
  // revalidate the size with a getattr first.  Returns the new position or
  // a negative errno.
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // refresh in->size before the size-relative computations below
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // no sparseness information here: data is reported everywhere in-file
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    // ...and correspondingly the only reported hole is at EOF
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  if (pos < 0) {
    // e.g. SEEK_CUR/SEEK_END that landed before the start of the file
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9101
9102
void Client::lock_fh_pos(Fh *f)
{
  // Serialize access to f->pos for I/O paths that implicitly use/advance
  // the file position.  Waiters queue FIFO on per-waiter condition
  // variables; unlock_fh_pos() releases the lock.  client_lock must be
  // held on entry and is held again on return.
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // wait on client_lock without permanently transferring its ownership
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // proceed only when the lock is free AND we are first in line
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9123
9124 void Client::unlock_fh_pos(Fh *f)
9125 {
9126 ldout(cct, 10) << __func__ << " " << f << dendl;
9127 f->pos_locked = false;
9128 }
9129
int Client::uninline_data(Inode *in, Context *onfinish)
{
  // Migrate MDS-inlined file data out to the first RADOS object of the
  // file.  Completion (or failure) is reported through `onfinish`.
  if (!in->inline_data.length()) {
    // nothing is inlined; report success immediately
    onfinish->complete(0);
    return 0;
  }

  // the first object of a file is named "<ino in hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // step 1: make sure the object exists (non-exclusive create)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // step 2: write the inline payload, guarded by a cmpxattr on
  // "inline_version" so a stale writer cannot clobber newer object data
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9174
9175 //
9176
9177 // blocking osd interface
9178
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  // POSIX-style pread into a flat buffer.  Returns the number of bytes
  // read, or a negative errno.
  std::unique_lock lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no data access
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out without holding client_lock; bl is local, so this is safe
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9209
9210 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9211 {
9212 if (iovcnt < 0)
9213 return -EINVAL;
9214 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9215 }
9216
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  // Core read path.  offset < 0 means "use and advance f->pos" (the fh
  // position lock is held for the duration).  Handles inline data, cached
  // (ObjectCacher) reads, and sync reads, retrying once when a short read
  // races with a size change.  Returns bytes read or a negative errno.
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // implicit-position read: serialize on the fh position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to read
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  // take a read cap ref; CACHE (plus LAZYIO for lazy handles) is optional
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the cache entirely
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // cannot serve inline data without the cache cap; kick off
      // uninlining and fall through to a normal object read
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read directly from the inline blob, zero-filling the
      // gap between the end of the blob and EOF
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // cached read through the ObjectCacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // sync read straight from the OSDs
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read; revalidate the size before concluding it was EOF
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait for the uninline op started above (client_lock dropped meanwhile)
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9356
// Completion context for background readahead issued by _read_async().
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  // pin the Fh and account one in-flight readahead on it
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  // undo the pinning done in the constructor
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // drop the cap refs taken when the readahead was issued (_read_async)
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9372
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  // Read through the ObjectCacher, blocking on a cache miss, and
  // opportunistically issue readahead for the region that follows.
  // Returns bytes read or a negative errno.
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the fill, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // fire-and-forget readahead; C_Readahead::finish drops the cap refs
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// everything was already cached; the context will never fire
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9427
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  // Synchronous read path: issue Filer reads directly against the OSDs
  // until the request is satisfied.  Short reads inside the known file
  // size are zero-filled (sparse objects); a short read at the tail sets
  // *checkeof so the caller can revalidate the size.  Returns bytes read
  // (including zero-fill) or a negative errno.
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // block for the OSD reply without holding client_lock
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // apparent EOF; let the caller double-check the file size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9487
9488
9489 /*
9490 * we keep count of uncommitted sync writes on the inode, so that
9491 * fsync can DDRT.
9492 */
void Client::_sync_write_commit(Inode *in)
{
  // Completion bookkeeping for a sync (uncached) write: decrement the
  // outstanding counter, drop the buffer cap ref, and let a pending
  // unmount proceed once the last unsafe write has committed.
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9506
9507 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9508 {
9509 std::lock_guard lock(client_lock);
9510 tout(cct) << "write" << std::endl;
9511 tout(cct) << fd << std::endl;
9512 tout(cct) << size << std::endl;
9513 tout(cct) << offset << std::endl;
9514
9515 if (unmounting)
9516 return -ENOTCONN;
9517
9518 Fh *fh = get_filehandle(fd);
9519 if (!fh)
9520 return -EBADF;
9521 #if defined(__linux__) && defined(O_PATH)
9522 if (fh->flags & O_PATH)
9523 return -EBADF;
9524 #endif
9525 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9526 size = std::min(size, (loff_t)INT_MAX);
9527 int r = _write(fh, offset, size, buf, NULL, false);
9528 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9529 return r;
9530 }
9531
9532 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9533 {
9534 if (iovcnt < 0)
9535 return -EINVAL;
9536 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9537 }
9538
// Common implementation for preadv/pwritev: compute the total transfer
// length from the iovec array, then dispatch to _write() (gather) or
// _read() (scatter back into the iovecs). Caller must hold client_lock.
// Returns bytes transferred or -errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot perform I/O
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write gathers directly from the iovecs (buf == NULL)
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the data read into the caller's iovecs; `resid` tracks how
    // many bytes remain in the bufferlist (a short read may not fill every
    // iovec).
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}
9585
9586 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9587 {
9588 std::lock_guard lock(client_lock);
9589 tout(cct) << fd << std::endl;
9590 tout(cct) << offset << std::endl;
9591
9592 if (unmounting)
9593 return -ENOTCONN;
9594
9595 Fh *fh = get_filehandle(fd);
9596 if (!fh)
9597 return -EBADF;
9598 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9599 }
9600
// Core write path. Writes `size` bytes at `offset` into the file behind
// `f`; data comes from `buf`, or is gathered from `iov`/`iovcnt` when buf
// is NULL. A negative offset means "use (and advance) the fd position",
// honoring O_APPEND. Returns bytes written (== size on success) or -errno.
// Called with client_lock held; the lock is dropped while blocking on
// synchronous OSD writes and on inline-data migration.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;   // new fd position to install on success (0 = none)

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  // snapshots are read-only
  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // make sure we know whether this file carries inline data before writing
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // acquire write caps (may block); `have` tells us whether buffering is allowed
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the object cache entirely
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // write would exceed the inline limits: migrate the inline data out
      // to RADOS first; completion is awaited at `done`
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // small write applied directly to the inline data held in the inode
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    // O_DIRECT must not leave stale cached data overlapping this range
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // drop client_lock while blocking on the OSD round trip
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // install the advanced fd position computed above (pwrite-style callers
  // leave fpos == 0)
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // wait for the inline-data migration started above to finish
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    // -ECANCELED means someone else already uninlined; treat as success
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9833
9834 int Client::_flush(Fh *f)
9835 {
9836 Inode *in = f->inode.get();
9837 int err = f->take_async_err();
9838 if (err != 0) {
9839 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9840 << cpp_strerror(err) << dendl;
9841 } else {
9842 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9843 }
9844
9845 return err;
9846 }
9847
9848 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9849 {
9850 struct ceph_statx stx;
9851 stx.stx_size = length;
9852 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9853 }
9854
9855 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9856 {
9857 std::lock_guard lock(client_lock);
9858 tout(cct) << __func__ << std::endl;
9859 tout(cct) << fd << std::endl;
9860 tout(cct) << length << std::endl;
9861
9862 if (unmounting)
9863 return -ENOTCONN;
9864
9865 Fh *f = get_filehandle(fd);
9866 if (!f)
9867 return -EBADF;
9868 #if defined(__linux__) && defined(O_PATH)
9869 if (f->flags & O_PATH)
9870 return -EBADF;
9871 #endif
9872 struct stat attr;
9873 attr.st_size = length;
9874 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9875 }
9876
9877 int Client::fsync(int fd, bool syncdataonly)
9878 {
9879 std::lock_guard lock(client_lock);
9880 tout(cct) << "fsync" << std::endl;
9881 tout(cct) << fd << std::endl;
9882 tout(cct) << syncdataonly << std::endl;
9883
9884 if (unmounting)
9885 return -ENOTCONN;
9886
9887 Fh *f = get_filehandle(fd);
9888 if (!f)
9889 return -EBADF;
9890 #if defined(__linux__) && defined(O_PATH)
9891 if (f->flags & O_PATH)
9892 return -EBADF;
9893 #endif
9894 int r = _fsync(f, syncdataonly);
9895 if (r == 0) {
9896 // The IOs in this fsync were okay, but maybe something happened
9897 // in the background that we shoudl be reporting?
9898 r = f->take_async_err();
9899 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9900 << ") = 0, async_err = " << r << dendl;
9901 } else {
9902 // Assume that an error we encountered during fsync, even reported
9903 // synchronously, would also have applied the error to the Fh, and we
9904 // should clear it here to avoid returning the same error again on next
9905 // call.
9906 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9907 << r << dendl;
9908 f->take_async_err();
9909 }
9910 return r;
9911 }
9912
// Flush dirty data (and, unless syncdataonly, dirty caps/metadata and
// unsafe MDS requests) for `in` to stable storage. Returns 0 or the
// negative errno from the data writeback. Called with client_lock held;
// the lock is dropped while waiting on the object cacher flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // kick off writeback of cached dirty data; the completion is waited on
    // further below so cap flushing can proceed in parallel
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata (caps) to the MDS and remember the flush tid so
    // we can wait for its commit below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    // waiting on the most recent unsafe request covers all earlier ones
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // data is safe; now wait for the cap flush (if any) to be acked
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9979
9980 int Client::_fsync(Fh *f, bool syncdataonly)
9981 {
9982 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9983 return _fsync(f->inode.get(), syncdataonly);
9984 }
9985
9986 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9987 {
9988 std::lock_guard lock(client_lock);
9989 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9990 tout(cct) << fd << std::endl;
9991
9992 if (unmounting)
9993 return -ENOTCONN;
9994
9995 Fh *f = get_filehandle(fd);
9996 if (!f)
9997 return -EBADF;
9998 int r = _getattr(f->inode, mask, perms);
9999 if (r < 0)
10000 return r;
10001 fill_stat(f->inode, stbuf, NULL);
10002 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10003 return r;
10004 }
10005
10006 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10007 unsigned int want, unsigned int flags)
10008 {
10009 std::lock_guard lock(client_lock);
10010 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10011 tout(cct) << fd << std::endl;
10012
10013 if (unmounting)
10014 return -ENOTCONN;
10015
10016 Fh *f = get_filehandle(fd);
10017 if (!f)
10018 return -EBADF;
10019
10020 unsigned mask = statx_to_mask(flags, want);
10021
10022 int r = 0;
10023 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10024 r = _getattr(f->inode, mask, perms);
10025 if (r < 0) {
10026 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10027 return r;
10028 }
10029 }
10030
10031 fill_statx(f->inode, mask, stx);
10032 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10033 return r;
10034 }
10035
10036 // not written yet, but i want to link!
10037
10038 int Client::chdir(const char *relpath, std::string &new_cwd,
10039 const UserPerm& perms)
10040 {
10041 std::lock_guard lock(client_lock);
10042 tout(cct) << "chdir" << std::endl;
10043 tout(cct) << relpath << std::endl;
10044
10045 if (unmounting)
10046 return -ENOTCONN;
10047
10048 filepath path(relpath);
10049 InodeRef in;
10050 int r = path_walk(path, &in, perms);
10051 if (r < 0)
10052 return r;
10053
10054 if (!(in.get()->is_dir()))
10055 return -ENOTDIR;
10056
10057 if (cwd != in)
10058 cwd.swap(in);
10059 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10060
10061 _getcwd(new_cwd, perms);
10062 return 0;
10063 }
10064
// Reconstruct the absolute path of the current working directory by
// walking cached dentries from cwd up to root. If a parent linkage is not
// cached, issue a LOOKUPNAME request to the MDS and restart the walk from
// cwd. Returns without setting `dir` if cwd or an ancestor is unlinked.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    // prepend this component and step up to the parent directory
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10104
10105 void Client::getcwd(string& dir, const UserPerm& perms)
10106 {
10107 std::lock_guard l(client_lock);
10108 if (!unmounting)
10109 _getcwd(dir, perms);
10110 }
10111
10112 int Client::statfs(const char *path, struct statvfs *stbuf,
10113 const UserPerm& perms)
10114 {
10115 std::lock_guard l(client_lock);
10116 tout(cct) << __func__ << std::endl;
10117 unsigned long int total_files_on_fs;
10118
10119 if (unmounting)
10120 return -ENOTCONN;
10121
10122 ceph_statfs stats;
10123 C_SaferCond cond;
10124
10125 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10126 if (data_pools.size() == 1) {
10127 objecter->get_fs_stats(stats, data_pools[0], &cond);
10128 } else {
10129 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10130 }
10131
10132 client_lock.unlock();
10133 int rval = cond.wait();
10134 assert(root);
10135 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10136 client_lock.lock();
10137
10138 if (rval < 0) {
10139 ldout(cct, 1) << "underlying call to statfs returned error: "
10140 << cpp_strerror(rval)
10141 << dendl;
10142 return rval;
10143 }
10144
10145 memset(stbuf, 0, sizeof(*stbuf));
10146
10147 /*
10148 * we're going to set a block size of 4MB so we can represent larger
10149 * FSes without overflowing. Additionally convert the space
10150 * measurements from KB to bytes while making them in terms of
10151 * blocks. We use 4MB only because it is big enough, and because it
10152 * actually *is* the (ceph) default block size.
10153 */
10154 const int CEPH_BLOCK_SHIFT = 22;
10155 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10156 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10157 stbuf->f_files = total_files_on_fs;
10158 stbuf->f_ffree = 0;
10159 stbuf->f_favail = -1;
10160 stbuf->f_fsid = -1; // ??
10161 stbuf->f_flag = 0; // ??
10162 stbuf->f_namemax = NAME_MAX;
10163
10164 // Usually quota_root will == root_ancestor, but if the mount root has no
10165 // quota but we can see a parent of it that does have a quota, we'll
10166 // respect that one instead.
10167 ceph_assert(root != nullptr);
10168 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10169
10170 // get_quota_root should always give us something
10171 // because client quotas are always enabled
10172 ceph_assert(quota_root != nullptr);
10173
10174 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10175
10176 // Skip the getattr if any sessions are stale, as we don't want to
10177 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10178 // is unhealthy.
10179 if (!_any_stale_sessions()) {
10180 int r = _getattr(quota_root, 0, perms, true);
10181 if (r != 0) {
10182 // Ignore return value: error getting latest inode metadata is not a good
10183 // reason to break "df".
10184 lderr(cct) << "Error in getattr on quota root 0x"
10185 << std::hex << quota_root->ino << std::dec
10186 << " statfs result may be outdated" << dendl;
10187 }
10188 }
10189
10190 // Special case: if there is a size quota set on the Inode acting
10191 // as the root for this client mount, then report the quota status
10192 // as the filesystem statistics.
10193 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10194 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10195 // It is possible for a quota to be exceeded: arithmetic here must
10196 // handle case where used > total.
10197 const fsblkcnt_t free = total > used ? total - used : 0;
10198
10199 stbuf->f_blocks = total;
10200 stbuf->f_bfree = free;
10201 stbuf->f_bavail = free;
10202 } else {
10203 // General case: report the cluster statistics returned from RADOS. Because
10204 // multiple pools may be used without one filesystem namespace via
10205 // layouts, this is the most correct thing we can do.
10206 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10207 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10208 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10209 }
10210
10211 return rval;
10212 }
10213
// Perform a file-lock operation (get/set; fcntl- or flock-style) by sending
// the corresponding request to the MDS, then mirror the outcome into the
// client-side lock state on success.
//   lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//   op:        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
//   sleep:     non-zero to block until the lock can be granted
//   removing:  true when called from _release_filelocks(); skips updating
//              the per-Fh lock state since it is being torn down
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  if (in->flags & I_ERROR_FILELOCK)
    return -EIO;

  // translate the POSIX lock type into the MDS lock command
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK that actually acquires a lock may sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting lock (if any) returned by the MDS into *fl
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the granted/removed lock into the inode's lock state,
      // lazily creating the state object on first use
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// also track the lock on the Fh so it can be released on close
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10327
// Interrupt a blocked file-lock request: mark it aborted (so it will not
// be re-sent) and, if it already reached an MDS, send a matching *_INTR
// unlock so the MDS cancels the pending lock attempt.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // pick the interrupt rule that matches the original lock style
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // build an unlock request mirroring the original lock's parameters
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // issue on behalf of the original requester's credentials
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10360
10361 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10362 {
10363 if (!in->fcntl_locks && !in->flock_locks)
10364 return;
10365
10366 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10367 encode(nr_fcntl_locks, bl);
10368 if (nr_fcntl_locks) {
10369 auto &lock_state = in->fcntl_locks;
10370 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10371 p != lock_state->held_locks.end();
10372 ++p)
10373 encode(p->second, bl);
10374 }
10375
10376 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10377 encode(nr_flock_locks, bl);
10378 if (nr_flock_locks) {
10379 auto &lock_state = in->flock_locks;
10380 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10381 p != lock_state->held_locks.end();
10382 ++p)
10383 encode(p->second, bl);
10384 }
10385
10386 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10387 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10388 }
10389
// Release every file lock recorded against a file handle (called on
// close). When the inode is in the I_ERROR_FILELOCK state the MDS-side
// state is already gone, so locks are only dropped locally; otherwise an
// explicit unlock is sent to the MDS for each held lock.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  // locks that still need an MDS unlock, tagged with their lock style
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;  // advance before remove_lock() can invalidate q
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;  // advance before remove_lock() can invalidate q
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // once the last lock is gone the inode may leave the error state
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  // send an unlock to the MDS for each lock we were still holding
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
10448
10449 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10450 ceph_lock_state_t *lock_state)
10451 {
10452 int lock_cmd;
10453 if (F_RDLCK == fl->l_type)
10454 lock_cmd = CEPH_LOCK_SHARED;
10455 else if (F_WRLCK == fl->l_type)
10456 lock_cmd = CEPH_LOCK_EXCL;
10457 else
10458 lock_cmd = CEPH_LOCK_UNLOCK;;
10459
10460 ceph_filelock filelock;
10461 filelock.start = fl->l_start;
10462 filelock.length = fl->l_len;
10463 filelock.client = 0;
10464 // see comment in _do_filelock()
10465 filelock.owner = owner | (1ULL << 63);
10466 filelock.pid = fl->l_pid;
10467 filelock.type = lock_cmd;
10468
10469 if (filelock.type == CEPH_LOCK_UNLOCK) {
10470 list<ceph_filelock> activated_locks;
10471 lock_state->remove_lock(filelock, activated_locks);
10472 } else {
10473 bool r = lock_state->add_lock(filelock, false, false, NULL);
10474 ceph_assert(r);
10475 }
10476 }
10477
10478 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10479 {
10480 Inode *in = fh->inode.get();
10481 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10482 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10483 return ret;
10484 }
10485
10486 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10487 {
10488 Inode *in = fh->inode.get();
10489 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10490 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10491 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10492 return ret;
10493 }
10494
10495 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10496 {
10497 Inode *in = fh->inode.get();
10498 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10499
10500 int sleep = !(cmd & LOCK_NB);
10501 cmd &= ~LOCK_NB;
10502
10503 int type;
10504 switch (cmd) {
10505 case LOCK_SH:
10506 type = F_RDLCK;
10507 break;
10508 case LOCK_EX:
10509 type = F_WRLCK;
10510 break;
10511 case LOCK_UN:
10512 type = F_UNLCK;
10513 break;
10514 default:
10515 return -EINVAL;
10516 }
10517
10518 struct flock fl;
10519 memset(&fl, 0, sizeof(fl));
10520 fl.l_type = type;
10521 fl.l_whence = SEEK_SET;
10522
10523 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10524 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10525 return ret;
10526 }
10527
10528 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10529 {
10530 /* Since the only thing this does is wrap a call to statfs, and
10531 statfs takes a lock, it doesn't seem we have a need to split it
10532 out. */
10533 return statfs(0, stbuf, perms);
10534 }
10535
// Register client callbacks supplied by the libcephfs/FUSE layer and start
// the finisher thread that delivers each one. Callbacks not provided in
// `args` are left unset. A null `args` pointer is a no-op.
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  // umask_cb is invoked directly, so no finisher thread is started for it
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
10571
10572 int Client::test_dentry_handling(bool can_invalidate)
10573 {
10574 int r = 0;
10575
10576 can_invalidate_dentries = can_invalidate;
10577
10578 if (can_invalidate_dentries) {
10579 ceph_assert(dentry_invalidate_cb);
10580 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10581 r = 0;
10582 } else {
10583 ceph_assert(remount_cb);
10584 ldout(cct, 1) << "using remount_cb" << dendl;
10585 r = _do_remount(false);
10586 }
10587
10588 return r;
10589 }
10590
// Flush all dirty client state (buffered file data, dirty caps, unsafe
// MDS requests) out to the cluster.  Called with client_lock held;
// temporarily drops it while waiting for the object cacher flush so the
// completion can make progress.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data: only needed when the object cacher is enabled
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // snapshot the tid now so we only wait for flushes issued up to here
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocked on the data flush; other client
    // activity (including the flush completion) needs the lock
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
10621
10622 int Client::sync_fs()
10623 {
10624 std::lock_guard l(client_lock);
10625
10626 if (unmounting)
10627 return -ENOTCONN;
10628
10629 return _sync_fs();
10630 }
10631
10632 int64_t Client::drop_caches()
10633 {
10634 std::lock_guard l(client_lock);
10635 return objectcacher->release_all();
10636 }
10637
10638 int Client::_lazyio(Fh *fh, int enable)
10639 {
10640 Inode *in = fh->inode.get();
10641 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10642
10643 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10644 return 0;
10645
10646 int orig_mode = fh->mode;
10647 if (enable) {
10648 fh->mode |= CEPH_FILE_MODE_LAZY;
10649 in->get_open_ref(fh->mode);
10650 in->put_open_ref(orig_mode);
10651 check_caps(in, CHECK_CAPS_NODELAY);
10652 } else {
10653 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10654 in->get_open_ref(fh->mode);
10655 in->put_open_ref(orig_mode);
10656 check_caps(in, 0);
10657 }
10658
10659 return 0;
10660 }
10661
10662 int Client::lazyio(int fd, int enable)
10663 {
10664 std::lock_guard l(client_lock);
10665 Fh *f = get_filehandle(fd);
10666 if (!f)
10667 return -EBADF;
10668
10669 return _lazyio(f, enable);
10670 }
10671
10672 int Client::ll_lazyio(Fh *fh, int enable)
10673 {
10674 std::lock_guard lock(client_lock);
10675 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10676 tout(cct) << __func__ << std::endl;
10677
10678 return _lazyio(fh, enable);
10679 }
10680
10681 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10682 {
10683 std::lock_guard l(client_lock);
10684 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10685 << ", " << offset << ", " << count << ")" << dendl;
10686
10687 Fh *f = get_filehandle(fd);
10688 if (!f)
10689 return -EBADF;
10690
10691 // for now
10692 _fsync(f, true);
10693
10694 return 0;
10695 }
10696
10697 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10698 {
10699 std::lock_guard l(client_lock);
10700 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10701 << ", " << offset << ", " << count << ")" << dendl;
10702
10703 Fh *f = get_filehandle(fd);
10704 if (!f)
10705 return -EBADF;
10706 Inode *in = f->inode.get();
10707
10708 _fsync(f, true);
10709 if (_release(in)) {
10710 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10711 if (r < 0)
10712 return r;
10713 }
10714 return 0;
10715 }
10716
10717
10718 // =============================
10719 // snaps
10720
10721 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10722 {
10723 std::lock_guard l(client_lock);
10724
10725 if (unmounting)
10726 return -ENOTCONN;
10727
10728 filepath path(relpath);
10729 InodeRef in;
10730 int r = path_walk(path, &in, perm);
10731 if (r < 0)
10732 return r;
10733 if (cct->_conf->client_permissions) {
10734 r = may_create(in.get(), perm);
10735 if (r < 0)
10736 return r;
10737 }
10738 Inode *snapdir = open_snapdir(in.get());
10739 return _mkdir(snapdir, name, 0, perm);
10740 }
10741
10742 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10743 {
10744 std::lock_guard l(client_lock);
10745
10746 if (unmounting)
10747 return -ENOTCONN;
10748
10749 filepath path(relpath);
10750 InodeRef in;
10751 int r = path_walk(path, &in, perms);
10752 if (r < 0)
10753 return r;
10754 if (cct->_conf->client_permissions) {
10755 r = may_delete(in.get(), NULL, perms);
10756 if (r < 0)
10757 return r;
10758 }
10759 Inode *snapdir = open_snapdir(in.get());
10760 return _rmdir(snapdir, name, perms);
10761 }
10762
10763 // =============================
10764 // expose caps
10765
10766 int Client::get_caps_issued(int fd) {
10767
10768 std::lock_guard lock(client_lock);
10769
10770 if (unmounting)
10771 return -ENOTCONN;
10772
10773 Fh *f = get_filehandle(fd);
10774 if (!f)
10775 return -EBADF;
10776
10777 return f->inode->caps_issued();
10778 }
10779
10780 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10781 {
10782 std::lock_guard lock(client_lock);
10783
10784 if (unmounting)
10785 return -ENOTCONN;
10786
10787 filepath p(path);
10788 InodeRef in;
10789 int r = path_walk(p, &in, perms, true);
10790 if (r < 0)
10791 return r;
10792 return in->caps_issued();
10793 }
10794
10795 // =========================================
10796 // low level
10797
// Return the special ".snap" directory inode for 'diri', creating an
// in-memory Inode (snapid CEPH_SNAPDIR) on first access.  The snapdir
// mirrors the parent's ownership/mode/timestamps and is cached in
// inode_map, so subsequent calls return the same object.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // same ino as the parent, distinguished only by the magic snapid
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->atime = diri->atime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // flag the parent so it knows a snapdir inode refers back to it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10831
// Low-level lookup of 'name' within 'parent'.  On success fills *attr,
// takes an ll reference on the resulting inode (caller must forget it)
// and returns it via *out; on failure attr->st_ino is zeroed and *out
// gets the NULL InodeRef.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
                      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are always permitted; anything else needs search perm
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
        return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get()); // hand the caller one ll reference

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
          << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10873
// Look up an inode by bare inode number and return it (with an ll
// reference) via *inode.  If the inode is not yet linked into the dentry
// cache, its parent and name are additionally fetched from the MDS so a
// path can later be reconstructed.  On any intermediate failure the
// references taken so far are dropped before returning.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // drop the reference taken in Num1 before bailing out
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // the parent was only needed to resolve the name; release it
  _ll_forget(parent, 1);
  return 0;
}
10925
10926 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10927 struct ceph_statx *stx, unsigned want, unsigned flags,
10928 const UserPerm& perms)
10929 {
10930 std::lock_guard lock(client_lock);
10931 vinodeno_t vparent = _get_vino(parent);
10932 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10933 tout(cct) << "ll_lookupx" << std::endl;
10934 tout(cct) << name << std::endl;
10935
10936 if (unmounting)
10937 return -ENOTCONN;
10938
10939 int r = 0;
10940 if (!fuse_default_permissions) {
10941 r = may_lookup(parent, perms);
10942 if (r < 0)
10943 return r;
10944 }
10945
10946 string dname(name);
10947 InodeRef in;
10948
10949 unsigned mask = statx_to_mask(flags, want);
10950 r = _lookup(parent, dname, mask, &in, perms);
10951 if (r < 0) {
10952 stx->stx_ino = 0;
10953 stx->stx_mask = 0;
10954 } else {
10955 ceph_assert(in);
10956 fill_statx(in, mask, stx);
10957 _ll_get(in.get());
10958 }
10959
10960 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10961 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10962 tout(cct) << stx->stx_ino << std::endl;
10963 *out = in.get();
10964 return r;
10965 }
10966
10967 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10968 unsigned int want, unsigned int flags, const UserPerm& perms)
10969 {
10970 std::lock_guard lock(client_lock);
10971
10972 if (unmounting)
10973 return -ENOTCONN;
10974
10975 filepath fp(name, 0);
10976 InodeRef in;
10977 int rc;
10978 unsigned mask = statx_to_mask(flags, want);
10979
10980 ldout(cct, 3) << __func__ << " " << name << dendl;
10981 tout(cct) << __func__ << std::endl;
10982 tout(cct) << name << std::endl;
10983
10984 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10985 if (rc < 0) {
10986 /* zero out mask, just in case... */
10987 stx->stx_mask = 0;
10988 stx->stx_ino = 0;
10989 *out = NULL;
10990 return rc;
10991 } else {
10992 ceph_assert(in);
10993 fill_statx(in, mask, stx);
10994 _ll_get(in.get());
10995 *out = in.get();
10996 return 0;
10997 }
10998 }
10999
// Take one low-level (libcephfs/FUSE) reference on 'in'.  The first ll
// reference also pins the Inode itself, pins its parent dentry (for
// directories) and bumps the per-snapshot refcount; _ll_put() undoes
// these when the count returns to zero.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11014
// Drop 'num' low-level references from 'in'.  When the last ll reference
// goes away, the parent-dentry pin and per-snapshot refcount taken by
// _ll_get() are released and the inode itself is unpinned.
// Returns the remaining ll_ref count (0 when fully released).
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      // balance the ll_snap_ref increment made by the first _ll_get()
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
        ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11037
// Release every outstanding low-level reference (used at unmount time).
// Each pinned inode is first copied into 'to_be_put' so it stays alive
// until the set is destroyed on return; 'next' is captured before the
// _ll_put() call because dropping the last reference may erase the
// current entry from inode_map, invalidating 'it'.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11055
// Drop 'count' ll references from 'in' (FUSE "forget" semantics).
// Returns true when this was the last reference, or when the forget is
// ignored (unmounting, or the root inode).  A count larger than the
// current ll_ref is clamped with a warning rather than underflowing.
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // caller asked us to drop more refs than we hold; clamp to what we have
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
                  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11084
11085 bool Client::ll_forget(Inode *in, uint64_t count)
11086 {
11087 std::lock_guard lock(client_lock);
11088 return _ll_forget(in, count);
11089 }
11090
11091 bool Client::ll_put(Inode *in)
11092 {
11093 /* ll_forget already takes the lock */
11094 return ll_forget(in, 1);
11095 }
11096
11097 int Client::ll_get_snap_ref(snapid_t snap)
11098 {
11099 std::lock_guard lock(client_lock);
11100 auto p = ll_snap_ref.find(snap);
11101 if (p != ll_snap_ref.end())
11102 return p->second;
11103 return 0;
11104 }
11105
11106 snapid_t Client::ll_get_snapid(Inode *in)
11107 {
11108 std::lock_guard lock(client_lock);
11109 return in->snapid;
11110 }
11111
11112 Inode *Client::ll_get_inode(ino_t ino)
11113 {
11114 std::lock_guard lock(client_lock);
11115
11116 if (unmounting)
11117 return NULL;
11118
11119 vinodeno_t vino = _map_faked_ino(ino);
11120 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11121 if (p == inode_map.end())
11122 return NULL;
11123 Inode *in = p->second;
11124 _ll_get(in);
11125 return in;
11126 }
11127
11128 Inode *Client::ll_get_inode(vinodeno_t vino)
11129 {
11130 std::lock_guard lock(client_lock);
11131
11132 if (unmounting)
11133 return NULL;
11134
11135 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11136 if (p == inode_map.end())
11137 return NULL;
11138 Inode *in = p->second;
11139 _ll_get(in);
11140 return in;
11141 }
11142
11143 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11144 {
11145 vinodeno_t vino = _get_vino(in);
11146
11147 ldout(cct, 8) << __func__ << " " << vino << dendl;
11148 tout(cct) << __func__ << std::endl;
11149 tout(cct) << vino.ino.val << std::endl;
11150
11151 if (vino.snapid < CEPH_NOSNAP)
11152 return 0;
11153 else
11154 return _getattr(in, caps, perms);
11155 }
11156
11157 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11158 {
11159 std::lock_guard lock(client_lock);
11160
11161 if (unmounting)
11162 return -ENOTCONN;
11163
11164 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11165
11166 if (res == 0)
11167 fill_stat(in, attr);
11168 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11169 return res;
11170 }
11171
11172 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11173 unsigned int flags, const UserPerm& perms)
11174 {
11175 std::lock_guard lock(client_lock);
11176
11177 if (unmounting)
11178 return -ENOTCONN;
11179
11180 int res = 0;
11181 unsigned mask = statx_to_mask(flags, want);
11182
11183 if (mask && !in->caps_issued_mask(mask, true))
11184 res = _ll_getattr(in, mask, perms);
11185
11186 if (res == 0)
11187 fill_statx(in, mask, stx);
11188 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11189 return res;
11190 }
11191
// Common implementation for ll_setattr/ll_setattrx: trace the request,
// enforce permissions (unless FUSE already did), strip the client-side
// "set to current time" flags, and forward to __setattrx().  *inp
// receives the (possibly replaced) inode reference.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                         const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
                << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW flags are resolved locally; don't forward them to the MDS
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11220
// Low-level setattr (statx form).  'target' pins the inode across the
// call; on success the caller's stx is refreshed from the updated inode.
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    // setattr on this inode must not have redirected us elsewhere
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11239
// Low-level setattr (struct stat form): convert to statx and share the
// statx implementation; on success 'attr' is refreshed from the inode.
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
                       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    // setattr on this inode must not have redirected us elsewhere
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11261
11262
11263 // ----------
11264 // xattrs
11265
11266 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11267 const UserPerm& perms)
11268 {
11269 std::lock_guard lock(client_lock);
11270
11271 if (unmounting)
11272 return -ENOTCONN;
11273
11274 InodeRef in;
11275 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11276 if (r < 0)
11277 return r;
11278 return _getxattr(in, name, value, size, perms);
11279 }
11280
11281 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11282 const UserPerm& perms)
11283 {
11284 std::lock_guard lock(client_lock);
11285
11286 if (unmounting)
11287 return -ENOTCONN;
11288
11289 InodeRef in;
11290 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11291 if (r < 0)
11292 return r;
11293 return _getxattr(in, name, value, size, perms);
11294 }
11295
11296 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11297 const UserPerm& perms)
11298 {
11299 std::lock_guard lock(client_lock);
11300
11301 if (unmounting)
11302 return -ENOTCONN;
11303
11304 Fh *f = get_filehandle(fd);
11305 if (!f)
11306 return -EBADF;
11307 return _getxattr(f->inode, name, value, size, perms);
11308 }
11309
11310 int Client::listxattr(const char *path, char *list, size_t size,
11311 const UserPerm& perms)
11312 {
11313 std::lock_guard lock(client_lock);
11314
11315 if (unmounting)
11316 return -ENOTCONN;
11317
11318 InodeRef in;
11319 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11320 if (r < 0)
11321 return r;
11322 return Client::_listxattr(in.get(), list, size, perms);
11323 }
11324
11325 int Client::llistxattr(const char *path, char *list, size_t size,
11326 const UserPerm& perms)
11327 {
11328 std::lock_guard lock(client_lock);
11329
11330 if (unmounting)
11331 return -ENOTCONN;
11332
11333 InodeRef in;
11334 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11335 if (r < 0)
11336 return r;
11337 return Client::_listxattr(in.get(), list, size, perms);
11338 }
11339
11340 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11341 {
11342 std::lock_guard lock(client_lock);
11343
11344 if (unmounting)
11345 return -ENOTCONN;
11346
11347 Fh *f = get_filehandle(fd);
11348 if (!f)
11349 return -EBADF;
11350 return Client::_listxattr(f->inode.get(), list, size, perms);
11351 }
11352
11353 int Client::removexattr(const char *path, const char *name,
11354 const UserPerm& perms)
11355 {
11356 std::lock_guard lock(client_lock);
11357
11358 if (unmounting)
11359 return -ENOTCONN;
11360
11361 InodeRef in;
11362 int r = Client::path_walk(path, &in, perms, true);
11363 if (r < 0)
11364 return r;
11365 return _removexattr(in, name, perms);
11366 }
11367
11368 int Client::lremovexattr(const char *path, const char *name,
11369 const UserPerm& perms)
11370 {
11371 std::lock_guard lock(client_lock);
11372
11373 if (unmounting)
11374 return -ENOTCONN;
11375
11376 InodeRef in;
11377 int r = Client::path_walk(path, &in, perms, false);
11378 if (r < 0)
11379 return r;
11380 return _removexattr(in, name, perms);
11381 }
11382
11383 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11384 {
11385 std::lock_guard lock(client_lock);
11386
11387 if (unmounting)
11388 return -ENOTCONN;
11389
11390 Fh *f = get_filehandle(fd);
11391 if (!f)
11392 return -EBADF;
11393 return _removexattr(f->inode, name, perms);
11394 }
11395
11396 int Client::setxattr(const char *path, const char *name, const void *value,
11397 size_t size, int flags, const UserPerm& perms)
11398 {
11399 _setxattr_maybe_wait_for_osdmap(name, value, size);
11400
11401 std::lock_guard lock(client_lock);
11402
11403 if (unmounting)
11404 return -ENOTCONN;
11405
11406 InodeRef in;
11407 int r = Client::path_walk(path, &in, perms, true);
11408 if (r < 0)
11409 return r;
11410 return _setxattr(in, name, value, size, flags, perms);
11411 }
11412
11413 int Client::lsetxattr(const char *path, const char *name, const void *value,
11414 size_t size, int flags, const UserPerm& perms)
11415 {
11416 _setxattr_maybe_wait_for_osdmap(name, value, size);
11417
11418 std::lock_guard lock(client_lock);
11419
11420 if (unmounting)
11421 return -ENOTCONN;
11422
11423 InodeRef in;
11424 int r = Client::path_walk(path, &in, perms, false);
11425 if (r < 0)
11426 return r;
11427 return _setxattr(in, name, value, size, flags, perms);
11428 }
11429
11430 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11431 int flags, const UserPerm& perms)
11432 {
11433 _setxattr_maybe_wait_for_osdmap(name, value, size);
11434
11435 std::lock_guard lock(client_lock);
11436
11437 if (unmounting)
11438 return -ENOTCONN;
11439
11440 Fh *f = get_filehandle(fd);
11441 if (!f)
11442 return -EBADF;
11443 return _setxattr(f->inode, name, value, size, flags, perms);
11444 }
11445
11446 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11447 const UserPerm& perms)
11448 {
11449 int r;
11450
11451 const VXattr *vxattr = _match_vxattr(in, name);
11452 if (vxattr) {
11453 r = -ENODATA;
11454
11455 // Do a force getattr to get the latest quota before returning
11456 // a value to userspace.
11457 int flags = 0;
11458 if (vxattr->flags & VXATTR_RSTAT) {
11459 flags |= CEPH_STAT_RSTAT;
11460 }
11461 r = _getattr(in, flags, perms, true);
11462 if (r != 0) {
11463 // Error from getattr!
11464 return r;
11465 }
11466
11467 // call pointer-to-member function
11468 char buf[256];
11469 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11470 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11471 } else {
11472 r = -ENODATA;
11473 }
11474
11475 if (size != 0) {
11476 if (r > (int)size) {
11477 r = -ERANGE;
11478 } else if (r > 0) {
11479 memcpy(value, buf, r);
11480 }
11481 }
11482 goto out;
11483 }
11484
11485 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11486 r = -EOPNOTSUPP;
11487 goto out;
11488 }
11489
11490 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11491 if (r == 0) {
11492 string n(name);
11493 r = -ENODATA;
11494 if (in->xattrs.count(n)) {
11495 r = in->xattrs[n].length();
11496 if (r > 0 && size != 0) {
11497 if (size >= (unsigned)r)
11498 memcpy(value, in->xattrs[n].c_str(), r);
11499 else
11500 r = -ERANGE;
11501 }
11502 }
11503 }
11504 out:
11505 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11506 return r;
11507 }
11508
11509 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11510 const UserPerm& perms)
11511 {
11512 if (cct->_conf->client_permissions) {
11513 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11514 if (r < 0)
11515 return r;
11516 }
11517 return _getxattr(in.get(), name, value, size, perms);
11518 }
11519
11520 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11521 size_t size, const UserPerm& perms)
11522 {
11523 std::lock_guard lock(client_lock);
11524
11525 if (unmounting)
11526 return -ENOTCONN;
11527
11528 vinodeno_t vino = _get_vino(in);
11529
11530 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11531 tout(cct) << __func__ << std::endl;
11532 tout(cct) << vino.ino.val << std::endl;
11533 tout(cct) << name << std::endl;
11534
11535 if (!fuse_default_permissions) {
11536 int r = xattr_permission(in, name, MAY_READ, perms);
11537 if (r < 0)
11538 return r;
11539 }
11540
11541 return _getxattr(in, name, value, size, perms);
11542 }
11543
// Copy the NUL-terminated list of xattr names into 'name' following
// listxattr(2) semantics.  size == 0 means "just report the required
// buffer length".  Returns total bytes needed/written, -ERANGE when the
// provided buffer is too small, or a negative error from the refresh.
int Client::_listxattr(Inode *in, char *name, size_t size,
                       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // make sure our cached xattr map is populated before walking it
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1; // name plus trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11573
// Low-level listxattr on an Inode*: trace the call, then delegate the
// actual buffer-filling to _listxattr().
int Client::ll_listxattr(Inode *in, char *names, size_t size,
                         const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11591
// Send a CEPH_MDS_OP_SETXATTR request for 'in' to the MDS.
// value == NULL encodes removal; XATTR_CREATE/XATTR_REPLACE map onto the
// corresponding CEPH_XATTR_* request flags.  The value bytes travel as
// the request's data payload.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
                         size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // attach the value as the request payload
  bufferlist bl;
  assert (value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11624
// Set (or, after normalization, remove) an extended attribute on 'in'.
// Handles POSIX ACL xattrs specially: validates them, and when an access
// ACL is equivalent to plain mode bits, applies a chmod instead of
// storing the xattr.  Unknown namespaces and read-only virtual xattrs
// are rejected; setting a quota additionally verifies the MDS created a
// snaprealm for the inode.
int Client::_setxattr(Inode *in, const char *name, const void *value,
                      size_t size, int flags, const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // normalize: empty value is legal, NULL with nonzero size is not
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // only user./security./trusted./ceph. namespaces (plus ACLs) are valid
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
        int ret = posix_acl_equiv_mode(value, size, &new_mode);
        if (ret < 0)
          return ret;
        if (ret == 0) {
          // ACL is fully expressible as mode bits; drop the xattr itself
          value = NULL;
          size = 0;
        }
        if (new_mode != in->mode) {
          struct ceph_statx stx;
          stx.stx_mode = new_mode;
          ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
          if (ret < 0)
            return ret;
        }
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
        // default ACLs only make sense on directories
        if (!S_ISDIR(in->mode))
          return -EACCES;
        int ret = posix_acl_check(value, size);
        if (ret < 0)
          return -EINVAL;
        if (ret == 0) {
          value = NULL;
          size = 0;
        }
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
        return -EOPNOTSUPP;
      // quota changes require the MDS to set up a snaprealm; verify below
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
        check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
        !(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11705
11706 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11707 size_t size, int flags, const UserPerm& perms)
11708 {
11709 if (cct->_conf->client_permissions) {
11710 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11711 if (r < 0)
11712 return r;
11713 }
11714 return _setxattr(in.get(), name, value, size, flags, perms);
11715 }
11716
11717 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11718 {
11719 string tmp;
11720 if (name == "layout") {
11721 string::iterator begin = value.begin();
11722 string::iterator end = value.end();
11723 keys_and_values<string::iterator> p; // create instance of parser
11724 std::map<string, string> m; // map to receive results
11725 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11726 return -EINVAL;
11727 }
11728 if (begin != end)
11729 return -EINVAL;
11730 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11731 if (q->first == "pool") {
11732 tmp = q->second;
11733 break;
11734 }
11735 }
11736 } else if (name == "layout.pool") {
11737 tmp = value;
11738 }
11739
11740 if (tmp.length()) {
11741 int64_t pool;
11742 try {
11743 pool = boost::lexical_cast<unsigned>(tmp);
11744 if (!osdmap->have_pg_pool(pool))
11745 return -ENOENT;
11746 } catch (boost::bad_lexical_cast const&) {
11747 pool = osdmap->lookup_pg_pool_name(tmp);
11748 if (pool < 0) {
11749 return -ENOENT;
11750 }
11751 }
11752 }
11753
11754 return 0;
11755 }
11756
11757 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11758 {
11759 // For setting pool of layout, MetaRequest need osdmap epoch.
11760 // There is a race which create a new data pool but client and mds both don't have.
11761 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11762 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11763 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11764 string rest(strstr(name, "layout"));
11765 string v((const char*)value, size);
11766 int r = objecter->with_osdmap([&](const OSDMap& o) {
11767 return _setxattr_check_data_pool(rest, v, &o);
11768 });
11769
11770 if (r == -ENOENT) {
11771 C_SaferCond ctx;
11772 objecter->wait_for_latest_osdmap(&ctx);
11773 ctx.wait();
11774 }
11775 }
11776 }
11777
11778 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11779 size_t size, int flags, const UserPerm& perms)
11780 {
11781 _setxattr_maybe_wait_for_osdmap(name, value, size);
11782
11783 std::lock_guard lock(client_lock);
11784
11785 if (unmounting)
11786 return -ENOTCONN;
11787
11788 vinodeno_t vino = _get_vino(in);
11789
11790 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11791 tout(cct) << __func__ << std::endl;
11792 tout(cct) << vino.ino.val << std::endl;
11793 tout(cct) << name << std::endl;
11794
11795 if (!fuse_default_permissions) {
11796 int r = xattr_permission(in, name, MAY_WRITE, perms);
11797 if (r < 0)
11798 return r;
11799 }
11800 return _setxattr(in, name, value, size, flags, perms);
11801 }
11802
11803 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11804 {
11805 if (in->snapid != CEPH_NOSNAP) {
11806 return -EROFS;
11807 }
11808
11809 // same xattrs supported by kernel client
11810 if (strncmp(name, "user.", 5) &&
11811 strncmp(name, "system.", 7) &&
11812 strncmp(name, "security.", 9) &&
11813 strncmp(name, "trusted.", 8) &&
11814 strncmp(name, "ceph.", 5))
11815 return -EOPNOTSUPP;
11816
11817 const VXattr *vxattr = _match_vxattr(in, name);
11818 if (vxattr && vxattr->readonly)
11819 return -EOPNOTSUPP;
11820
11821 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11822 filepath path;
11823 in->make_nosnap_relative_path(path);
11824 req->set_filepath(path);
11825 req->set_filepath2(name);
11826 req->set_inode(in);
11827
11828 int res = make_request(req, perms);
11829
11830 trim_cache();
11831 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11832 return res;
11833 }
11834
11835 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11836 {
11837 if (cct->_conf->client_permissions) {
11838 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11839 if (r < 0)
11840 return r;
11841 }
11842 return _removexattr(in.get(), name, perms);
11843 }
11844
11845 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11846 {
11847 std::lock_guard lock(client_lock);
11848
11849 if (unmounting)
11850 return -ENOTCONN;
11851
11852 vinodeno_t vino = _get_vino(in);
11853
11854 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11855 tout(cct) << "ll_removexattr" << std::endl;
11856 tout(cct) << vino.ino.val << std::endl;
11857 tout(cct) << name << std::endl;
11858
11859 if (!fuse_default_permissions) {
11860 int r = xattr_permission(in, name, MAY_WRITE, perms);
11861 if (r < 0)
11862 return r;
11863 }
11864
11865 return _removexattr(in, name, perms);
11866 }
11867
11868 bool Client::_vxattrcb_quota_exists(Inode *in)
11869 {
11870 return in->quota.is_enable() &&
11871 (in->snapid != CEPH_NOSNAP ||
11872 (in->snaprealm && in->snaprealm->ino == in->ino));
11873 }
11874 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11875 {
11876 return snprintf(val, size,
11877 "max_bytes=%lld max_files=%lld",
11878 (long long int)in->quota.max_bytes,
11879 (long long int)in->quota.max_files);
11880 }
11881 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11882 {
11883 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11884 }
11885 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11886 {
11887 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11888 }
11889
11890 bool Client::_vxattrcb_layout_exists(Inode *in)
11891 {
11892 return in->layout != file_layout_t();
11893 }
// Render the full file layout as
// "stripe_unit=U stripe_count=C object_size=S pool=P[ pool_namespace=NS]".
// The pool is printed by name when our current osdmap knows it, otherwise
// by numeric id.
// NOTE(review): the snprintf return values are accumulated without a
// truncation check, so a too-small buffer would make val + r step past
// the end — presumably callers pre-size the buffer; confirm.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Namespace is appended only when non-empty.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
11914 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11915 {
11916 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11917 }
11918 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11919 {
11920 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11921 }
11922 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11923 {
11924 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11925 }
11926 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11927 {
11928 size_t r;
11929 objecter->with_osdmap([&](const OSDMap& o) {
11930 if (o.have_pg_pool(in->layout.pool_id))
11931 r = snprintf(val, size, "%s", o.get_pool_name(
11932 in->layout.pool_id).c_str());
11933 else
11934 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11935 });
11936 return r;
11937 }
11938 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11939 {
11940 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11941 }
11942 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11943 {
11944 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11945 }
11946 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11947 {
11948 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11949 }
11950 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11951 {
11952 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11953 }
11954 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11955 {
11956 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11957 }
11958 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11959 {
11960 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11961 }
11962 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11963 {
11964 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11965 }
11966 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11967 {
11968 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11969 }
11970 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11971 {
11972 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11973 (long)in->rstat.rctime.nsec());
11974 }
11975 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11976 {
11977 return in->dir_pin != -ENODATA;
11978 }
11979 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11980 {
11981 return snprintf(val, size, "%ld", (long)in->dir_pin);
11982 }
11983
11984 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11985 {
11986 return !in->snap_btime.is_zero();
11987 }
11988
11989 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11990 {
11991 return snprintf(val, size, "%llu.%09lu",
11992 (long long unsigned)in->snap_btime.sec(),
11993 (long unsigned)in->snap_btime.nsec());
11994 }
11995
// Helpers for building VXattr table entries (GNU designated-initializer
// syntax, matching the struct field order: name, getxattr_cb, readonly,
// exists_cb, flags).
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only vxattr with no existence callback (always present).
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
// Same as XATTR_NAME_CEPH, but with explicit flags (e.g. VXATTR_RSTAT).
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Writable layout sub-field; only "exists" when a non-default layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Writable quota sub-field; only "exists" when a quota is configured.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
12031
// Virtual xattrs exposed on directories.  _match_vxattr() scans this
// table linearly; it must end with an empty-name terminator entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  // Whole layout in one string; writable, present only when non-default.
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // Read-only directory statistics.
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive statistics; flagged VXATTR_RSTAT.
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  // Quota: combined view plus individual writable fields.
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // Export pin (MDS rank); present only when a pin is set.
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  // Snapshot birth time; read-only, present only inside snapshots.
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12078
// Virtual xattrs exposed on regular files; same scan/terminator rules as
// _dir_vxattrs above.
const Client::VXattr Client::_file_vxattrs[] = {
  // Whole layout in one string; writable, present only when non-default.
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  // Snapshot birth time; read-only, present only inside snapshots.
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12101
12102 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12103 {
12104 if (in->is_dir())
12105 return _dir_vxattrs;
12106 else if (in->is_file())
12107 return _file_vxattrs;
12108 return NULL;
12109 }
12110
12111 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12112 {
12113 if (strncmp(name, "ceph.", 5) == 0) {
12114 const VXattr *vxattr = _get_vxattrs(in);
12115 if (vxattr) {
12116 while (!vxattr->name.empty()) {
12117 if (vxattr->name == name)
12118 return vxattr;
12119 vxattr++;
12120 }
12121 }
12122 }
12123 return NULL;
12124 }
12125
12126 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12127 {
12128 std::lock_guard lock(client_lock);
12129
12130 if (unmounting)
12131 return -ENOTCONN;
12132
12133 vinodeno_t vino = _get_vino(in);
12134
12135 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12136 tout(cct) << "ll_readlink" << std::endl;
12137 tout(cct) << vino.ino.val << std::endl;
12138
12139 for (auto dn : in->dentries) {
12140 touch_dn(dn);
12141 }
12142
12143 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12144 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12145 return r;
12146 }
12147
// Create a special/regular file node "name" under dir via
// CEPH_MDS_OP_MKNOD.  On success *inp references the new inode.
// Returns 0 or a negative errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12201
12202 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12203 dev_t rdev, struct stat *attr, Inode **out,
12204 const UserPerm& perms)
12205 {
12206 std::lock_guard lock(client_lock);
12207
12208 if (unmounting)
12209 return -ENOTCONN;
12210
12211 vinodeno_t vparent = _get_vino(parent);
12212
12213 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12214 tout(cct) << "ll_mknod" << std::endl;
12215 tout(cct) << vparent.ino.val << std::endl;
12216 tout(cct) << name << std::endl;
12217 tout(cct) << mode << std::endl;
12218 tout(cct) << rdev << std::endl;
12219
12220 if (!fuse_default_permissions) {
12221 int r = may_create(parent, perms);
12222 if (r < 0)
12223 return r;
12224 }
12225
12226 InodeRef in;
12227 int r = _mknod(parent, name, mode, rdev, perms, &in);
12228 if (r == 0) {
12229 fill_stat(in, attr);
12230 _ll_get(in.get());
12231 }
12232 tout(cct) << attr->st_ino << std::endl;
12233 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12234 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12235 *out = in.get();
12236 return r;
12237 }
12238
12239 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12240 dev_t rdev, Inode **out,
12241 struct ceph_statx *stx, unsigned want, unsigned flags,
12242 const UserPerm& perms)
12243 {
12244 unsigned caps = statx_to_mask(flags, want);
12245 std::lock_guard lock(client_lock);
12246
12247 if (unmounting)
12248 return -ENOTCONN;
12249
12250 vinodeno_t vparent = _get_vino(parent);
12251
12252 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12253 tout(cct) << "ll_mknodx" << std::endl;
12254 tout(cct) << vparent.ino.val << std::endl;
12255 tout(cct) << name << std::endl;
12256 tout(cct) << mode << std::endl;
12257 tout(cct) << rdev << std::endl;
12258
12259 if (!fuse_default_permissions) {
12260 int r = may_create(parent, perms);
12261 if (r < 0)
12262 return r;
12263 }
12264
12265 InodeRef in;
12266 int r = _mknod(parent, name, mode, rdev, perms, &in);
12267 if (r == 0) {
12268 fill_statx(in, caps, stx);
12269 _ll_get(in.get());
12270 }
12271 tout(cct) << stx->stx_ino << std::endl;
12272 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12273 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12274 *out = in.get();
12275 return r;
12276 }
12277
// Create (and optionally open) regular file "name" under dir.
// flags/mode are POSIX open flags and permission bits; stripe_unit,
// stripe_count, object_size and data_pool optionally override the file
// layout.  On success *inp references the new inode; when fhp != NULL
// the file is also opened and *fhp receives the handle.  *created (if
// non-NULL) is filled in by make_request().  Returns 0 or -errno.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the requested data pool name to an id, if one was given.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!  open.pool is only 32 bits on the wire
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12372
12373
// Create directory "name" in dir.  When dir is the virtual .snap
// directory this creates a snapshot (CEPH_MDS_OP_MKSNAP) instead of a
// plain directory (CEPH_MDS_OP_MKDIR).  On success *inp references the
// new inode.  Returns 0 or a negative errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writes inside a snapshot are forbidden; mkdir in .snap itself is the
  // special case that creates a new snapshot.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12429
12430 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12431 struct stat *attr, Inode **out, const UserPerm& perm)
12432 {
12433 std::lock_guard lock(client_lock);
12434
12435 if (unmounting)
12436 return -ENOTCONN;
12437
12438 vinodeno_t vparent = _get_vino(parent);
12439
12440 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12441 tout(cct) << "ll_mkdir" << std::endl;
12442 tout(cct) << vparent.ino.val << std::endl;
12443 tout(cct) << name << std::endl;
12444 tout(cct) << mode << std::endl;
12445
12446 if (!fuse_default_permissions) {
12447 int r = may_create(parent, perm);
12448 if (r < 0)
12449 return r;
12450 }
12451
12452 InodeRef in;
12453 int r = _mkdir(parent, name, mode, perm, &in);
12454 if (r == 0) {
12455 fill_stat(in, attr);
12456 _ll_get(in.get());
12457 }
12458 tout(cct) << attr->st_ino << std::endl;
12459 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12460 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12461 *out = in.get();
12462 return r;
12463 }
12464
12465 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12466 struct ceph_statx *stx, unsigned want, unsigned flags,
12467 const UserPerm& perms)
12468 {
12469 std::lock_guard lock(client_lock);
12470
12471 if (unmounting)
12472 return -ENOTCONN;
12473
12474 vinodeno_t vparent = _get_vino(parent);
12475
12476 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12477 tout(cct) << "ll_mkdirx" << std::endl;
12478 tout(cct) << vparent.ino.val << std::endl;
12479 tout(cct) << name << std::endl;
12480 tout(cct) << mode << std::endl;
12481
12482 if (!fuse_default_permissions) {
12483 int r = may_create(parent, perms);
12484 if (r < 0)
12485 return r;
12486 }
12487
12488 InodeRef in;
12489 int r = _mkdir(parent, name, mode, perms, &in);
12490 if (r == 0) {
12491 fill_statx(in, statx_to_mask(flags, want), stx);
12492 _ll_get(in.get());
12493 } else {
12494 stx->stx_ino = 0;
12495 stx->stx_mask = 0;
12496 }
12497 tout(cct) << stx->stx_ino << std::endl;
12498 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12499 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12500 *out = in.get();
12501 return r;
12502 }
12503
// Create symlink "name" -> target in dir via CEPH_MDS_OP_SYMLINK.
// On success *inp references the new inode.  Returns 0 or -errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // the link target travels in string2
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12549
12550 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12551 struct stat *attr, Inode **out, const UserPerm& perms)
12552 {
12553 std::lock_guard lock(client_lock);
12554
12555 if (unmounting)
12556 return -ENOTCONN;
12557
12558 vinodeno_t vparent = _get_vino(parent);
12559
12560 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12561 << dendl;
12562 tout(cct) << "ll_symlink" << std::endl;
12563 tout(cct) << vparent.ino.val << std::endl;
12564 tout(cct) << name << std::endl;
12565 tout(cct) << value << std::endl;
12566
12567 if (!fuse_default_permissions) {
12568 int r = may_create(parent, perms);
12569 if (r < 0)
12570 return r;
12571 }
12572
12573 InodeRef in;
12574 int r = _symlink(parent, name, value, perms, &in);
12575 if (r == 0) {
12576 fill_stat(in, attr);
12577 _ll_get(in.get());
12578 }
12579 tout(cct) << attr->st_ino << std::endl;
12580 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12581 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12582 *out = in.get();
12583 return r;
12584 }
12585
12586 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12587 Inode **out, struct ceph_statx *stx, unsigned want,
12588 unsigned flags, const UserPerm& perms)
12589 {
12590 std::lock_guard lock(client_lock);
12591
12592 if (unmounting)
12593 return -ENOTCONN;
12594
12595 vinodeno_t vparent = _get_vino(parent);
12596
12597 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12598 << dendl;
12599 tout(cct) << "ll_symlinkx" << std::endl;
12600 tout(cct) << vparent.ino.val << std::endl;
12601 tout(cct) << name << std::endl;
12602 tout(cct) << value << std::endl;
12603
12604 if (!fuse_default_permissions) {
12605 int r = may_create(parent, perms);
12606 if (r < 0)
12607 return r;
12608 }
12609
12610 InodeRef in;
12611 int r = _symlink(parent, name, value, perms, &in);
12612 if (r == 0) {
12613 fill_statx(in, statx_to_mask(flags, want), stx);
12614 _ll_get(in.get());
12615 }
12616 tout(cct) << stx->stx_ino << std::endl;
12617 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12618 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12619 *out = in.get();
12620 return r;
12621 }
12622
// Unlink "name" from dir via CEPH_MDS_OP_UNLINK.  Returns 0 or -errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the target inode so we can attach it to the request and
  // break any delegations handed out on it.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12672
12673 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12674 {
12675 std::lock_guard lock(client_lock);
12676
12677 if (unmounting)
12678 return -ENOTCONN;
12679
12680 vinodeno_t vino = _get_vino(in);
12681
12682 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12683 tout(cct) << "ll_unlink" << std::endl;
12684 tout(cct) << vino.ino.val << std::endl;
12685 tout(cct) << name << std::endl;
12686
12687 if (!fuse_default_permissions) {
12688 int r = may_delete(in, name, perm);
12689 if (r < 0)
12690 return r;
12691 }
12692 return _unlink(in, name, perm);
12693 }
12694
// Remove directory "name" from dir.  When dir is the virtual .snap
// directory this removes a snapshot (CEPH_MDS_OP_RMSNAP) instead of a
// plain directory (CEPH_MDS_OP_RMDIR).  Returns 0 or -errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    // RMDIR: the request owns the dentry.
    req->set_dentry(de);
  else
    // RMSNAP: hold a temporary ref; the dentry is unlinked locally below
    // rather than being handed to the request.
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12747
12748 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12749 {
12750 std::lock_guard lock(client_lock);
12751
12752 if (unmounting)
12753 return -ENOTCONN;
12754
12755 vinodeno_t vino = _get_vino(in);
12756
12757 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12758 tout(cct) << "ll_rmdir" << std::endl;
12759 tout(cct) << vino.ino.val << std::endl;
12760 tout(cct) << name << std::endl;
12761
12762 if (!fuse_default_permissions) {
12763 int r = may_delete(in, name, perms);
12764 if (r < 0)
12765 return r;
12766 }
12767
12768 return _rmdir(in, name, perms);
12769 }
12770
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  // Rename fromdir/fromname -> todir/toname. Inside the snapdir (and only
  // there) this becomes a snapshot rename; snapshotted dirs are otherwise
  // read-only. Cross-quota-root moves are checked against the destination
  // quota up front so the MDS isn't asked to do a move that must fail.
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  // Both ends must live in the same snapshot context.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    // Only a move between different directories can cross quota roots.
    if (fromdir != todir) {
      fromdir_root =
        fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
        todir->quota.is_enable() ? todir : get_quota_root(todir, perm);

      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
        // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
        // to auth MDS to get latest rstat for todir_root and source dir
        // even if their dentry caches and inode caps are satisfied.
        res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
        if (res < 0)
          goto fail;

        quota_check = true;
        if (oldde->inode && oldde->inode->is_dir()) {
          mask |= CEPH_STAT_RSTAT;
        }
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    // Recall delegations on the source before mutating its namespace.
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      // Size/file-count the source subtree (or file) would add to the
      // destination quota root.
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
        old_bytes = oldinode->rstat.rbytes;
        old_files = oldinode->rstat.rsize();
      } else {
        old_bytes = oldinode->size;
        old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
          (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
                       << old_bytes << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
          (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
                       << old_files << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (quota_exceed) {
        // Directories report EXDEV (caller may fall back to copy);
        // plain files report quota exhaustion directly.
        res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
        goto fail;
      }
    }

    // If the destination name already exists, its inode will be unlinked
    // by the rename; ENOENT here simply means no target to replace.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12926
12927 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12928 const char *newname, const UserPerm& perm)
12929 {
12930 std::lock_guard lock(client_lock);
12931
12932 if (unmounting)
12933 return -ENOTCONN;
12934
12935 vinodeno_t vparent = _get_vino(parent);
12936 vinodeno_t vnewparent = _get_vino(newparent);
12937
12938 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12939 << vnewparent << " " << newname << dendl;
12940 tout(cct) << "ll_rename" << std::endl;
12941 tout(cct) << vparent.ino.val << std::endl;
12942 tout(cct) << name << std::endl;
12943 tout(cct) << vnewparent.ino.val << std::endl;
12944 tout(cct) << newname << std::endl;
12945
12946 if (!fuse_default_permissions) {
12947 int r = may_delete(parent, name, perm);
12948 if (r < 0)
12949 return r;
12950 r = may_delete(newparent, newname, perm);
12951 if (r < 0 && r != -ENOENT)
12952 return r;
12953 }
12954
12955 return _rename(parent, name, newparent, newname, perm);
12956 }
12957
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  // Create hard link `newname` in `dir` pointing at existing inode `in`.
  // On success *inp holds the linked inode from the MDS reply trace.
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Links may not be created inside snapshots.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // A new directory entry counts against the max_files quota.
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Recall any delegations on the target before changing its link count.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13002
13003 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13004 const UserPerm& perm)
13005 {
13006 std::lock_guard lock(client_lock);
13007
13008 if (unmounting)
13009 return -ENOTCONN;
13010
13011 vinodeno_t vino = _get_vino(in);
13012 vinodeno_t vnewparent = _get_vino(newparent);
13013
13014 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13015 newname << dendl;
13016 tout(cct) << "ll_link" << std::endl;
13017 tout(cct) << vino.ino.val << std::endl;
13018 tout(cct) << vnewparent << std::endl;
13019 tout(cct) << newname << std::endl;
13020
13021 InodeRef target;
13022
13023 if (!fuse_default_permissions) {
13024 if (S_ISDIR(in->mode))
13025 return -EPERM;
13026
13027 int r = may_hardlink(in, perm);
13028 if (r < 0)
13029 return r;
13030
13031 r = may_create(newparent, perm);
13032 if (r < 0)
13033 return r;
13034 }
13035
13036 return _link(in, newparent, newname, perm, &target);
13037 }
13038
13039 int Client::ll_num_osds(void)
13040 {
13041 std::lock_guard lock(client_lock);
13042 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13043 }
13044
13045 int Client::ll_osdaddr(int osd, uint32_t *addr)
13046 {
13047 std::lock_guard lock(client_lock);
13048
13049 entity_addr_t g;
13050 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13051 if (!o.exists(osd))
13052 return false;
13053 g = o.get_addrs(osd).front();
13054 return true;
13055 });
13056 if (!exists)
13057 return -1;
13058 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13059 *addr = ntohl(nb_addr);
13060 return 0;
13061 }
13062
13063 uint32_t Client::ll_stripe_unit(Inode *in)
13064 {
13065 std::lock_guard lock(client_lock);
13066 return in->layout.stripe_unit;
13067 }
13068
13069 uint64_t Client::ll_snap_seq(Inode *in)
13070 {
13071 std::lock_guard lock(client_lock);
13072 return in->snaprealm->seq;
13073 }
13074
13075 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13076 {
13077 std::lock_guard lock(client_lock);
13078 *layout = in->layout;
13079 return 0;
13080 }
13081
13082 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13083 {
13084 return ll_file_layout(fh->inode.get(), layout);
13085 }
13086
13087 /* Currently we cannot take advantage of redundancy in reads, since we
13088 would have to go through all possible placement groups (a
13089 potentially quite large number determined by a hash), and use CRUSH
13090 to calculate the appropriate set of OSDs for each placement group,
13091 then index into that. An array with one entry per OSD is much more
13092 tractable and works for demonstration purposes. */
13093
13094 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13095 file_layout_t* layout)
13096 {
13097 std::lock_guard lock(client_lock);
13098
13099 inodeno_t ino = in->ino;
13100 uint32_t object_size = layout->object_size;
13101 uint32_t su = layout->stripe_unit;
13102 uint32_t stripe_count = layout->stripe_count;
13103 uint64_t stripes_per_object = object_size / su;
13104 uint64_t stripeno = 0, stripepos = 0;
13105
13106 if(stripe_count) {
13107 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13108 stripepos = blockno % stripe_count; // which object in the object set (X)
13109 }
13110 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13111 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13112
13113 object_t oid = file_object_t(ino, objectno);
13114 return objecter->with_osdmap([&](const OSDMap& o) {
13115 ceph_object_layout olayout =
13116 o.file_to_object_layout(oid, *layout);
13117 pg_t pg = (pg_t)olayout.ol_pgid;
13118 vector<int> osds;
13119 int primary;
13120 o.pg_to_acting_osds(pg, &osds, &primary);
13121 return primary;
13122 });
13123 }
13124
13125 /* Return the offset of the block, internal to the object */
13126
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  // Byte offset of stripe-unit block `blockno` within its RADOS object.
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): divides by su (and below by stripes_per_object) with no
  // zero check -- assumes the inode carries a valid layout; confirm callers.
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
13137
13138 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13139 const UserPerm& perms)
13140 {
13141 std::lock_guard lock(client_lock);
13142
13143 if (unmounting)
13144 return -ENOTCONN;
13145
13146 vinodeno_t vino = _get_vino(in);
13147
13148 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13149 tout(cct) << "ll_opendir" << std::endl;
13150 tout(cct) << vino.ino.val << std::endl;
13151
13152 if (!fuse_default_permissions) {
13153 int r = may_open(in, flags, perms);
13154 if (r < 0)
13155 return r;
13156 }
13157
13158 int r = _opendir(in, dirpp, perms);
13159 tout(cct) << (unsigned long)*dirpp << std::endl;
13160
13161 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13162 << dendl;
13163 return r;
13164 }
13165
13166 int Client::ll_releasedir(dir_result_t *dirp)
13167 {
13168 std::lock_guard lock(client_lock);
13169 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13170 tout(cct) << "ll_releasedir" << std::endl;
13171 tout(cct) << (unsigned long)dirp << std::endl;
13172
13173 if (unmounting)
13174 return -ENOTCONN;
13175
13176 _closedir(dirp);
13177 return 0;
13178 }
13179
13180 int Client::ll_fsyncdir(dir_result_t *dirp)
13181 {
13182 std::lock_guard lock(client_lock);
13183 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13184 tout(cct) << "ll_fsyncdir" << std::endl;
13185 tout(cct) << (unsigned long)dirp << std::endl;
13186
13187 if (unmounting)
13188 return -ENOTCONN;
13189
13190 return _fsync(dirp->inode.get(), false);
13191 }
13192
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  // Open an existing inode; creation must go through ll_create/ll_createx.
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track any handle we produced so a forced unmount can clean up fds the
  // low-level consumer never closed.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13228
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  // Shared implementation for ll_create/ll_createx: look the name up,
  // create it if absent and O_CREAT was given, then open it. On success
  // *in holds the inode and *fhp an open handle.
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL requires the name to not already exist.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create() may already return an open handle in *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: re-check open permission, and open a handle if
    // we don't have one yet.
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track the handle (if any) so unmount can clean up unclosed fds.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13310
13311 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13312 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13313 const UserPerm& perms)
13314 {
13315 std::lock_guard lock(client_lock);
13316 InodeRef in;
13317
13318 if (unmounting)
13319 return -ENOTCONN;
13320
13321 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13322 fhp, perms);
13323 if (r >= 0) {
13324 ceph_assert(in);
13325
13326 // passing an Inode in outp requires an additional ref
13327 if (outp) {
13328 _ll_get(in.get());
13329 *outp = in.get();
13330 }
13331 fill_stat(in, attr);
13332 } else {
13333 attr->st_ino = 0;
13334 }
13335
13336 return r;
13337 }
13338
13339 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13340 int oflags, Inode **outp, Fh **fhp,
13341 struct ceph_statx *stx, unsigned want, unsigned lflags,
13342 const UserPerm& perms)
13343 {
13344 unsigned caps = statx_to_mask(lflags, want);
13345 std::lock_guard lock(client_lock);
13346 InodeRef in;
13347
13348 if (unmounting)
13349 return -ENOTCONN;
13350
13351 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13352 if (r >= 0) {
13353 ceph_assert(in);
13354
13355 // passing an Inode in outp requires an additional ref
13356 if (outp) {
13357 _ll_get(in.get());
13358 *outp = in.get();
13359 }
13360 fill_statx(in, caps, stx);
13361 } else {
13362 stx->stx_ino = 0;
13363 stx->stx_mask = 0;
13364 }
13365
13366 return r;
13367 }
13368
13369 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13370 {
13371 std::lock_guard lock(client_lock);
13372 tout(cct) << "ll_lseek" << std::endl;
13373 tout(cct) << offset << std::endl;
13374 tout(cct) << whence << std::endl;
13375
13376 if (unmounting)
13377 return -ENOTCONN;
13378
13379 return _lseek(fh, offset, whence);
13380 }
13381
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  // Read up to `len` bytes at `off` into *bl; returns bytes read or -errno.
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return more bytes read than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
13401
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  // Read one RADOS object ("block") of the file directly from the OSDs,
  // bypassing the client cache; the data is copied into `buf`.
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock for the OSD round trip, then retake it.
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
    r = bl.length();   // report the number of bytes actually read
  }

  return r;
}
13438
13439 /* It appears that the OSD doesn't return success unless the entire
13440 buffer was written, return the write length on success. */
13441
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  // Write one RADOS object of the file directly, bypassing the cache.
  // Returns `length` on success (per the note above, OSDs only ack
  // complete writes) or a negative errno.
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE(review): `true ||` forces every write to wait for commit
  // regardless of `sync`; the async path is effectively disabled here.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // Use the caller-provided snap sequence rather than the inode's realm.
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.lock();
  if (unmounting) {
    client_lock.unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Drop the lock before blocking on the commit ack.
  client_lock.unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
13499
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  // Barrier/commit support is currently compiled out; this call is a no-op
  // that always reports success. The commented block below is the retained
  // original implementation.
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13525
13526 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13527 {
13528 std::lock_guard lock(client_lock);
13529 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13530 "~" << len << dendl;
13531 tout(cct) << "ll_write" << std::endl;
13532 tout(cct) << (unsigned long)fh << std::endl;
13533 tout(cct) << off << std::endl;
13534 tout(cct) << len << std::endl;
13535
13536 if (unmounting)
13537 return -ENOTCONN;
13538
13539 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13540 len = std::min(len, (loff_t)INT_MAX);
13541 int r = _write(fh, off, len, data, NULL, 0);
13542 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13543 << dendl;
13544 return r;
13545 }
13546
13547 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13548 {
13549 std::lock_guard lock(client_lock);
13550 if (unmounting)
13551 return -ENOTCONN;
13552 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13553 }
13554
13555 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13556 {
13557 std::lock_guard lock(client_lock);
13558 if (unmounting)
13559 return -ENOTCONN;
13560 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13561 }
13562
13563 int Client::ll_flush(Fh *fh)
13564 {
13565 std::lock_guard lock(client_lock);
13566 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13567 tout(cct) << "ll_flush" << std::endl;
13568 tout(cct) << (unsigned long)fh << std::endl;
13569
13570 if (unmounting)
13571 return -ENOTCONN;
13572
13573 return _flush(fh);
13574 }
13575
13576 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13577 {
13578 std::lock_guard lock(client_lock);
13579 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13580 tout(cct) << "ll_fsync" << std::endl;
13581 tout(cct) << (unsigned long)fh << std::endl;
13582
13583 if (unmounting)
13584 return -ENOTCONN;
13585
13586 int r = _fsync(fh, syncdataonly);
13587 if (r) {
13588 // If we're returning an error, clear it from the FH
13589 fh->take_async_err();
13590 }
13591 return r;
13592 }
13593
13594 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13595 {
13596 std::lock_guard lock(client_lock);
13597 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13598 tout(cct) << "ll_sync_inode" << std::endl;
13599 tout(cct) << (unsigned long)in << std::endl;
13600
13601 if (unmounting)
13602 return -ENOTCONN;
13603
13604 return _fsync(in, syncdataonly);
13605 }
13606
13607 #ifdef FALLOC_FL_PUNCH_HOLE
13608
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Preallocate or punch a hole in [offset, offset+length). Only
  // FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (which requires
  // KEEP_SIZE) are supported.
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching a hole without KEEP_SIZE is not supported.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating on a full pool would need new space; punching frees space,
  // so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Growing the file must respect the bytes quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data with buffer caps: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  inline_iter.copy(offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len) {
	  inline_iter += size;
	  inline_iter.copy(len - offset - size, bl);
	}
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first if needed, then zero the range on the
      // OSDs directly.
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
	uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero op without holding the client lock.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: only the size (and times) change; no data is
    // actually written out.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // Wait (unlocked) for the uninline to complete; ECANCELED means the
    // data was already uninlined by someone else, which is fine.
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13729 #else
13730
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Fallback when FALLOC_FL_PUNCH_HOLE isn't defined on this platform:
  // fallocate is not supported at all.
  return -EOPNOTSUPP;
}
13735
13736 #endif
13737
13738
13739 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13740 {
13741 std::lock_guard lock(client_lock);
13742 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13743 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13744 tout(cct) << (unsigned long)fh << std::endl;
13745
13746 if (unmounting)
13747 return -ENOTCONN;
13748
13749 return _fallocate(fh, mode, offset, length);
13750 }
13751
13752 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13753 {
13754 std::lock_guard lock(client_lock);
13755 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13756
13757 if (unmounting)
13758 return -ENOTCONN;
13759
13760 Fh *fh = get_filehandle(fd);
13761 if (!fh)
13762 return -EBADF;
13763 #if defined(__linux__) && defined(O_PATH)
13764 if (fh->flags & O_PATH)
13765 return -EBADF;
13766 #endif
13767 return _fallocate(fh, mode, offset, length);
13768 }
13769
13770 int Client::ll_release(Fh *fh)
13771 {
13772 std::lock_guard lock(client_lock);
13773
13774 if (unmounting)
13775 return -ENOTCONN;
13776
13777 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13778 dendl;
13779 tout(cct) << __func__ << " (fh)" << std::endl;
13780 tout(cct) << (unsigned long)fh << std::endl;
13781
13782 if (ll_unclosed_fh_set.count(fh))
13783 ll_unclosed_fh_set.erase(fh);
13784 return _release_fh(fh);
13785 }
13786
13787 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13788 {
13789 std::lock_guard lock(client_lock);
13790
13791 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13792 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13793
13794 if (unmounting)
13795 return -ENOTCONN;
13796
13797 return _getlk(fh, fl, owner);
13798 }
13799
13800 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13801 {
13802 std::lock_guard lock(client_lock);
13803
13804 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13805 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13806
13807 if (unmounting)
13808 return -ENOTCONN;
13809
13810 return _setlk(fh, fl, owner, sleep);
13811 }
13812
13813 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13814 {
13815 std::lock_guard lock(client_lock);
13816
13817 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13818 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13819
13820 if (unmounting)
13821 return -ENOTCONN;
13822
13823 return _flock(fh, cmd, owner);
13824 }
13825
13826 int Client::set_deleg_timeout(uint32_t timeout)
13827 {
13828 std::lock_guard lock(client_lock);
13829
13830 /*
13831 * The whole point is to prevent blacklisting so we must time out the
13832 * delegation before the session autoclose timeout kicks in.
13833 */
13834 if (timeout >= mdsmap->get_session_autoclose())
13835 return -EINVAL;
13836
13837 deleg_timeout = timeout;
13838 return 0;
13839 }
13840
13841 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13842 {
13843 int ret = -EINVAL;
13844
13845 std::lock_guard lock(client_lock);
13846
13847 if (!mounted)
13848 return -ENOTCONN;
13849
13850 Inode *inode = fh->inode.get();
13851
13852 switch(cmd) {
13853 case CEPH_DELEGATION_NONE:
13854 inode->unset_deleg(fh);
13855 ret = 0;
13856 break;
13857 default:
13858 try {
13859 ret = inode->set_deleg(fh, cmd, cb, priv);
13860 } catch (std::bad_alloc&) {
13861 ret = -ENOMEM;
13862 }
13863 break;
13864 }
13865 return ret;
13866 }
13867
// Finisher context used by ll_interrupt(): forwards an interrupt for an
// in-flight SETFILELOCK MetaRequest back into the Client with
// client_lock held.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request until finish() has run
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // only file-lock requests are interruptible via this path
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the ctor
  }
};
13883
// Interrupt a pending file-lock request; d is the opaque MetaRequest*
// previously handed to the caller. The work is queued on
// interrupt_finisher so it runs without the caller's locks held.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13891
13892 // =========================================
13893 // layout
13894
13895 // expose file layouts
13896
13897 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13898 const UserPerm& perms)
13899 {
13900 std::lock_guard lock(client_lock);
13901
13902 if (unmounting)
13903 return -ENOTCONN;
13904
13905 filepath path(relpath);
13906 InodeRef in;
13907 int r = path_walk(path, &in, perms);
13908 if (r < 0)
13909 return r;
13910
13911 *lp = in->layout;
13912
13913 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13914 return 0;
13915 }
13916
13917 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13918 {
13919 std::lock_guard lock(client_lock);
13920
13921 if (unmounting)
13922 return -ENOTCONN;
13923
13924 Fh *f = get_filehandle(fd);
13925 if (!f)
13926 return -EBADF;
13927 Inode *in = f->inode.get();
13928
13929 *lp = in->layout;
13930
13931 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13932 return 0;
13933 }
13934
13935 int64_t Client::get_default_pool_id()
13936 {
13937 std::lock_guard lock(client_lock);
13938
13939 if (unmounting)
13940 return -ENOTCONN;
13941
13942 /* first data pool is the default */
13943 return mdsmap->get_first_data_pool();
13944 }
13945
13946 // expose osdmap
13947
13948 int64_t Client::get_pool_id(const char *pool_name)
13949 {
13950 std::lock_guard lock(client_lock);
13951
13952 if (unmounting)
13953 return -ENOTCONN;
13954
13955 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13956 pool_name);
13957 }
13958
13959 string Client::get_pool_name(int64_t pool)
13960 {
13961 std::lock_guard lock(client_lock);
13962
13963 if (unmounting)
13964 return string();
13965
13966 return objecter->with_osdmap([pool](const OSDMap& o) {
13967 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13968 });
13969 }
13970
13971 int Client::get_pool_replication(int64_t pool)
13972 {
13973 std::lock_guard lock(client_lock);
13974
13975 if (unmounting)
13976 return -ENOTCONN;
13977
13978 return objecter->with_osdmap([pool](const OSDMap& o) {
13979 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13980 });
13981 }
13982
/**
 * Find the acting OSD set for the file extent containing 'off', and
 * optionally the remaining length of that extent's stripe unit.
 *
 * @param fd   open descriptor
 * @param off  file offset to locate
 * @param len  out (optional): bytes from 'off' to the end of the stripe unit
 * @param osds out: acting OSDs for the containing object's PG
 * @return 0 on success, -ENOTCONN/-EBADF, or -EINVAL if no acting OSDs
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a single byte at 'off' through the striping layout; this always
  // yields exactly one extent, identifying the containing object.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // Resolve the object to its PG and the PG's acting OSD set.
  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14028
14029 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14030 {
14031 std::lock_guard lock(client_lock);
14032
14033 if (unmounting)
14034 return -ENOTCONN;
14035
14036 if (id < 0)
14037 return -EINVAL;
14038 return objecter->with_osdmap([&](const OSDMap& o) {
14039 return o.crush->get_full_location_ordered(id, path);
14040 });
14041 }
14042
/**
 * Return the addresses of the acting OSDs for the object containing
 * file offset 'offset' of the given open file.
 *
 * @param fd      open descriptor
 * @param offset  file offset to locate
 * @param address out: one address per acting OSD
 * @return 0 on success, -ENOTCONN/-EBADF, or -EINVAL if no acting OSDs
 */
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 => exactly one extent back)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	// take the first (front) address of each OSD's address vector
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
14076
14077 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14078 {
14079 std::lock_guard lock(client_lock);
14080
14081 if (unmounting)
14082 return -ENOTCONN;
14083
14084 return objecter->with_osdmap([&](const OSDMap& o) {
14085 if (!o.exists(osd))
14086 return -ENOENT;
14087
14088 addr = o.get_addrs(osd).front();
14089 return 0;
14090 });
14091 }
14092
14093 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14094 loff_t length, loff_t offset)
14095 {
14096 std::lock_guard lock(client_lock);
14097
14098 if (unmounting)
14099 return -ENOTCONN;
14100
14101 Fh *f = get_filehandle(fd);
14102 if (!f)
14103 return -EBADF;
14104 Inode *in = f->inode.get();
14105
14106 // map to a list of extents
14107 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14108
14109 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14110 return 0;
14111 }
14112
14113
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Cache the result per OSDMap epoch: only rescan the map for an OSD
  // on our own IP when the map has changed since the last lookup.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
14130
14131
14132
14133
14134
14135
14136 // ===============================
14137
// Messenger callback: a connection we initiated is now established.
// Nothing to do beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14142
// Messenger callback: the connection was reset.  We just log and return
// false (presumably "not handled here" — session recovery happens via
// other paths; TODO confirm against Dispatcher contract).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14148
// Messenger callback: the peer reset the session (remote side dropped
// its connection state).  For MDS peers, adjust the matching MetaSession
// according to its current state.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // we wanted it closed anyway; treat the reset as completion
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // restart the open attempt, carrying over anyone who was
	    // waiting for the session to open
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // no established state to repair
	  break;
	}
      }
    }
    break;
  }
}
14207
// Messenger callback: the peer actively refused our connection attempt.
// Log it and return false (no special handling here).
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14213
// Walk up the snap-realm hierarchy from 'in' and return the nearest
// ancestor inode with a quota enabled; falls back to root_ancestor when
// none is found or an ancestor is not in the inode cache.
// NOTE(review): 'perms' is currently unused in this implementation.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;  // ancestor not cached; give up and use root_ancestor

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14235
14236 /**
14237 * Traverse quota ancestors of the Inode, return true
14238 * if any of them passes the passed function
14239 */
14240 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14241 std::function<bool (const Inode &in)> test)
14242 {
14243 while (true) {
14244 ceph_assert(in != NULL);
14245 if (test(*in)) {
14246 return true;
14247 }
14248
14249 if (in == root_ancestor) {
14250 // We're done traversing, drop out
14251 return false;
14252 } else {
14253 // Continue up the tree
14254 in = get_quota_root(in, perms);
14255 }
14256 }
14257
14258 return false;
14259 }
14260
14261 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14262 {
14263 return check_quota_condition(in, perms,
14264 [](const Inode &in) {
14265 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14266 });
14267 }
14268
14269 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14270 const UserPerm& perms)
14271 {
14272 return check_quota_condition(in, perms,
14273 [&new_bytes](const Inode &in) {
14274 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14275 > in.quota.max_bytes;
14276 });
14277 }
14278
14279 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14280 {
14281 ceph_assert(in->size >= in->reported_size);
14282 const uint64_t size = in->size - in->reported_size;
14283 return check_quota_condition(in, perms,
14284 [&size](const Inode &in) {
14285 if (in.quota.max_bytes) {
14286 if (in.rstat.rbytes >= in.quota.max_bytes) {
14287 return true;
14288 }
14289
14290 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14291 return (space >> 4) < size;
14292 } else {
14293 return false;
14294 }
14295 });
14296 }
14297
// State flags cached by check_pool_perm(), keyed by (pool id, namespace).
enum {
  POOL_CHECKED = 1,   // a check has completed; READ/WRITE bits are valid
  POOL_CHECKING = 2,  // a check is in flight; other callers wait on it
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14304
/**
 * Verify that this client may read and/or write the data pool backing
 * 'in', probing the pool once and caching the result per
 * (pool id, pool namespace).
 *
 * @param in   inode whose layout selects the pool
 * @param need CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if permitted, -EPERM if the pool denies the needed access,
 *         -EIO if the probe failed for an unexpected reason
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the check as in flight so concurrent callers wait above
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // probe read access with a stat on the file's first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create of the same object
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop client_lock while blocking on the two OSD round trips
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // -ENOENT just means the object doesn't exist yet; the read was allowed
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST means the exclusive create lost a race; the write was allowed
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // publish the verdict and wake all waiters
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14407
14408 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14409 {
14410 if (acl_type == POSIX_ACL) {
14411 if (in->xattrs.count(ACL_EA_ACCESS)) {
14412 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14413
14414 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14415 }
14416 }
14417 return -EAGAIN;
14418 }
14419
/**
 * Propagate a chmod into the inode's POSIX access ACL xattr so the ACL
 * stays consistent with the new mode bits.
 *
 * @return 0 on success (including no access ACL / ACLs disabled),
 *         negative errno on failure
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // refresh xattrs if we have never fetched them for this inode
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a copy; posix_acl_access_chmod edits the buffer in place
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;  // no access ACL to update
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14445
/**
 * Compute the ACL xattrs a new file/dir should inherit from its parent
 * directory's default ACL, adjusting *mode per POSIX ACL inheritance.
 *
 * @param dir       parent directory
 * @param mode      in/out: requested mode; may be masked or rewritten
 * @param xattrs_bl out: encoded xattr map to attach to the new inode
 * @return number of xattrs encoded (0 if none), negative errno on error
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // make sure the parent's xattrs (and thus its default ACL) are loaded
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // apply inheritance: may rewrite *mode and trim the ACL copy
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// if the ACL is not fully representable by mode bits, the child
	// gets it as its access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: fall back to the process umask
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14493
14494 void Client::set_filer_flags(int flags)
14495 {
14496 std::lock_guard l(client_lock);
14497 ceph_assert(flags == 0 ||
14498 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14499 objecter->add_global_op_flags(flags);
14500 }
14501
14502 void Client::clear_filer_flags(int flags)
14503 {
14504 std::lock_guard l(client_lock);
14505 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14506 objecter->clear_global_op_flag(flags);
14507 }
14508
14509 // called before mount
14510 void Client::set_uuid(const std::string& uuid)
14511 {
14512 std::lock_guard l(client_lock);
14513 assert(initialized);
14514 assert(!uuid.empty());
14515
14516 metadata["uuid"] = uuid;
14517 _close_sessions();
14518 }
14519
14520 // called before mount. 0 means infinite
14521 void Client::set_session_timeout(unsigned timeout)
14522 {
14523 std::lock_guard l(client_lock);
14524 assert(initialized);
14525
14526 metadata["timeout"] = stringify(timeout);
14527 }
14528
// called before mount
/**
 * Reclaim the MDS session state of a dead client instance identified by
 * 'uuid' (e.g. NFS-Ganesha failover).
 *
 * @param uuid    uuid of the previous instance to reclaim
 * @param flags   CEPH_RECLAIM_* flags (CEPH_RECLAIM_RESET to just kill it)
 * @param fs_name filesystem to subscribe to (empty for the default)
 * @return 0 on success, or a negative errno: -EINVAL (bad uuid /
 *         umounting), -EPERM (session rejected), -EOPNOTSUPP (MDS lacks
 *         reclaim support), -ENOENT (no such session with RESET),
 *         -ENOTRECOVERABLE (reclaim failed or the target was blacklisted)
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // refuse to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  // Ask every in-map MDS rank to reclaim; only advance to the next rank
  // once this rank has answered (handle_client_reclaim_reply wakes us).
  reclaim_errno = 0;
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // blocks until handle_client_reclaim_reply() signals
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;  // this rank is done (RECLAIM_OK); move on
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember which uuid we are reclaiming; finish_reclaim() adopts it
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14627
// Complete a reclaim started by start_reclaim(): tell every MDS session
// the reclaim is finished and adopt the reclaimed uuid as our own.  If
// start_reclaim() never recorded a uuid, just reset per-session state.
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // take over the reclaimed instance's identity
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14646
// Handle an MClientReclaimReply from an MDS: on success record the OSD
// epoch to wait for and the reclaimed session's addrs; on failure record
// the errno.  Either way, wake start_reclaim() which is blocked on
// waiting_for_reclaim.
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // keep the highest epoch any MDS asked us to wait for
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14671
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e OSD map epoch the MDS must reach before acting on releases
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14684
// Config options we want handle_conf_change() notifications for.
// NULL-terminated, as the md_config observer interface requires.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14697
14698 void Client::handle_conf_change(const ConfigProxy& conf,
14699 const std::set <std::string> &changed)
14700 {
14701 std::lock_guard lock(client_lock);
14702
14703 if (changed.count("client_cache_mid")) {
14704 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14705 }
14706 if (changed.count("client_acl_type")) {
14707 acl_type = NO_ACL;
14708 if (cct->_conf->client_acl_type == "posix_acl")
14709 acl_type = POSIX_ACL;
14710 }
14711 }
14712
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14717
// boost::intrusive_ptr hook: drop a reference via the owning client so
// the inode can be reclaimed when the last reference goes away.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14722
14723 mds_rank_t Client::_get_random_up_mds() const
14724 {
14725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14726
14727 std::set<mds_rank_t> up;
14728 mdsmap->get_up_mds_set(up);
14729
14730 if (up.empty())
14731 return MDS_RANK_NONE;
14732 std::set<mds_rank_t>::const_iterator p = up.begin();
14733 for (int n = rand() % up.size(); n; n--)
14734 ++p;
14735 return *p;
14736 }
14737
14738
// A StandaloneClient owns its own Objecter (unlike an embedding process
// that supplies one to the base Client).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14745
StandaloneClient::~StandaloneClient()
{
  // we created the objecter in our ctor, so we destroy it here
  delete objecter;
  objecter = nullptr;
}
14751
// Bring up the standalone client: objecter, dispatchers, and monitor
// session, then finish base-class initialization.  On monclient failure
// we are in a half-initialized state and must unwind what was started.
// Returns 0 on success or the monclient error.
int StandaloneClient::init()
{
  _pre_init();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14781
// Tear down in dependency order: the client first, then the services
// it uses (objecter, monitor client).
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}