// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/utsname.h>
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::lock_guard l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions")
      m_client->dump_mds_sessions(f);
    else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
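
/*
 * Illustrative usage (not part of the build): these hooks back the
 * client's admin socket commands, which can be invoked against a live
 * client through its asok file, e.g.
 *
 *   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_sessions
 *
 * The asok path shown is only an example; the real path depends on the
 * admin_socket configuration of the mount.
 */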


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
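
/*
 * A minimal sketch (illustrative, not part of the build) of the
 * interval_set allocation pattern used above: free ids are stored as
 * ranges, an id is carved out of a suitable range on allocation, and
 * released ids are merged back in.
 *
 *   #include "include/interval_set.h"
 *
 *   interval_set<ino_t> free_inos;
 *   free_inos.insert(1024, 1000);   // ids [1024, 2024) are free
 *   ino_t ino = free_inos.range_start();
 *   free_inos.erase(ino);           // allocate one id
 *   free_inos.insert(ino);          // release it again
 */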

/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are all the same, so we distinguish the mount points by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10 bits (0x3ff) of the "root inodes".
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}
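
/*
 * Illustrative note (not part of the build): the ObjectCacher sizing
 * above is taken directly from the client_oc_* options, so the data
 * cache can be tuned per mount, e.g. in ceph.conf:
 *
 *   [client]
 *     client_oc_size = 209715200        ; bytes of cached data
 *     client_oc_max_dirty = 104857600   ; bytes of dirty data
 *     client_oc_target_dirty = 8388608  ; start flushing at this much
 *
 * The values shown are examples only, not recommendations.
 */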


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}

int Client::init()
{
  _pre_init();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}

void Client::_finish_init()
{
  {
    std::lock_guard l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  std::lock_guard l{client_lock};
  initialized = true;
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
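
/*
 * Illustrative note (not part of the build): the dentry LRU trimmed
 * above is bounded by the client_cache_size option (a dentry count,
 * not bytes), with client_cache_mid setting the LRU midpoint, e.g.
 *
 *   [client]
 *     client_cache_size = 16384
 *
 * The value shown is an example only.
 */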

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}
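
/*
 * A minimal sketch (illustrative, not part of the build) of the
 * erase-while-iterating idiom used by the two helpers above: the
 * post-increment hands erase() the old iterator while the loop
 * variable has already moved on, so no invalidated iterator is
 * ever dereferenced.
 *
 *   std::map<int, int> m = {{1, 10}, {2, 20}};
 *   for (auto p = m.begin(); p != m.end(); )
 *     if (p->second == 20)
 *       m.erase(p++);  // erase old position, keep iterating
 *     else
 *       ++p;
 */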

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
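
/*
 * Illustrative note (not part of the build): a dentry lease lets the
 * client trust a cached name -> inode mapping without re-asking the
 * MDS. The TTL computed above is just the request send time plus the
 * MDS-granted duration:
 *
 *   utime_t ttl = from;                          // when the request was sent
 *   ttl += (float)dlease->duration_ms / 1000.0;
 *
 * so a 30000 ms lease granted at time t covers lookups until t + 30 s,
 * provided the session's cap_gen has not moved on.
 */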


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
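
/*
 * Illustrative note (not part of the build): dir_result_t::make_fpos
 * packs two pieces into the 64-bit readdir offsets used above: a high
 * part (the fragment, or the dentry-name hash when the MDS returns
 * entries in hash order) and a low per-fragment offset, plus a marker
 * distinguishing hash-order offsets. Offsets 0 and 1 are reserved for
 * "." and "..", which is why fresh fragments start at next_offset = 2.
 */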

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
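
/*
 * Illustrative summary (not part of the build) of the target-MDS
 * choice above, in priority order:
 *   1. an explicit resend_mds set by a forward or retry;
 *   2. the dirfrag map, when the dentry name hashes to a fragment
 *      whose authority is already known;
 *   3. the MDS holding the inode's auth cap, or any cap we hold;
 *   4. a random up MDS as a last resort.
 */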


void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||           // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
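
/*
 * A minimal caller sketch (illustrative, not part of the build),
 * following the pattern used throughout this file, e.g. by getattr:
 *
 *   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
 *   filepath path;
 *   in->make_nosnap_relative_path(path);
 *   req->set_filepath(path);
 *   req->set_inode(in);
 *   req->head.args.getattr.mask = mask;
 *   InodeRef target;
 *   int r = make_request(req, perms, &target);
 *
 * make_request() consumes the caller's request reference and blocks
 * until a reply, forward, or abort wakes it.
 */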

void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
        oldest_tid = 0;
        break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
        oldest_tid = p->first;
        break;
      }
      ++p;
    }
  }
  put_request(req);
}

void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}

int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
1900
1901 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1902 mds_rank_t mds, int drop, int unless)
1903 {
1904 ldout(cct, 20) << __func__ << " enter(dn:"
1905 << dn << ")" << dendl;
1906 int released = 0;
1907 if (dn->dir)
1908 released = encode_inode_release(dn->dir->parent_inode, req,
1909 mds, drop, unless, 1);
1910 if (released && dn->lease_mds == mds) {
1911 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1912 auto& rel = req->cap_releases.back();
1913 rel.item.dname_len = dn->name.length();
1914 rel.item.dname_seq = dn->lease_seq;
1915 rel.dname = dn->name;
1916 }
1917 ldout(cct, 25) << __func__ << " exit(dn:"
1918 << dn << ")" << dendl;
1919 }
1920
1921
1922 /*
1923 * This requires the MClientRequest *request member to be set.
1924 * It will error out horribly without one.
1925 * Additionally, if you set any *drop member, you'd better have
1926 * set the corresponding dentry!
1927 */
1928 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1929 {
1930 ldout(cct, 20) << __func__ << " enter (req: "
1931 << req << ", mds: " << mds << ")" << dendl;
1932 if (req->inode_drop && req->inode())
1933 encode_inode_release(req->inode(), req,
1934 mds, req->inode_drop,
1935 req->inode_unless);
1936
1937 if (req->old_inode_drop && req->old_inode())
1938 encode_inode_release(req->old_inode(), req,
1939 mds, req->old_inode_drop,
1940 req->old_inode_unless);
1941 if (req->other_inode_drop && req->other_inode())
1942 encode_inode_release(req->other_inode(), req,
1943 mds, req->other_inode_drop,
1944 req->other_inode_unless);
1945
1946 if (req->dentry_drop && req->dentry())
1947 encode_dentry_release(req->dentry(), req,
1948 mds, req->dentry_drop,
1949 req->dentry_unless);
1950
1951 if (req->old_dentry_drop && req->old_dentry())
1952 encode_dentry_release(req->old_dentry(), req,
1953 mds, req->old_dentry_drop,
1954 req->old_dentry_unless);
1955 ldout(cct, 25) << __func__ << " exit (req: "
1956 << req << ", mds: " << mds << ")" << dendl;
1957 }
1958
1959 bool Client::have_open_session(mds_rank_t mds)
1960 {
1961 const auto &it = mds_sessions.find(mds);
1962 return it != mds_sessions.end() &&
1963 (it->second.state == MetaSession::STATE_OPEN ||
1964 it->second.state == MetaSession::STATE_STALE);
1965 }
1966
1967 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1968 {
1969 const auto &it = mds_sessions.find(mds);
1970 if (it == mds_sessions.end() || it->second.con != con) {
1971 return NULL;
1972 } else {
1973 return &it->second;
1974 }
1975 }
1976
1977 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1978 {
1979 auto it = mds_sessions.find(mds);
1980 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1981 }
1982
1983 /**
1984 * Populate a map of strings with client-identifying metadata,
1985 * such as the hostname. Call this once at initialization.
1986 */
1987 void Client::populate_metadata(const std::string &mount_root)
1988 {
1989 // Hostname
1990 struct utsname u;
1991 int r = uname(&u);
1992 if (r >= 0) {
1993 metadata["hostname"] = u.nodename;
1994 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1995 } else {
1996 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(errno) << ")" << dendl; // uname() returns -1 and reports the error in errno
1997 }
1998
1999 metadata["pid"] = stringify(getpid());
2000
2001 // Ceph entity id (the '0' in "client.0")
2002 metadata["entity_id"] = cct->_conf->name.get_id();
2003
2004 // Our mount position
2005 if (!mount_root.empty()) {
2006 metadata["root"] = mount_root;
2007 }
2008
2009 // Ceph version
2010 metadata["ceph_version"] = pretty_version_to_str();
2011 metadata["ceph_sha1"] = git_version_to_str();
2012
2013 // Apply any metadata from the user's configured overrides
2014 std::vector<std::string> tokens;
2015 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2016 for (const auto &i : tokens) {
2017 auto eqpos = i.find("=");
2018 // Throw out anything that isn't of the form "<str>=<str>"
2019 if (eqpos == 0 || eqpos == std::string::npos || eqpos + 1 == i.size()) {
2020 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2021 continue;
2022 }
2023 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2024 }
2025 }
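/*
 * For reference, client_metadata accepts a comma-separated list of
 * "key=value" pairs, e.g. (hypothetical values):
 *
 *   [client]
 *   client_metadata = "rack=r1,datacenter=dc1"
 *
 * which yields metadata["rack"] = "r1" and metadata["datacenter"] = "dc1";
 * malformed entries such as "=v", "kv" or "k=" are skipped by the checks
 * above.
 */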
2026
2027 /**
2028 * Optionally add or override client metadata fields.
2029 */
2030 void Client::update_metadata(std::string const &k, std::string const &v)
2031 {
2032 std::lock_guard l(client_lock);
2033 ceph_assert(initialized);
2034
2035 auto it = metadata.find(k);
2036 if (it != metadata.end()) {
2037 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2038 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2039 }
2040
2041 metadata[k] = v;
2042 }
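/*
 * A minimal usage sketch (hypothetical keys and values); update_metadata()
 * takes client_lock itself, so callers need not hold it:
 *
 *   client->update_metadata("mount_point", "/mnt/cephfs");
 *   client->update_metadata("kernel_version", "5.4.0");
 */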
2043
2044 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2045 {
2046 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2047 auto addrs = mdsmap->get_addrs(mds);
2048 auto em = mds_sessions.emplace(std::piecewise_construct,
2049 std::forward_as_tuple(mds),
2050 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2051 ceph_assert(em.second); /* not already present */
2052 MetaSession *session = &em.first->second;
2053
2054 // Maybe skip sending a request to open if this MDS daemon
2055 // has previously sent us a REJECT.
2056 if (rejected_by_mds.count(mds)) {
2057 if (rejected_by_mds[mds] == session->addrs) {
2058 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
2059 "because we were rejected" << dendl;
2060 return session;
2061 } else {
2062 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
2063 "rejected us, trying with new inst" << dendl;
2064 rejected_by_mds.erase(mds);
2065 }
2066 }
2067
2068 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2069 m->metadata = metadata;
2070 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2071 session->con->send_message2(std::move(m));
2072 return session;
2073 }
2074
2075 void Client::_close_mds_session(MetaSession *s)
2076 {
2077 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2078 s->state = MetaSession::STATE_CLOSING;
2079 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2080 }
2081
2082 void Client::_closed_mds_session(MetaSession *s)
2083 {
2084 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2085 s->state = MetaSession::STATE_CLOSED;
2086 s->con->mark_down();
2087 signal_context_list(s->waiting_for_open);
2088 mount_cond.notify_all();
2089 remove_session_caps(s);
2090 kick_requests_closed(s);
2091 mds_sessions.erase(s->mds_num);
2092 }
2093
2094 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2095 {
2096 mds_rank_t from = mds_rank_t(m->get_source().num());
2097 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2098
2099 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2100 if (!session) {
2101 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2102 return;
2103 }
2104
2105 switch (m->get_op()) {
2106 case CEPH_SESSION_OPEN:
2107 {
2108 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2109 missing_features -= m->supported_features;
2110 if (!missing_features.empty()) {
2111 lderr(cct) << "mds." << from << " lacks required features '"
2112 << missing_features << "', closing session " << dendl;
2113 rejected_by_mds[session->mds_num] = session->addrs;
2114 _close_mds_session(session);
2115 _closed_mds_session(session);
2116 break;
2117 }
2118 session->mds_features = std::move(m->supported_features);
2119
2120 renew_caps(session);
2121 session->state = MetaSession::STATE_OPEN;
2122 if (unmounting)
2123 mount_cond.notify_all();
2124 else
2125 connect_mds_targets(from);
2126 signal_context_list(session->waiting_for_open);
2127 break;
2128 }
2129
2130 case CEPH_SESSION_CLOSE:
2131 _closed_mds_session(session);
2132 break;
2133
2134 case CEPH_SESSION_RENEWCAPS:
2135 if (session->cap_renew_seq == m->get_seq()) {
2136 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2137 session->cap_ttl =
2138 session->last_cap_renew_request + mdsmap->get_session_timeout();
2139 if (was_stale)
2140 wake_up_session_caps(session, false);
2141 }
2142 break;
2143
2144 case CEPH_SESSION_STALE:
2145 // invalidate session caps/leases
2146 session->cap_gen++;
2147 session->cap_ttl = ceph_clock_now();
2148 session->cap_ttl -= 1;
2149 renew_caps(session);
2150 break;
2151
2152 case CEPH_SESSION_RECALL_STATE:
2153 trim_caps(session, m->get_max_caps());
2154 break;
2155
2156 case CEPH_SESSION_FLUSHMSG:
2157 /* flush cap release */
2158 if (auto& m = session->release; m) {
2159 session->con->send_message2(std::move(m));
2160 }
2161 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2162 break;
2163
2164 case CEPH_SESSION_FORCE_RO:
2165 force_session_readonly(session);
2166 break;
2167
2168 case CEPH_SESSION_REJECT:
2169 {
2170 std::string_view error_str;
2171 auto it = m->metadata.find("error_string");
2172 if (it != m->metadata.end())
2173 error_str = it->second;
2174 else
2175 error_str = "unknown error";
2176 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2177
2178 rejected_by_mds[session->mds_num] = session->addrs;
2179 _closed_mds_session(session);
2180 }
2181 break;
2182
2183 default:
2184 ceph_abort();
2185 }
2186 }
2187
2188 bool Client::_any_stale_sessions() const
2189 {
2190 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2191
2192 for (const auto &p : mds_sessions) {
2193 if (p.second.state == MetaSession::STATE_STALE) {
2194 return true;
2195 }
2196 }
2197
2198 return false;
2199 }
2200
2201 void Client::_kick_stale_sessions()
2202 {
2203 ldout(cct, 1) << __func__ << dendl;
2204
2205 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2206 MetaSession &s = it->second;
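// advance the iterator first: _closed_mds_session() erases this
// entry from mds_sessions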
2207 ++it;
2208 if (s.state == MetaSession::STATE_STALE)
2209 _closed_mds_session(&s);
2210 }
2211 }
2212
2213 void Client::send_request(MetaRequest *request, MetaSession *session,
2214 bool drop_cap_releases)
2215 {
2216 // make the request
2217 mds_rank_t mds = session->mds_num;
2218 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2219 << " for mds." << mds << dendl;
2220 auto r = build_client_request(request);
2221 if (request->dentry()) {
2222 r->set_dentry_wanted();
2223 }
2224 if (request->got_unsafe) {
2225 r->set_replayed_op();
2226 if (request->target)
2227 r->head.ino = request->target->ino;
2228 } else {
2229 encode_cap_releases(request, mds);
2230 if (drop_cap_releases) // we haven't sent the cap reconnect yet; drop cap releases
2231 request->cap_releases.clear();
2232 else
2233 r->releases.swap(request->cap_releases);
2234 }
2235 r->set_mdsmap_epoch(mdsmap->get_epoch());
2236 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2237 objecter->with_osdmap([r](const OSDMap& o) {
2238 r->set_osdmap_epoch(o.get_epoch());
2239 });
2240 }
2241
2242 if (request->mds == -1) {
2243 request->sent_stamp = ceph_clock_now();
2244 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2245 }
2246 request->mds = mds;
2247
2248 Inode *in = request->inode();
2249 if (in) {
2250 auto it = in->caps.find(mds);
2251 if (it != in->caps.end()) {
2252 request->sent_on_mseq = it->second.mseq;
2253 }
2254 }
2255
2256 session->requests.push_back(&request->item);
2257
2258 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2259 session->con->send_message2(std::move(r));
2260 }
2261
2262 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2263 {
2264 auto req = make_message<MClientRequest>(request->get_op());
2265 req->set_tid(request->tid);
2266 req->set_stamp(request->op_stamp);
2267 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2268
2269 // if the filepaths haven't been set, set them!
2270 if (request->path.empty()) {
2271 Inode *in = request->inode();
2272 Dentry *de = request->dentry();
2273 if (in)
2274 in->make_nosnap_relative_path(request->path);
2275 else if (de) {
2276 if (de->inode)
2277 de->inode->make_nosnap_relative_path(request->path);
2278 else if (de->dir) {
2279 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2280 request->path.push_dentry(de->name);
2281 }
2282 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2283 << " No path, inode, or appropriately-endowed dentry given!"
2284 << dendl;
2285 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2286 << " No path, inode, or dentry given!"
2287 << dendl;
2288 }
2289 req->set_filepath(request->get_filepath());
2290 req->set_filepath2(request->get_filepath2());
2291 req->set_data(request->data);
2292 req->set_retry_attempt(request->retry_attempt++);
2293 req->head.num_fwd = request->num_fwd;
2294 const gid_t *_gids;
2295 int gid_count = request->perms.get_gids(&_gids);
2296 req->set_gid_list(gid_count, _gids);
2297 return req;
2298 }
2299
2300
2301
2302 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2303 {
2304 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2305 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2306 if (!session) {
2307 return;
2308 }
2309 ceph_tid_t tid = fwd->get_tid();
2310
2311 if (mds_requests.count(tid) == 0) {
2312 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2313 return;
2314 }
2315
2316 MetaRequest *request = mds_requests[tid];
2317 ceph_assert(request);
2318
2319 // reset retry counter
2320 request->retry_attempt = 0;
2321
2322 // the request was not forwarded on our behalf, or the dest mds has no
2323 // session for us; resend it to the new target ourselves.
2324 ldout(cct, 10) << __func__ << " tid " << tid
2325 << " fwd " << fwd->get_num_fwd()
2326 << " to mds." << fwd->get_dest_mds()
2327 << ", resending to " << fwd->get_dest_mds()
2328 << dendl;
2329
2330 request->mds = -1;
2331 request->item.remove_myself();
2332 request->num_fwd = fwd->get_num_fwd();
2333 request->resend_mds = fwd->get_dest_mds();
2334 request->caller_cond->notify_all();
2335 }
2336
2337 bool Client::is_dir_operation(MetaRequest *req)
2338 {
2339 int op = req->get_op();
2340 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2341 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2342 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2343 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2344 return true;
2345 return false;
2346 }
2347
2348 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2349 {
2350 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2351 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2352 if (!session) {
2353 return;
2354 }
2355
2356 ceph_tid_t tid = reply->get_tid();
2357 bool is_safe = reply->is_safe();
2358
2359 if (mds_requests.count(tid) == 0) {
2360 lderr(cct) << __func__ << " no pending request on tid " << tid
2361 << " safe is:" << is_safe << dendl;
2362 return;
2363 }
2364 MetaRequest *request = mds_requests.at(tid);
2365
2366 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2367 << " tid " << tid << dendl;
2368
2369 if (request->got_unsafe && !is_safe) {
2370 //duplicate response
2371 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2372 << mds_num << " safe:" << is_safe << dendl;
2373 return;
2374 }
2375
2376 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2377 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2378 << " from mds." << request->mds << dendl;
2379 request->send_to_auth = true;
2380 request->resend_mds = choose_target_mds(request);
2381 Inode *in = request->inode();
2382 std::map<mds_rank_t, Cap>::const_iterator it;
2383 if (request->resend_mds >= 0 &&
2384 request->resend_mds == request->mds &&
2385 (in == NULL ||
2386 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2387 request->sent_on_mseq == it->second.mseq)) {
2388 ldout(cct, 20) << "have to return ESTALE" << dendl;
2389 } else {
2390 request->caller_cond->notify_all();
2391 return;
2392 }
2393 }
2394
2395 ceph_assert(!request->reply);
2396 request->reply = reply;
2397 insert_trace(request, session);
2398
2399 // Handle unsafe reply
2400 if (!is_safe) {
2401 request->got_unsafe = true;
2402 session->unsafe_requests.push_back(&request->unsafe_item);
2403 if (is_dir_operation(request)) {
2404 Inode *dir = request->inode();
2405 ceph_assert(dir);
2406 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2407 }
2408 if (request->target) {
2409 InodeRef &in = request->target;
2410 in->unsafe_ops.push_back(&request->unsafe_target_item);
2411 }
2412 }
2413
2414 // Only signal the caller once (on the first reply):
2415 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2416 if (!is_safe || !request->got_unsafe) {
2417 ceph::condition_variable cond;
2418 request->dispatch_cond = &cond;
2419
2420 // wake up waiter
2421 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2422 request->caller_cond->notify_all();
2423
2424 // wake for kick back
2425 std::unique_lock l{client_lock, std::adopt_lock};
2426 cond.wait(l, [tid, request, &cond, this] {
2427 if (request->dispatch_cond) {
2428 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2429 << tid << " " << &cond << dendl;
2430 }
2431 return !request->dispatch_cond;
2432 });
2433 l.release();
2434 }
2435
2436 if (is_safe) {
2437 // the filesystem change is committed to disk
2438 // we're done, clean up
2439 if (request->got_unsafe) {
2440 request->unsafe_item.remove_myself();
2441 request->unsafe_dir_item.remove_myself();
2442 request->unsafe_target_item.remove_myself();
2443 signal_cond_list(request->waitfor_safe);
2444 }
2445 request->item.remove_myself();
2446 unregister_request(request);
2447 }
2448 if (unmounting)
2449 mount_cond.notify_all();
2450 }
2451
2452 void Client::_handle_full_flag(int64_t pool)
2453 {
2454 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2455 << "on " << pool << dendl;
2456 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2457 // to do this rather than blocking, because otherwise when we fill up we
2458 // potentially lock caps forever on files with dirty pages, and we need
2459 // to be able to release those caps to the MDS so that it can delete files
2460 // and free up space.
2461 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2462
2463 // For all inodes with layouts in this pool and a pending flush write op
2464 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2465 // from ObjectCacher so that it doesn't re-issue the write in response to
2466 // the ENOSPC error.
2467 // Fortunately since we're cancelling everything in a given pool, we don't
2468 // need to know which ops belong to which ObjectSet, we can just blow all
2469 // the un-flushed cached data away and mark any dirty inodes' async_err
2470 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2471 // affecting this pool, and all the objectsets we're purging were also
2472 // in this pool.
2473 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2474 i != inode_map.end(); ++i)
2475 {
2476 Inode *inode = i->second;
2477 if (inode->oset.dirty_or_tx
2478 && (pool == -1 || inode->layout.pool_id == pool)) {
2479 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2480 << " has dirty objects, purging and setting ENOSPC" << dendl;
2481 objectcacher->purge_set(&inode->oset);
2482 inode->set_async_err(-ENOSPC);
2483 }
2484 }
2485
2486 if (cancelled_epoch != (epoch_t)-1) {
2487 set_cap_epoch_barrier(cancelled_epoch);
2488 }
2489 }
2490
2491 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2492 {
2493 std::set<entity_addr_t> new_blacklists;
2494 objecter->consume_blacklist_events(&new_blacklists);
2495
2496 const auto myaddrs = messenger->get_myaddrs();
2497 bool new_blacklist = false;
2498 bool prenautilus = objecter->with_osdmap(
2499 [&](const OSDMap& o) {
2500 return o.require_osd_release < ceph_release_t::nautilus;
2501 });
2502 if (!blacklisted) {
2503 for (auto a : myaddrs.v) {
2504 // blacklist entries are always TYPE_ANY for nautilus+
2505 a.set_type(entity_addr_t::TYPE_ANY);
2506 if (new_blacklists.count(a)) {
2507 new_blacklist = true;
2508 break;
2509 }
2510 if (prenautilus) {
2511 // ...except pre-nautilus, they were TYPE_LEGACY
2512 a.set_type(entity_addr_t::TYPE_LEGACY);
2513 if (new_blacklists.count(a)) {
2514 new_blacklist = true;
2515 break;
2516 }
2517 }
2518 }
2519 }
2520 if (new_blacklist) {
2521 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2522 return o.get_epoch();
2523 });
2524 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2525 blacklisted = true;
2526
2527 _abort_mds_sessions(-EBLACKLISTED);
2528
2529 // Since we know all our OSD ops will fail, cancel them all preemptively,
2530 // so that on an unhealthy cluster we can umount promptly even if e.g.
2531 // some PGs were inaccessible.
2532 objecter->op_cancel_writes(-EBLACKLISTED);
2533
2534 } else if (blacklisted) {
2535 // Handle case where we were blacklisted but no longer are
2536 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2537 return o.is_blacklisted(myaddrs);});
2538 }
2539
2540 // Keep subscribing to subsequent osdmaps until this client is
2541 // no longer blacklisted.
2542 if (blacklisted) {
2543 objecter->maybe_request_map();
2544 }
2545
2546 if (objecter->osdmap_full_flag()) {
2547 _handle_full_flag(-1);
2548 } else {
2549 // Accumulate local list of full pools so that I can drop
2550 // the objecter lock before re-entering objecter in
2551 // cancel_writes
2552 std::vector<int64_t> full_pools;
2553
2554 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2555 for (const auto& kv : o.get_pools()) {
2556 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2557 full_pools.push_back(kv.first);
2558 }
2559 }
2560 });
2561
2562 for (auto p : full_pools)
2563 _handle_full_flag(p);
2564
2565 // Subscribe to subsequent maps to watch for the full flag going
2566 // away. For the global full flag objecter does this for us, but
2567 // it pays no attention to the per-pool full flag so in this branch
2568 // we do it ourselves.
2569 if (!full_pools.empty()) {
2570 objecter->maybe_request_map();
2571 }
2572 }
2573 }
2574
2575
2576 // ------------------------
2577 // incoming messages
2578
2579
2580 bool Client::ms_dispatch2(const MessageRef &m)
2581 {
2582 std::lock_guard l(client_lock);
2583 if (!initialized) {
2584 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2585 return true;
2586 }
2587
2588 switch (m->get_type()) {
2589 // mounting and mds sessions
2590 case CEPH_MSG_MDS_MAP:
2591 handle_mds_map(ref_cast<MMDSMap>(m));
2592 break;
2593 case CEPH_MSG_FS_MAP:
2594 handle_fs_map(ref_cast<MFSMap>(m));
2595 break;
2596 case CEPH_MSG_FS_MAP_USER:
2597 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2598 break;
2599 case CEPH_MSG_CLIENT_SESSION:
2600 handle_client_session(ref_cast<MClientSession>(m));
2601 break;
2602
2603 case CEPH_MSG_OSD_MAP:
2604 handle_osd_map(ref_cast<MOSDMap>(m));
2605 break;
2606
2607 // requests
2608 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2609 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2610 break;
2611 case CEPH_MSG_CLIENT_REPLY:
2612 handle_client_reply(ref_cast<MClientReply>(m));
2613 break;
2614
2615 // reclaim reply
2616 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2617 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2618 break;
2619
2620 case CEPH_MSG_CLIENT_SNAP:
2621 handle_snap(ref_cast<MClientSnap>(m));
2622 break;
2623 case CEPH_MSG_CLIENT_CAPS:
2624 handle_caps(ref_cast<MClientCaps>(m));
2625 break;
2626 case CEPH_MSG_CLIENT_LEASE:
2627 handle_lease(ref_cast<MClientLease>(m));
2628 break;
2629 case MSG_COMMAND_REPLY:
2630 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2631 handle_command_reply(ref_cast<MCommandReply>(m));
2632 } else {
2633 return false;
2634 }
2635 break;
2636 case CEPH_MSG_CLIENT_QUOTA:
2637 handle_quota(ref_cast<MClientQuota>(m));
2638 break;
2639
2640 default:
2641 return false;
2642 }
2643
2644 // unmounting?
2645 if (unmounting) {
2646 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2647 << "+" << inode_map.size() << dendl;
2648 long unsigned size = lru.lru_get_size() + inode_map.size();
2649 trim_cache();
2650 if (size > lru.lru_get_size() + inode_map.size()) {
2651 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2652 mount_cond.notify_all();
2653 } else {
2654 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2655 << "+" << inode_map.size() << dendl;
2656 }
2657 }
2658
2659 return true;
2660 }
2661
2662 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2663 {
2664 fsmap.reset(new FSMap(m->get_fsmap()));
2665
2666 signal_cond_list(waiting_for_fsmap);
2667
2668 monclient->sub_got("fsmap", fsmap->get_epoch());
2669 }
2670
2671 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2672 {
2673 fsmap_user.reset(new FSMapUser);
2674 *fsmap_user = m->get_fsmap();
2675
2676 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2677 signal_cond_list(waiting_for_fsmap);
2678 }
2679
2680 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2681 {
2682 mds_gid_t old_inc, new_inc;
2683 if (m->get_epoch() <= mdsmap->get_epoch()) {
2684 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2685 << " is identical to or older than our "
2686 << mdsmap->get_epoch() << dendl;
2687 return;
2688 }
2689
2690 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2691
2692 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2693 oldmap.swap(mdsmap);
2694
2695 mdsmap->decode(m->get_encoded());
2696
2697 // Cancel any commands for missing or laggy GIDs
2698 std::list<ceph_tid_t> cancel_ops;
2699 auto &commands = command_table.get_commands();
2700 for (const auto &i : commands) {
2701 auto &op = i.second;
2702 const mds_gid_t op_mds_gid = op.mds_gid;
2703 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2704 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2705 cancel_ops.push_back(i.first);
2706 if (op.outs) {
2707 std::ostringstream ss;
2708 ss << "MDS " << op_mds_gid << " went away";
2709 *(op.outs) = ss.str();
2710 }
2711 op.con->mark_down();
2712 if (op.on_finish) {
2713 op.on_finish->complete(-ETIMEDOUT);
2714 }
2715 }
2716 }
2717
2718 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2719 i != cancel_ops.end(); ++i) {
2720 command_table.erase(*i);
2721 }
2722
2723 // reset session
2724 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2725 mds_rank_t mds = p->first;
2726 MetaSession *session = &p->second;
2727 ++p;
2728
2729 int oldstate = oldmap->get_state(mds);
2730 int newstate = mdsmap->get_state(mds);
2731 if (!mdsmap->is_up(mds)) {
2732 session->con->mark_down();
2733 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2734 old_inc = oldmap->get_incarnation(mds);
2735 new_inc = mdsmap->get_incarnation(mds);
2736 if (old_inc != new_inc) {
2737 ldout(cct, 1) << "mds incarnation changed from "
2738 << old_inc << " to " << new_inc << dendl;
2739 oldstate = MDSMap::STATE_NULL;
2740 }
2741 session->con->mark_down();
2742 session->addrs = mdsmap->get_addrs(mds);
2743 // When new MDS starts to take over, notify kernel to trim unused entries
2744 // in its dcache/icache. Hopefully, the kernel will release some unused
2745 // inodes before the new MDS enters reconnect state.
2746 trim_cache_for_reconnect(session);
2747 } else if (oldstate == newstate)
2748 continue; // no change
2749
2750 session->mds_state = newstate;
2751 if (newstate == MDSMap::STATE_RECONNECT) {
2752 session->con = messenger->connect_to_mds(session->addrs);
2753 send_reconnect(session);
2754 } else if (newstate > MDSMap::STATE_RECONNECT) {
2755 if (oldstate < MDSMap::STATE_RECONNECT) {
2756 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2757 _closed_mds_session(session);
2758 continue;
2759 }
2760 if (newstate >= MDSMap::STATE_ACTIVE) {
2761 if (oldstate < MDSMap::STATE_ACTIVE) {
2762 // kick new requests
2763 kick_requests(session);
2764 kick_flushing_caps(session);
2765 signal_context_list(session->waiting_for_open);
2766 wake_up_session_caps(session, true);
2767 }
2768 connect_mds_targets(mds);
2769 }
2770 } else if (newstate == MDSMap::STATE_NULL &&
2771 mds >= mdsmap->get_max_mds()) {
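// the rank is gone and lies beyond the new max_mds, i.e. the
// cluster shrank; drop its session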
2772 _closed_mds_session(session);
2773 }
2774 }
2775
2776 // kick any waiting threads
2777 signal_cond_list(waiting_for_mdsmap);
2778
2779 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2780 }
2781
2782 void Client::send_reconnect(MetaSession *session)
2783 {
2784 mds_rank_t mds = session->mds_num;
2785 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2786
2787 // trim unused caps to reduce MDS's cache rejoin time
2788 trim_cache_for_reconnect(session);
2789
2790 session->readonly = false;
2791
2792 session->release.reset();
2793
2794 // reset my cap seq number
2795 session->seq = 0;
2796 //connect to the mds' offload targets
2797 connect_mds_targets(mds);
2798 //make sure unsafe requests get saved
2799 resend_unsafe_requests(session);
2800
2801 early_kick_flushing_caps(session);
2802
2803 auto m = make_message<MClientReconnect>();
2804 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2805
2806 // i have an open session.
2807 ceph::unordered_set<inodeno_t> did_snaprealm;
2808 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2809 p != inode_map.end();
2810 ++p) {
2811 Inode *in = p->second;
2812 auto it = in->caps.find(mds);
2813 if (it != in->caps.end()) {
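// split the reconnect across several messages once this one nears
// ~1 GiB (INT_MAX >> 1); only allowed when the MDS supports
// CEPHFS_FEATURE_MULTI_RECONNECT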
2814 if (allow_multi &&
2815 m->get_approx_size() >=
2816 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2817 m->mark_more();
2818 session->con->send_message2(std::move(m));
2819
2820 m = make_message<MClientReconnect>();
2821 }
2822
2823 Cap &cap = it->second;
2824 ldout(cct, 10) << " caps on " << p->first
2825 << " " << ccap_string(cap.issued)
2826 << " wants " << ccap_string(in->caps_wanted())
2827 << dendl;
2828 filepath path;
2829 in->make_long_path(path);
2830 ldout(cct, 10) << " path " << path << dendl;
2831
2832 bufferlist flockbl;
2833 _encode_filelocks(in, flockbl);
2834
2835 cap.seq = 0; // reset seq.
2836 cap.issue_seq = 0; // reset seq.
2837 cap.mseq = 0; // reset seq.
2838 // cap gen should catch up with session cap_gen
2839 if (cap.gen < session->cap_gen) {
2840 cap.gen = session->cap_gen;
2841 cap.issued = cap.implemented = CEPH_CAP_PIN;
2842 } else {
2843 cap.issued = cap.implemented;
2844 }
2845 snapid_t snap_follows = 0;
2846 if (!in->cap_snaps.empty())
2847 snap_follows = in->cap_snaps.begin()->first;
2848
2849 m->add_cap(p->first.ino,
2850 cap.cap_id,
2851 path.get_ino(), path.get_path(), // ino
2852 in->caps_wanted(), // wanted
2853 cap.issued, // issued
2854 in->snaprealm->ino,
2855 snap_follows,
2856 flockbl);
2857
2858 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2859 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2860 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2861 did_snaprealm.insert(in->snaprealm->ino);
2862 }
2863 }
2864 }
2865
2866 if (!allow_multi)
2867 m->set_encoding_version(0); // use connection features to choose encoding
2868 session->con->send_message2(std::move(m));
2869
2870 mount_cond.notify_all();
2871
2872 if (session->reclaim_state == MetaSession::RECLAIMING)
2873 signal_cond_list(waiting_for_reclaim);
2874 }
2875
2876
2877 void Client::kick_requests(MetaSession *session)
2878 {
2879 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2880 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2881 p != mds_requests.end();
2882 ++p) {
2883 MetaRequest *req = p->second;
2884 if (req->got_unsafe)
2885 continue;
2886 if (req->aborted()) {
2887 if (req->caller_cond) {
2888 req->kick = true;
2889 req->caller_cond->notify_all();
2890 }
2891 continue;
2892 }
2893 if (req->retry_attempt > 0)
2894 continue; // new requests only
2895 if (req->mds == session->mds_num) {
2896 send_request(p->second, session);
2897 }
2898 }
2899 }
2900
2901 void Client::resend_unsafe_requests(MetaSession *session)
2902 {
2903 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2904 !iter.end();
2905 ++iter)
2906 send_request(*iter, session);
2907
2908 // also re-send old requests when the MDS enters the reconnect stage, so
2909 // that it can process completed requests in the clientreplay stage.
2910 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2911 p != mds_requests.end();
2912 ++p) {
2913 MetaRequest *req = p->second;
2914 if (req->got_unsafe)
2915 continue;
2916 if (req->aborted())
2917 continue;
2918 if (req->retry_attempt == 0)
2919 continue; // old requests only
2920 if (req->mds == session->mds_num)
2921 send_request(req, session, true);
2922 }
2923 }
2924
2925 void Client::wait_unsafe_requests()
2926 {
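// Waiting on the newest unsafe request of each session suffices: the
// MDS commits requests in order, so once the last one is safe all of
// its predecessors are too.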
2927 list<MetaRequest*> last_unsafe_reqs;
2928 for (const auto &p : mds_sessions) {
2929 const MetaSession &s = p.second;
2930 if (!s.unsafe_requests.empty()) {
2931 MetaRequest *req = s.unsafe_requests.back();
2932 req->get();
2933 last_unsafe_reqs.push_back(req);
2934 }
2935 }
2936
2937 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2938 p != last_unsafe_reqs.end();
2939 ++p) {
2940 MetaRequest *req = *p;
2941 if (req->unsafe_item.is_on_list())
2942 wait_on_list(req->waitfor_safe);
2943 put_request(req);
2944 }
2945 }
2946
2947 void Client::kick_requests_closed(MetaSession *session)
2948 {
2949 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2950 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2951 p != mds_requests.end(); ) {
2952 MetaRequest *req = p->second;
2953 ++p;
2954 if (req->mds == session->mds_num) {
2955 if (req->caller_cond) {
2956 req->kick = true;
2957 req->caller_cond->notify_all();
2958 }
2959 req->item.remove_myself();
2960 if (req->got_unsafe) {
2961 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2962 req->unsafe_item.remove_myself();
2963 if (is_dir_operation(req)) {
2964 Inode *dir = req->inode();
2965 ceph_assert(dir);
2966 dir->set_async_err(-EIO);
2967 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2968 << dir->ino << " " << req->get_tid() << dendl;
2969 req->unsafe_dir_item.remove_myself();
2970 }
2971 if (req->target) {
2972 InodeRef &in = req->target;
2973 in->set_async_err(-EIO);
2974 lderr(cct) << "kick_requests_closed drop req of inode : "
2975 << in->ino << " " << req->get_tid() << dendl;
2976 req->unsafe_target_item.remove_myself();
2977 }
2978 signal_cond_list(req->waitfor_safe);
2979 unregister_request(req);
2980 }
2981 }
2982 }
2983 ceph_assert(session->requests.empty());
2984 ceph_assert(session->unsafe_requests.empty());
2985 }
2986
2987
2988
2989
2990 /************
2991 * leases
2992 */
2993
2994 void Client::got_mds_push(MetaSession *s)
2995 {
2996 s->seq++;
2997 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2998 if (s->state == MetaSession::STATE_CLOSING) {
2999 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3000 }
3001 }
3002
3003 void Client::handle_lease(const MConstRef<MClientLease>& m)
3004 {
3005 ldout(cct, 10) << __func__ << " " << *m << dendl;
3006
3007 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3008
3009 mds_rank_t mds = mds_rank_t(m->get_source().num());
3010 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3011 if (!session) {
3012 return;
3013 }
3014
3015 got_mds_push(session);
3016
3017 ceph_seq_t seq = m->get_seq();
3018
3019 Inode *in;
3020 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3021 if (inode_map.count(vino) == 0) {
3022 ldout(cct, 10) << " don't have vino " << vino << dendl;
3023 goto revoke;
3024 }
3025 in = inode_map[vino];
3026
3027 if (m->get_mask() & CEPH_LEASE_VALID) {
3028 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3029 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3030 goto revoke;
3031 }
3032 Dentry *dn = in->dir->dentries[m->dname];
3033 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3034 dn->lease_mds = -1;
3035 }
3036
3037 revoke:
3038 {
3039 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3040 m->get_mask(), m->get_ino(),
3041 m->get_first(), m->get_last(), m->dname);
3042 m->get_connection()->send_message2(std::move(reply));
3043 }
3044 }
3045
3046 void Client::put_inode(Inode *in, int n)
3047 {
3048 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3049 int left = in->_put(n);
3050 if (left == 0) {
3051 // release any caps
3052 remove_all_caps(in);
3053
3054 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3055 bool unclean = objectcacher->release_set(&in->oset);
3056 ceph_assert(!unclean);
3057 inode_map.erase(in->vino());
3058 if (use_faked_inos())
3059 _release_faked_ino(in);
3060
3061 if (in == root) {
3062 root = 0;
3063 root_ancestor = 0;
3064 while (!root_parents.empty())
3065 root_parents.erase(root_parents.begin());
3066 }
3067
3068 delete in;
3069 }
3070 }
3071
3072 void Client::close_dir(Dir *dir)
3073 {
3074 Inode *in = dir->parent_inode;
3075 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3076 ceph_assert(dir->is_empty());
3077 ceph_assert(in->dir == dir);
3078 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3079 if (!in->dentries.empty())
3080 in->get_first_parent()->put(); // unpin dentry
3081
3082 delete in->dir;
3083 in->dir = 0;
3084 put_inode(in); // unpin inode
3085 }
3086
3087 /**
3088 * Don't call this with in==NULL, use get_or_create for that
3089 * leave dn set to default NULL unless you're trying to add
3090 * a new inode to a pre-created Dentry
3091 */
3092 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3093 {
3094 if (!dn) {
3095 // create a new Dentry
3096 dn = new Dentry(dir, name);
3097
3098 lru.lru_insert_mid(dn); // mid or top?
3099
3100 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3101 << " dn " << dn << " (new dn)" << dendl;
3102 } else {
3103 ceph_assert(!dn->inode);
3104 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3105 << " dn " << dn << " (old dn)" << dendl;
3106 }
3107
3108 if (in) { // link to inode
3109 InodeRef tmp_ref;
3110 // only one parent for directories!
3111 if (in->is_dir() && !in->dentries.empty()) {
3112 tmp_ref = in; // prevent unlink below from freeing the inode.
3113 Dentry *olddn = in->get_first_parent();
3114 ceph_assert(olddn->dir != dir || olddn->name != name);
3115 Inode *old_diri = olddn->dir->parent_inode;
3116 old_diri->dir_release_count++;
3117 clear_dir_complete_and_ordered(old_diri, true);
3118 unlink(olddn, true, true); // keep dir, dentry
3119 }
3120
3121 dn->link(in);
3122 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3123 }
3124
3125 return dn;
3126 }
3127
3128 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3129 {
3130 InodeRef in(dn->inode);
3131 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3132 << " inode " << dn->inode << dendl;
3133
3134 // unlink from inode
3135 if (dn->inode) {
3136 dn->unlink();
3137 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3138 }
3139
3140 if (keepdentry) {
3141 dn->lease_mds = -1;
3142 } else {
3143 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3144
3145 // unlink from dir
3146 Dir *dir = dn->dir;
3147 dn->detach();
3148
3149 // delete the dentry
3150 lru.lru_remove(dn);
3151 dn->put();
3152
3153 if (dir->is_empty() && !keepdir)
3154 close_dir(dir);
3155 }
3156 }
3157
3158 /**
3159 * For asynchronous flushes, check for errors from the IO and
3160 * update the inode if necessary
3161 */
3162 class C_Client_FlushComplete : public Context {
3163 private:
3164 Client *client;
3165 InodeRef inode;
3166 public:
3167 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3168 void finish(int r) override {
3169 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3170 if (r != 0) {
3171 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3172 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3173 << " 0x" << std::hex << inode->ino << std::dec
3174 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3175 inode->set_async_err(r);
3176 }
3177 }
3178 };
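// Used as the completion for asynchronous flushes, e.g.
// _flush(in, new C_Client_FlushComplete(this, in)) in get_caps() below.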
3179
3180
3181 /****
3182 * caps
3183 */
3184
3185 void Client::get_cap_ref(Inode *in, int cap)
3186 {
3187 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3188 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3189 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3190 in->get();
3191 }
3192 if ((cap & CEPH_CAP_FILE_CACHE) &&
3193 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3194 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3195 in->get();
3196 }
3197 in->get_cap_ref(cap);
3198 }
3199
3200 void Client::put_cap_ref(Inode *in, int cap)
3201 {
3202 int last = in->put_cap_ref(cap);
3203 if (last) {
3204 int put_nref = 0;
3205 int drop = last & ~in->caps_issued();
3206 if (in->snapid == CEPH_NOSNAP) {
3207 if ((last & CEPH_CAP_FILE_WR) &&
3208 !in->cap_snaps.empty() &&
3209 in->cap_snaps.rbegin()->second.writing) {
3210 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3211 in->cap_snaps.rbegin()->second.writing = 0;
3212 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3213 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3214 }
3215 if (last & CEPH_CAP_FILE_BUFFER) {
3216 for (auto &p : in->cap_snaps)
3217 p.second.dirty_data = 0;
3218 signal_cond_list(in->waitfor_commit);
3219 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3220 ++put_nref;
3221 }
3222 }
3223 if (last & CEPH_CAP_FILE_CACHE) {
3224 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3225 ++put_nref;
3226 }
3227 if (drop)
3228 check_caps(in, 0);
3229 if (put_nref)
3230 put_inode(in, put_nref);
3231 }
3232 }
3233
3234 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3235 {
3236 int r = check_pool_perm(in, need);
3237 if (r < 0)
3238 return r;
3239
3240 while (1) {
3241 int file_wanted = in->caps_file_wanted();
3242 if ((file_wanted & need) != need) {
3243 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3244 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3245 << dendl;
3246 return -EBADF;
3247 }
3248
3249 int implemented;
3250 int have = in->caps_issued(&implemented);
3251
3252 bool waitfor_caps = false;
3253 bool waitfor_commit = false;
3254
3255 if (have & need & CEPH_CAP_FILE_WR) {
3256 if (endoff > 0) {
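// grow wanted_max_size when this write would pass the current limit
// or more than double the file, then ask the MDS for it below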
3257 if ((endoff >= (loff_t)in->max_size ||
3258 endoff > (loff_t)(in->size << 1)) &&
3259 endoff > (loff_t)in->wanted_max_size) {
3260 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3261 in->wanted_max_size = endoff;
3262 }
3263 if (in->wanted_max_size > in->max_size &&
3264 in->wanted_max_size > in->requested_max_size)
3265 check_caps(in, 0);
3266 }
3267
3268 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3269 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3270 waitfor_caps = true;
3271 }
3272 if (!in->cap_snaps.empty()) {
3273 if (in->cap_snaps.rbegin()->second.writing) {
3274 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3275 waitfor_caps = true;
3276 }
3277 for (auto &p : in->cap_snaps) {
3278 if (p.second.dirty_data) {
3279 waitfor_commit = true;
3280 break;
3281 }
3282 }
3283 if (waitfor_commit) {
3284 _flush(in, new C_Client_FlushComplete(this, in));
3285 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3286 }
3287 }
3288 }
3289
3290 if (!waitfor_caps && !waitfor_commit) {
3291 if ((have & need) == need) {
3292 int revoking = implemented & ~have;
3293 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3294 << " need " << ccap_string(need) << " want " << ccap_string(want)
3295 << " revoking " << ccap_string(revoking)
3296 << dendl;
3297 if ((revoking & want) == 0) {
3298 *phave = need | (have & want);
3299 in->get_cap_ref(need);
3300 return 0;
3301 }
3302 }
3303 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3304 waitfor_caps = true;
3305 }
3306
3307 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3308 in->auth_cap->session->readonly)
3309 return -EROFS;
3310
3311 if (in->flags & I_CAP_DROPPED) {
3312 int mds_wanted = in->caps_mds_wanted();
3313 if ((mds_wanted & need) != need) {
3314 int ret = _renew_caps(in);
3315 if (ret < 0)
3316 return ret;
3317 continue;
3318 }
3319 if (!(file_wanted & ~mds_wanted))
3320 in->flags &= ~I_CAP_DROPPED;
3321 }
3322
3323 if (waitfor_caps)
3324 wait_on_list(in->waitfor_caps);
3325 else if (waitfor_commit)
3326 wait_on_list(in->waitfor_commit);
3327 }
3328 }
3329
3330 int Client::get_caps_used(Inode *in)
3331 {
3332 unsigned used = in->caps_used();
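// data still held by the ObjectCacher counts as a FILE_CACHE user even
// when no open file handle references it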
3333 if (!(used & CEPH_CAP_FILE_CACHE) &&
3334 !objectcacher->set_is_empty(&in->oset))
3335 used |= CEPH_CAP_FILE_CACHE;
3336 return used;
3337 }
3338
3339 void Client::cap_delay_requeue(Inode *in)
3340 {
3341 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3342 in->hold_caps_until = ceph_clock_now();
3343 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3344 delayed_list.push_back(&in->delay_cap_item);
3345 }
3346
3347 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3348 int flags, int used, int want, int retain,
3349 int flush, ceph_tid_t flush_tid)
3350 {
3351 int held = cap->issued | cap->implemented;
3352 int revoking = cap->implemented & ~cap->issued;
3353 retain &= ~revoking;
3354 int dropping = cap->issued & ~retain;
3355 int op = CEPH_CAP_OP_UPDATE;
3356
3357 ldout(cct, 10) << __func__ << " " << *in
3358 << " mds." << session->mds_num << " seq " << cap->seq
3359 << " used " << ccap_string(used)
3360 << " want " << ccap_string(want)
3361 << " flush " << ccap_string(flush)
3362 << " retain " << ccap_string(retain)
3363 << " held "<< ccap_string(held)
3364 << " revoking " << ccap_string(revoking)
3365 << " dropping " << ccap_string(dropping)
3366 << dendl;
3367
3368 if (cct->_conf->client_inject_release_failure && revoking) {
3369 const int would_have_issued = cap->issued & retain;
3370 const int would_have_implemented = cap->implemented & (cap->issued | used);
3371 // Simulated bug:
3372 // - tell the server we think issued is whatever they issued plus whatever we implemented
3373 // - leave what we have implemented in place
3374 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3375 cap->issued = cap->issued | cap->implemented;
3376
3377 // Make an exception for revoking xattr caps: we are injecting
3378 // failure to release other caps, but allow xattr because client
3379 // will block on xattr ops if it can't release these to MDS (#9800)
3380 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3381 cap->issued ^= xattr_mask & revoking;
3382 cap->implemented ^= xattr_mask & revoking;
3383
3384 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3385 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3386 } else {
3387 // Normal behaviour
3388 cap->issued &= retain;
3389 cap->implemented &= cap->issued | used;
3390 }
3391
3392 snapid_t follows = 0;
3393
3394 if (flush)
3395 follows = in->snaprealm->get_snap_context().seq;
3396
3397 auto m = make_message<MClientCaps>(op,
3398 in->ino,
3399 0,
3400 cap->cap_id, cap->seq,
3401 cap->implemented,
3402 want,
3403 flush,
3404 cap->mseq,
3405 cap_epoch_barrier);
3406 m->caller_uid = in->cap_dirtier_uid;
3407 m->caller_gid = in->cap_dirtier_gid;
3408
3409 m->head.issue_seq = cap->issue_seq;
3410 m->set_tid(flush_tid);
3411
3412 m->head.uid = in->uid;
3413 m->head.gid = in->gid;
3414 m->head.mode = in->mode;
3415
3416 m->head.nlink = in->nlink;
3417
3418 if (flush & CEPH_CAP_XATTR_EXCL) {
3419 encode(in->xattrs, m->xattrbl);
3420 m->head.xattr_version = in->xattr_version;
3421 }
3422
3423 m->size = in->size;
3424 m->max_size = in->max_size;
3425 m->truncate_seq = in->truncate_seq;
3426 m->truncate_size = in->truncate_size;
3427 m->mtime = in->mtime;
3428 m->atime = in->atime;
3429 m->ctime = in->ctime;
3430 m->btime = in->btime;
3431 m->time_warp_seq = in->time_warp_seq;
3432 m->change_attr = in->change_attr;
3433
3434 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3435 !in->cap_snaps.empty() &&
3436 in->cap_snaps.rbegin()->second.flush_tid == 0)
3437 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3438 m->flags = flags;
3439
3440 if (flush & CEPH_CAP_FILE_WR) {
3441 m->inline_version = in->inline_version;
3442 m->inline_data = in->inline_data;
3443 }
3444
3445 in->reported_size = in->size;
3446 m->set_snap_follows(follows);
3447 cap->wanted = want;
3448 if (cap == in->auth_cap) {
3449 if (want & CEPH_CAP_ANY_FILE_WR) {
3450 m->set_max_size(in->wanted_max_size);
3451 in->requested_max_size = in->wanted_max_size;
3452 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3453 } else {
3454 in->requested_max_size = 0;
3455 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3456 }
3457 }
3458
3459 if (!session->flushing_caps_tids.empty())
3460 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3461
3462 session->con->send_message2(std::move(m));
3463 }
3464
3465 static bool is_max_size_approaching(Inode *in)
3466 {
3467 /* mds will adjust max size according to the reported size */
3468 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3469 return false;
3470 if (in->size >= in->max_size)
3471 return true;
3472 /* half of previous max_size increment has been used */
3473 if (in->max_size > in->reported_size &&
3474 (in->size << 1) >= in->max_size + in->reported_size)
3475 return true;
3476 return false;
3477 }
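// Worked example: with max_size = 8MB and reported_size = 4MB the last
// increment was 4MB, so (size << 1) >= 12MB -- i.e. size >= 6MB --
// flags the limit as approaching once half of that increment is used.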
3478
3479 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3480 {
3481 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3482 return used;
3483 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3484 return used;
3485
3486 if (issued & CEPH_CAP_FILE_LAZYIO) {
3487 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3488 used &= ~CEPH_CAP_FILE_CACHE;
3489 used |= CEPH_CAP_FILE_LAZYIO;
3490 }
3491 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3492 used &= ~CEPH_CAP_FILE_BUFFER;
3493 used |= CEPH_CAP_FILE_LAZYIO;
3494 }
3495 } else {
3496 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3497 used &= ~CEPH_CAP_FILE_CACHE;
3498 used |= CEPH_CAP_FILE_LAZYIO;
3499 }
3500 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3501 used &= ~CEPH_CAP_FILE_BUFFER;
3502 used |= CEPH_CAP_FILE_LAZYIO;
3503 }
3504 }
3505 return used;
3506 }
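// e.g. if Fc is in use but only LAZYIO is issued, the used mask reports
// LAZYIO in place of Fc, letting a pending Fc revocation complete in
// check_caps() below.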
3507
3508 /**
3509 * check_caps
3510 *
3511 * Examine currently used and wanted versus held caps. Release, flush or ack
3512 * revoked caps to the MDS as appropriate.
3513 *
3514 * @param in the inode to check
3515 * @param flags flags to apply to cap check
3516 */
3517 void Client::check_caps(Inode *in, unsigned flags)
3518 {
3519 unsigned wanted = in->caps_wanted();
3520 unsigned used = get_caps_used(in);
3521 unsigned cap_used;
3522
3523 int implemented;
3524 int issued = in->caps_issued(&implemented);
3525 int revoking = implemented & ~issued;
3526
3527 int orig_used = used;
3528 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3529
3530 int retain = wanted | used | CEPH_CAP_PIN;
3531 if (!unmounting && in->nlink > 0) {
3532 if (wanted) {
3533 retain |= CEPH_CAP_ANY;
3534 } else if (in->is_dir() &&
3535 (issued & CEPH_CAP_FILE_SHARED) &&
3536 (in->flags & I_COMPLETE)) {
3537 // we do this here because we don't want to drop to Fs (and then
3538 // drop the Fs if we do a create!) if that alone makes us send lookups
3539 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3540 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3541 retain |= wanted;
3542 } else {
3543 retain |= CEPH_CAP_ANY_SHARED;
3544 // keep RD only if we didn't have the file open RW,
3545 // because then the mds would revoke it anyway to
3546 // journal max_size=0.
3547 if (in->max_size == 0)
3548 retain |= CEPH_CAP_ANY_RD;
3549 }
3550 }
3551
3552 ldout(cct, 10) << __func__ << " on " << *in
3553 << " wanted " << ccap_string(wanted)
3554 << " used " << ccap_string(used)
3555 << " issued " << ccap_string(issued)
3556 << " revoking " << ccap_string(revoking)
3557 << " flags=" << flags
3558 << dendl;
3559
3560 if (in->snapid != CEPH_NOSNAP)
3561 return; //snap caps last forever, can't write
3562
3563 if (in->caps.empty())
3564 return; // guard if at end of func
3565
3566 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3567 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3568 if (_release(in))
3569 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3570 }
3571
3572
3573 for (auto &p : in->caps) {
3574 mds_rank_t mds = p.first;
3575 Cap &cap = p.second;
3576
3577 MetaSession *session = &mds_sessions.at(mds);
3578
3579 cap_used = used;
3580 if (in->auth_cap && &cap != in->auth_cap)
3581 cap_used &= ~in->auth_cap->issued;
3582
3583 revoking = cap.implemented & ~cap.issued;
3584
3585 ldout(cct, 10) << " cap mds." << mds
3586 << " issued " << ccap_string(cap.issued)
3587 << " implemented " << ccap_string(cap.implemented)
3588 << " revoking " << ccap_string(revoking) << dendl;
3589
3590 if (in->wanted_max_size > in->max_size &&
3591 in->wanted_max_size > in->requested_max_size &&
3592 &cap == in->auth_cap)
3593 goto ack;
3594
3595 /* approaching file_max? */
3596 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3597 &cap == in->auth_cap &&
3598 is_max_size_approaching(in)) {
3599 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3600 << ", reported " << in->reported_size << dendl;
3601 goto ack;
3602 }
3603
3604 /* completed revocation? */
3605 if (revoking && (revoking & cap_used) == 0) {
3606 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3607 goto ack;
3608 }
3609
3610 /* want more caps from mds? */
3611 if (wanted & ~(cap.wanted | cap.issued))
3612 goto ack;
3613
3614 if (!revoking && unmounting && (cap_used == 0))
3615 goto ack;
3616
3617 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3618 !in->dirty_caps) // and we have no dirty caps
3619 continue;
3620
3621 if (!(flags & CHECK_CAPS_NODELAY)) {
3622 ldout(cct, 10) << "delaying cap release" << dendl;
3623 cap_delay_requeue(in);
3624 continue;
3625 }
3626
3627 ack:
3628 if (&cap == in->auth_cap) {
3629 if (in->flags & I_KICK_FLUSH) {
3630 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3631 << " to mds." << mds << dendl;
3632 kick_flushing_caps(in, session);
3633 }
3634 if (!in->cap_snaps.empty() &&
3635 in->cap_snaps.rbegin()->second.flush_tid == 0)
3636 flush_snaps(in);
3637 }
3638
3639 int flushing;
3640 int msg_flags = 0;
3641 ceph_tid_t flush_tid;
3642 if (in->auth_cap == &cap && in->dirty_caps) {
3643 flushing = mark_caps_flushing(in, &flush_tid);
3644 if (flags & CHECK_CAPS_SYNCHRONOUS)
3645 msg_flags |= MClientCaps::FLAG_SYNC;
3646 } else {
3647 flushing = 0;
3648 flush_tid = 0;
3649 }
3650
3651 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3652 flushing, flush_tid);
3653 }
3654 }
3655
3656
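// A CapSnap records an inode's dirty state (metadata and, if buffered, data)
// at the moment a snapshot cuts across it, so that state can be flushed to
// the MDS independently of subsequent changes to the live inode.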
3657 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3658 {
3659 int used = get_caps_used(in);
3660 int dirty = in->caps_dirty();
3661 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3662
3663 if (in->cap_snaps.size() &&
3664 in->cap_snaps.rbegin()->second.writing) {
3665 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3666 return;
3667 } else if (in->caps_dirty() ||
3668 (used & CEPH_CAP_FILE_WR) ||
3669 (dirty & CEPH_CAP_ANY_WR)) {
3670 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3671 ceph_assert(capsnapem.second); /* element inserted */
3672 CapSnap &capsnap = capsnapem.first->second;
3673 capsnap.context = old_snapc;
3674 capsnap.issued = in->caps_issued();
3675 capsnap.dirty = in->caps_dirty();
3676
3677 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3678
3679 capsnap.uid = in->uid;
3680 capsnap.gid = in->gid;
3681 capsnap.mode = in->mode;
3682 capsnap.btime = in->btime;
3683 capsnap.xattrs = in->xattrs;
3684 capsnap.xattr_version = in->xattr_version;
3685 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3686 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3687
3688 if (used & CEPH_CAP_FILE_WR) {
3689 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3690 capsnap.writing = 1;
3691 } else {
3692 finish_cap_snap(in, capsnap, used);
3693 }
3694 } else {
3695 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3696 }
3697 }
3698
3699 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3700 {
3701 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3702 capsnap.size = in->size;
3703 capsnap.mtime = in->mtime;
3704 capsnap.atime = in->atime;
3705 capsnap.ctime = in->ctime;
3706 capsnap.time_warp_seq = in->time_warp_seq;
3707 capsnap.change_attr = in->change_attr;
3708 capsnap.dirty |= in->caps_dirty();
3709
3710 /* Only reset it if it wasn't set before */
3711 if (capsnap.cap_dirtier_uid == -1) {
3712 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3713 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3714 }
3715
3716 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3717 capsnap.inline_data = in->inline_data;
3718 capsnap.inline_version = in->inline_version;
3719 }
3720
3721 if (used & CEPH_CAP_FILE_BUFFER) {
3722 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3723 << " WRBUFFER, delaying" << dendl;
3724 } else {
3725 capsnap.dirty_data = 0;
3726 flush_snaps(in);
3727 }
3728 }
3729
3730 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3731 {
3732 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3733 in->cap_snaps.at(seq).dirty_data = 0;
3734 flush_snaps(in);
3735 }
3736
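// Build and send one CEPH_CAP_OP_FLUSHSNAP message: it carries the CapSnap's
// frozen metadata (size, times, xattrs, ...) tagged with the capsnap's flush
// tid so the eventual FLUSHSNAP_ACK can retire it.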
3737 void Client::send_flush_snap(Inode *in, MetaSession *session,
3738 snapid_t follows, CapSnap& capsnap)
3739 {
3740 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3741 in->ino, in->snaprealm->ino, 0,
3742 in->auth_cap->mseq, cap_epoch_barrier);
3743 m->caller_uid = capsnap.cap_dirtier_uid;
3744 m->caller_gid = capsnap.cap_dirtier_gid;
3745
3746 m->set_client_tid(capsnap.flush_tid);
3747 m->head.snap_follows = follows;
3748
3749 m->head.caps = capsnap.issued;
3750 m->head.dirty = capsnap.dirty;
3751
3752 m->head.uid = capsnap.uid;
3753 m->head.gid = capsnap.gid;
3754 m->head.mode = capsnap.mode;
3755 m->btime = capsnap.btime;
3756
3757 m->size = capsnap.size;
3758
3759 m->head.xattr_version = capsnap.xattr_version;
3760 encode(capsnap.xattrs, m->xattrbl);
3761
3762 m->ctime = capsnap.ctime;
3764 m->mtime = capsnap.mtime;
3765 m->atime = capsnap.atime;
3766 m->time_warp_seq = capsnap.time_warp_seq;
3767 m->change_attr = capsnap.change_attr;
3768
3769 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3770 m->inline_version = in->inline_version;
3771 m->inline_data = in->inline_data;
3772 }
3773
3774 ceph_assert(!session->flushing_caps_tids.empty());
3775 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3776
3777 session->con->send_message2(std::move(m));
3778 }
3779
3780 void Client::flush_snaps(Inode *in)
3781 {
3782 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3783 ceph_assert(in->cap_snaps.size());
3784
3785 // pick auth mds
3786 ceph_assert(in->auth_cap);
3787 MetaSession *session = in->auth_cap->session;
3788
3789 for (auto &p : in->cap_snaps) {
3790 CapSnap &capsnap = p.second;
3791 // only do new flush
3792 if (capsnap.flush_tid > 0)
3793 continue;
3794
3795 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3796 << " follows " << p.first
3797 << " size " << capsnap.size
3798 << " mtime " << capsnap.mtime
3799 << " dirty_data=" << capsnap.dirty_data
3800 << " writing=" << capsnap.writing
3801 << " on " << *in << dendl;
3802 if (capsnap.dirty_data || capsnap.writing)
3803 break;
3804
3805 capsnap.flush_tid = ++last_flush_tid;
3806 session->flushing_caps_tids.insert(capsnap.flush_tid);
3807 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3808 if (!in->flushing_cap_item.is_on_list())
3809 session->flushing_caps.push_back(&in->flushing_cap_item);
3810
3811 send_flush_snap(in, session, p.first, capsnap);
3812 }
3813 }
3814
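// Condition-variable helpers: callers already hold client_lock, so the
// waiters adopt it into a unique_lock for the wait and then release() it
// (without unlocking) to hand ownership back to the caller.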
3815 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3816 {
3817 ceph::condition_variable cond;
3818 ls.push_back(&cond);
3819 std::unique_lock l{client_lock, std::adopt_lock};
3820 cond.wait(l);
3821 l.release();
3822 ls.remove(&cond);
3823 }
3824
3825 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3826 {
3827 for (auto cond : ls) {
3828 cond->notify_all();
3829 }
3830 }
3831
3832 void Client::wait_on_context_list(list<Context*>& ls)
3833 {
3834 ceph::condition_variable cond;
3835 bool done = false;
3836 int r;
3837 ls.push_back(new C_Cond(cond, &done, &r));
3838 std::unique_lock l{client_lock, std::adopt_lock};
3839 cond.wait(l, [&done] { return done;});
3840 l.release();
3841 }
3842
3843 void Client::signal_context_list(list<Context*>& ls)
3844 {
3845 while (!ls.empty()) {
3846 ls.front()->complete(0);
3847 ls.pop_front();
3848 }
3849 }
3850
3851 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
3852 {
3853 for (const auto &cap : s->caps) {
3854 auto &in = cap->inode;
3855 if (reconnect) {
3856 in.requested_max_size = 0;
3857 in.wanted_max_size = 0;
3858 } else {
3859 if (cap->gen < s->cap_gen) {
3860 // mds did not re-issue stale cap.
3861 cap->issued = cap->implemented = CEPH_CAP_PIN;
3862 // make sure mds knows what we want.
3863 if (in.caps_file_wanted() & ~cap->wanted)
3864 in.flags |= I_CAP_DROPPED;
3865 }
3866 }
3867 signal_cond_list(in.waitfor_caps);
3868 }
3869 }
3870
3871
3872 // flush dirty data (from objectcache)
3873
3874 class C_Client_CacheInvalidate : public Context {
3875 private:
3876 Client *client;
3877 vinodeno_t ino;
3878 int64_t offset, length;
3879 public:
3880 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3881 client(c), offset(off), length(len) {
3882 if (client->use_faked_inos())
3883 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3884 else
3885 ino = in->vino();
3886 }
3887 void finish(int r) override {
3888 // _async_invalidate takes the lock when it needs to; call it back from outside the lock.
3889 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
3890 client->_async_invalidate(ino, offset, length);
3891 }
3892 };
3893
3894 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3895 {
3896 if (unmounting)
3897 return;
3898 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3899 ino_invalidate_cb(callback_handle, ino, off, len);
3900 }
3901
3902 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3903
3904 if (ino_invalidate_cb)
3905 // we queue the invalidate, which calls the callback and decrements the ref
3906 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3907 }
3908
3909 void Client::_invalidate_inode_cache(Inode *in)
3910 {
3911 ldout(cct, 10) << __func__ << " " << *in << dendl;
3912
3913 // invalidate our userspace inode cache
3914 if (cct->_conf->client_oc) {
3915 objectcacher->release_set(&in->oset);
3916 if (!objectcacher->set_is_empty(&in->oset))
3917 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3918 }
3919
3920 _schedule_invalidate_callback(in, 0, 0);
3921 }
3922
3923 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3924 {
3925 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3926
3927 // invalidate our userspace inode cache
3928 if (cct->_conf->client_oc) {
3929 vector<ObjectExtent> ls;
3930 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3931 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3932 }
3933
3934 _schedule_invalidate_callback(in, off, len);
3935 }
3936
3937 bool Client::_release(Inode *in)
3938 {
3939 ldout(cct, 20) << "_release " << *in << dendl;
3940 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3941 _invalidate_inode_cache(in);
3942 return true;
3943 }
3944 return false;
3945 }
3946
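// Flush an inode's dirty buffered data. Returns true if there is nothing
// left to flush (nothing was dirty, or the pool is full and the data was
// purged with ENOSPC) and onfinish has been completed; returns false if a
// flush was started, in which case onfinish runs on completion.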
3947 bool Client::_flush(Inode *in, Context *onfinish)
3948 {
3949 ldout(cct, 10) << "_flush " << *in << dendl;
3950
3951 if (!in->oset.dirty_or_tx) {
3952 ldout(cct, 10) << " nothing to flush" << dendl;
3953 onfinish->complete(0);
3954 return true;
3955 }
3956
3957 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3958 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3959 objectcacher->purge_set(&in->oset);
3960 if (onfinish) {
3961 onfinish->complete(-ENOSPC);
3962 }
3963 return true;
3964 }
3965
3966 return objectcacher->flush_set(&in->oset, onfinish);
3967 }
3968
3969 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3970 {
3971 ceph_assert(ceph_mutex_is_locked(client_lock));
3972 if (!in->oset.dirty_or_tx) {
3973 ldout(cct, 10) << " nothing to flush" << dendl;
3974 return;
3975 }
3976
3977 C_SaferCond onflush("Client::_flush_range flock");
3978 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3979 offset, size, &onflush);
3980 if (!ret) {
3981 // wait for flush
3982 client_lock.unlock();
3983 onflush.wait();
3984 client_lock.lock();
3985 }
3986 }
3987
3988 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3989 {
3990 // std::lock_guard l(client_lock);
3991 ceph_assert(ceph_mutex_is_locked(client_lock)); // will be called via dispatch() -> objecter -> ...
3992 Inode *in = static_cast<Inode *>(oset->parent);
3993 ceph_assert(in);
3994 _flushed(in);
3995 }
3996
3997 void Client::_flushed(Inode *in)
3998 {
3999 ldout(cct, 10) << "_flushed " << *in << dendl;
4000
4001 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4002 }
4003
4004
4005
4006 // checks common to add_update_cap, handle_cap_grant
4007 void Client::check_cap_issue(Inode *in, unsigned issued)
4008 {
4009 unsigned had = in->caps_issued();
4010
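// Newly gaining Fc (FILE_CACHE) bumps cache_gen, invalidating cached data;
// newly gaining Fs (FILE_SHARED) bumps shared_gen and, for directories,
// clears the cached dentry complete/ordered flags.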
4011 if ((issued & CEPH_CAP_FILE_CACHE) &&
4012 !(had & CEPH_CAP_FILE_CACHE))
4013 in->cache_gen++;
4014
4015 if ((issued & CEPH_CAP_FILE_SHARED) &&
4016 !(had & CEPH_CAP_FILE_SHARED)) {
4017 in->shared_gen++;
4018
4019 if (in->is_dir())
4020 clear_dir_complete_and_ordered(in, true);
4021 }
4022 }
4023
4024 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4025 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4026 inodeno_t realm, int flags, const UserPerm& cap_perms)
4027 {
4028 if (!in->is_any_caps()) {
4029 ceph_assert(in->snaprealm == 0);
4030 in->snaprealm = get_snap_realm(realm);
4031 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4032 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4033 } else {
4034 ceph_assert(in->snaprealm);
4035 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4036 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4037 in->snaprealm_item.remove_myself();
4038 auto oldrealm = in->snaprealm;
4039 in->snaprealm = get_snap_realm(realm);
4040 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4041 put_snap_realm(oldrealm);
4042 }
4043 }
4044
4045 mds_rank_t mds = mds_session->mds_num;
4046 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4047 Cap &cap = capem.first->second;
4048 if (!capem.second) {
4049 if (cap.gen < mds_session->cap_gen)
4050 cap.issued = cap.implemented = CEPH_CAP_PIN;
4051
4052 /*
4053 * auth mds of the inode changed. we received the cap export
4054 * message, but still haven't received the cap import message.
4055 * handle_cap_export() updated the new auth MDS' cap.
4056 *
4057 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4058 * a message that was send before the cap import message. So
4059 * don't remove caps.
4060 */
4061 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4062 if (&cap != in->auth_cap)
4063 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4064
4065 ceph_assert(cap.cap_id == cap_id);
4066 seq = cap.seq;
4067 mseq = cap.mseq;
4068 issued |= cap.issued;
4069 flags |= CEPH_CAP_FLAG_AUTH;
4070 }
4071 }
4072
4073 check_cap_issue(in, issued);
4074
4075 if (flags & CEPH_CAP_FLAG_AUTH) {
4076 if (in->auth_cap != &cap &&
4077 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4078 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4079 ldout(cct, 10) << __func__ << " changing auth cap: "
4080 << "add myself to new auth MDS' flushing caps list" << dendl;
4081 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4082 }
4083 in->auth_cap = &cap;
4084 }
4085 }
4086
4087 unsigned old_caps = cap.issued;
4088 cap.cap_id = cap_id;
4089 cap.issued = issued;
4090 cap.implemented |= issued;
4091 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4092 cap.wanted = wanted;
4093 else
4094 cap.wanted |= wanted;
4095 cap.seq = seq;
4096 cap.issue_seq = seq;
4097 cap.mseq = mseq;
4098 cap.gen = mds_session->cap_gen;
4099 cap.latest_perms = cap_perms;
4100 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4101 << " from mds." << mds
4102 << " on " << *in
4103 << dendl;
4104
4105 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4106 // is a non-auth MDS revoking the newly granted caps?
4107 for (auto &p : in->caps) {
4108 if (&p.second == &cap)
4109 continue;
4110 if (p.second.implemented & ~p.second.issued & issued) {
4111 check_caps(in, CHECK_CAPS_NODELAY);
4112 break;
4113 }
4114 }
4115 }
4116
4117 if (issued & ~old_caps)
4118 signal_cond_list(in->waitfor_caps);
4119 }
4120
4121 void Client::remove_cap(Cap *cap, bool queue_release)
4122 {
4123 auto &in = cap->inode;
4124 MetaSession *session = cap->session;
4125 mds_rank_t mds = cap->session->mds_num;
4126
4127 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4128
4129 if (queue_release) {
4130 session->enqueue_cap_release(
4131 in.ino,
4132 cap->cap_id,
4133 cap->issue_seq,
4134 cap->mseq,
4135 cap_epoch_barrier);
4136 }
4137
4138 if (in.auth_cap == cap) {
4139 if (in.flushing_cap_item.is_on_list()) {
4140 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4141 in.flushing_cap_item.remove_myself();
4142 }
4143 in.auth_cap = NULL;
4144 }
4145 size_t n = in.caps.erase(mds);
4146 ceph_assert(n == 1);
4147 cap = nullptr;
4148
4149 if (!in.is_any_caps()) {
4150 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4151 in.snaprealm_item.remove_myself();
4152 put_snap_realm(in.snaprealm);
4153 in.snaprealm = 0;
4154 }
4155 }
4156
4157 void Client::remove_all_caps(Inode *in)
4158 {
4159 while (!in->caps.empty())
4160 remove_cap(&in->caps.begin()->second, true);
4161 }
4162
4163 void Client::remove_session_caps(MetaSession *s)
4164 {
4165 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4166
4167 while (s->caps.size()) {
4168 Cap *cap = *s->caps.begin();
4169 InodeRef in(&cap->inode);
4170 bool dirty_caps = false;
4171 if (in->auth_cap == cap) {
4172 dirty_caps = in->dirty_caps | in->flushing_caps;
4173 in->wanted_max_size = 0;
4174 in->requested_max_size = 0;
4175 }
4176 if (cap->wanted | cap->issued)
4177 in->flags |= I_CAP_DROPPED;
4178 remove_cap(cap, false);
4179 in->cap_snaps.clear();
4180 if (dirty_caps) {
4181 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4182 if (in->flushing_caps) {
4183 num_flushing_caps--;
4184 in->flushing_cap_tids.clear();
4185 }
4186 in->flushing_caps = 0;
4187 in->mark_caps_clean();
4188 put_inode(in.get());
4189 }
4190 signal_cond_list(in->waitfor_caps);
4191 }
4192 s->flushing_caps_tids.clear();
4193 sync_cond.notify_all();
4194 }
4195
4196 int Client::_do_remount(bool retry_on_error)
4197 {
4198 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4199
4200 errno = 0;
4201 int r = remount_cb(callback_handle);
4202 if (r == 0) {
4203 retries_on_invalidate = 0;
4204 } else {
4205 int e = errno;
4206 client_t whoami = get_nodeid();
4207 if (r == -1) {
4208 lderr(cct) <<
4209 "failed to remount (to trim kernel dentries): "
4210 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4211 } else {
4212 lderr(cct) <<
4213 "failed to remount (to trim kernel dentries): "
4214 "return code = " << r << dendl;
4215 }
4216 bool should_abort =
4217 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4218 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4219 !(retry_on_error && (++retries_on_invalidate < max_retries));
4220 if (should_abort && !unmounting) {
4221 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4222 ceph_abort();
4223 }
4224 }
4225 return r;
4226 }
4227
4228 class C_Client_Remount : public Context {
4229 private:
4230 Client *client;
4231 public:
4232 explicit C_Client_Remount(Client *c) : client(c) {}
4233 void finish(int r) override {
4234 ceph_assert(r == 0);
4235 client->_do_remount(true);
4236 }
4237 };
4238
4239 void Client::_invalidate_kernel_dcache()
4240 {
4241 if (unmounting)
4242 return;
4243 if (can_invalidate_dentries) {
4244 if (dentry_invalidate_cb && root->dir) {
4245 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4246 p != root->dir->dentries.end();
4247 ++p) {
4248 if (p->second->inode)
4249 _schedule_invalidate_dentry_callback(p->second, false);
4250 }
4251 }
4252 } else if (remount_cb) {
4253 // Hacky: when remounting a file system, the Linux kernel trims all
4254 // unused dentries in the fs
4255 remount_finisher.queue(new C_Client_Remount(this));
4256 }
4257 }
4258
4259 void Client::_trim_negative_child_dentries(InodeRef& in)
4260 {
4261 if (!in->is_dir())
4262 return;
4263
4264 Dir* dir = in->dir;
4265 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4266 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4267 Dentry *dn = p->second;
4268 ++p;
4269 ceph_assert(!dn->inode);
4270 if (dn->lru_is_expireable())
4271 unlink(dn, true, false); // keep dir, drop dentry
4272 }
4273 if (dir->dentries.empty()) {
4274 close_dir(dir);
4275 }
4276 }
4277
4278 if (in->flags & I_SNAPDIR_OPEN) {
4279 InodeRef snapdir = open_snapdir(in.get());
4280 _trim_negative_child_dentries(snapdir);
4281 }
4282 }
4283
4284 class C_Client_CacheRelease : public Context {
4285 private:
4286 Client *client;
4287 vinodeno_t ino;
4288 public:
4289 C_Client_CacheRelease(Client *c, Inode *in) :
4290 client(c) {
4291 if (client->use_faked_inos())
4292 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4293 else
4294 ino = in->vino();
4295 }
4296 void finish(int r) override {
4297 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4298 client->_async_inode_release(ino);
4299 }
4300 };
4301
4302 void Client::_async_inode_release(vinodeno_t ino)
4303 {
4304 if (unmounting)
4305 return;
4306 ldout(cct, 10) << __func__ << " " << ino << dendl;
4307 ino_release_cb(callback_handle, ino);
4308 }
4309
4310 void Client::_schedule_ino_release_callback(Inode *in) {
4311
4312 if (ino_release_cb)
4313 // we queue the release, which calls the callback and decrements the ref
4314 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4315 }
4316
4317 void Client::trim_caps(MetaSession *s, uint64_t max)
4318 {
4319 mds_rank_t mds = s->mds_num;
4320 size_t caps_size = s->caps.size();
4321 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4322 << " caps " << caps_size << dendl;
4323
4324 uint64_t trimmed = 0;
4325 auto p = s->caps.begin();
4326 std::set<Dentry *> to_trim; /* defer dentry trimming so that caps other
4327 * than the one we're looking at don't get deleted during traversal. */
4328 while ((caps_size - trimmed) > max && !p.end()) {
4329 Cap *cap = *p;
4330 InodeRef in(&cap->inode);
4331
4332 // Increment p early because it will be invalidated if cap
4333 // is deleted inside remove_cap
4334 ++p;
4335
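// A non-auth cap is disposable when nothing we are actively using depends on
// bits only this cap provides (i.e. everything used is covered by the auth
// cap's issued set). For the auth (or only) cap we instead try to shrink the
// cache by trimming the inode's dentries.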
4336 if (in->caps.size() > 1 && cap != in->auth_cap) {
4337 int mine = cap->issued | cap->implemented;
4338 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4339 // disposable non-auth cap
4340 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4341 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4342 remove_cap(cap, true); cap = nullptr;
4343 trimmed++;
4344 }
4345 } else {
4346 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4347 _trim_negative_child_dentries(in);
4348 bool all = true;
4349 auto q = in->dentries.begin();
4350 while (q != in->dentries.end()) {
4351 Dentry *dn = *q;
4352 ++q;
4353 if (dn->lru_is_expireable()) {
4354 if (can_invalidate_dentries &&
4355 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4356 // Only issue one of these per DN for inodes in root; others are
4357 // handled more efficiently by the root-child DN invalidation at
4358 // the end of this function.
4359 _schedule_invalidate_dentry_callback(dn, true);
4360 }
4361 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4362 to_trim.insert(dn);
4363 } else {
4364 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4365 all = false;
4366 }
4367 }
4368 if (all && in->ino != MDS_INO_ROOT) {
4369 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4370 trimmed++;
4371 _schedule_ino_release_callback(in.get());
4372 }
4373 }
4374 }
4375 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4376 for (const auto &dn : to_trim) {
4377 trim_dentry(dn);
4378 }
4379 to_trim.clear();
4380
4381 caps_size = s->caps.size();
4382 if (caps_size > (size_t)max)
4383 _invalidate_kernel_dcache();
4384 }
4385
4386 void Client::force_session_readonly(MetaSession *s)
4387 {
4388 s->readonly = true;
4389 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4390 auto &in = (*p)->inode;
4391 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4392 signal_cond_list(in.waitfor_caps);
4393 }
4394 }
4395
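// Transition an inode's dirty caps to the "flushing" state and assign a new
// flush tid, so the FLUSH_ACK from the MDS can later be matched back to this
// particular flush attempt.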
4396 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4397 {
4398 MetaSession *session = in->auth_cap->session;
4399
4400 int flushing = in->dirty_caps;
4401 ceph_assert(flushing);
4402
4403 ceph_tid_t flush_tid = ++last_flush_tid;
4404 in->flushing_cap_tids[flush_tid] = flushing;
4405
4406 if (!in->flushing_caps) {
4407 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4408 num_flushing_caps++;
4409 } else {
4410 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4411 }
4412
4413 in->flushing_caps |= flushing;
4414 in->mark_caps_clean();
4415
4416 if (!in->flushing_cap_item.is_on_list())
4417 session->flushing_caps.push_back(&in->flushing_cap_item);
4418 session->flushing_caps_tids.insert(flush_tid);
4419
4420 *ptid = flush_tid;
4421 return flushing;
4422 }
4423
4424 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4425 {
4426 for (auto &p : in->cap_snaps) {
4427 CapSnap &capsnap = p.second;
4428 if (capsnap.flush_tid > 0) {
4429 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4430 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4431 }
4432 }
4433 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4434 it != in->flushing_cap_tids.end();
4435 ++it) {
4436 old_s->flushing_caps_tids.erase(it->first);
4437 new_s->flushing_caps_tids.insert(it->first);
4438 }
4439 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4440 }
4441
4442 /*
4443 * Flush all caps back to the MDS. Because the callers generally wait on the
4444 * result of this function (syncfs and umount cases), we set
4445 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4446 */
4447 void Client::flush_caps_sync()
4448 {
4449 ldout(cct, 10) << __func__ << dendl;
4450 xlist<Inode*>::iterator p = delayed_list.begin();
4451 while (!p.end()) {
4452 unsigned flags = CHECK_CAPS_NODELAY;
4453 Inode *in = *p;
4454
4455 ++p;
4456 delayed_list.pop_front();
4457 if (p.end() && dirty_list.empty())
4458 flags |= CHECK_CAPS_SYNCHRONOUS;
4459 check_caps(in, flags);
4460 }
4461
4462 // other caps, too
4463 p = dirty_list.begin();
4464 while (!p.end()) {
4465 unsigned flags = CHECK_CAPS_NODELAY;
4466 Inode *in = *p;
4467
4468 ++p;
4469 if (p.end())
4470 flags |= CHECK_CAPS_SYNCHRONOUS;
4471 check_caps(in, flags);
4472 }
4473 }
4474
4475 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4476 {
4477 while (in->flushing_caps) {
4478 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4479 ceph_assert(it != in->flushing_cap_tids.end());
4480 if (it->first > want)
4481 break;
4482 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4483 << ccap_string(it->second) << " want " << want
4484 << " last " << it->first << dendl;
4485 wait_on_list(in->waitfor_caps);
4486 }
4487 }
4488
4489 void Client::wait_sync_caps(ceph_tid_t want)
4490 {
4491 retry:
4492 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4493 << num_flushing_caps << " total flushing)" << dendl;
4494 for (auto &p : mds_sessions) {
4495 MetaSession *s = &p.second;
4496 if (s->flushing_caps_tids.empty())
4497 continue;
4498 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4499 if (oldest_tid <= want) {
4500 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4501 << " (want " << want << ")" << dendl;
4502 std::unique_lock l{client_lock, std::adopt_lock};
4503 sync_cond.wait(l);
4504 l.release();
4505 goto retry;
4506 }
4507 }
4508 }
4509
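// Re-send the cap flushes (and any interleaved cap-snap flushes) that were
// in flight on this inode, in their original tid order, so the MDS applies
// them in sequence after a session interruption.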
4510 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4511 {
4512 in->flags &= ~I_KICK_FLUSH;
4513
4514 Cap *cap = in->auth_cap;
4515 ceph_assert(cap->session == session);
4516
4517 ceph_tid_t last_snap_flush = 0;
4518 for (auto p = in->flushing_cap_tids.rbegin();
4519 p != in->flushing_cap_tids.rend();
4520 ++p) {
4521 if (!p->second) {
4522 last_snap_flush = p->first;
4523 break;
4524 }
4525 }
4526
4527 int wanted = in->caps_wanted();
4528 int used = get_caps_used(in) | in->caps_dirty();
4529 auto it = in->cap_snaps.begin();
4530 for (auto& p : in->flushing_cap_tids) {
4531 if (p.second) {
4532 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4533 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4534 p.second, p.first);
4535 } else {
4536 ceph_assert(it != in->cap_snaps.end());
4537 ceph_assert(it->second.flush_tid == p.first);
4538 send_flush_snap(in, session, it->first, it->second);
4539 ++it;
4540 }
4541 }
4542 }
4543
4544 void Client::kick_flushing_caps(MetaSession *session)
4545 {
4546 mds_rank_t mds = session->mds_num;
4547 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4548
4549 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4550 Inode *in = *p;
4551 if (in->flags & I_KICK_FLUSH) {
4552 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4553 kick_flushing_caps(in, session);
4554 }
4555 }
4556 }
4557
4558 void Client::early_kick_flushing_caps(MetaSession *session)
4559 {
4560 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4561 Inode *in = *p;
4562 Cap *cap = in->auth_cap;
4563 ceph_assert(cap);
4564
4565 // if flushing caps were revoked, we re-send the cap flush during the client
4566 // reconnect stage. This guarantees that the MDS processes the cap flush
4567 // message before issuing the flushing caps to other clients.
4568 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4569 in->flags |= I_KICK_FLUSH;
4570 continue;
4571 }
4572
4573 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4574 << " to mds." << session->mds_num << dendl;
4575 // send_reconnect() will also reset these sequence numbers. Make sure the
4576 // sequence numbers in the cap flush message match the later reconnect message.
4577 cap->seq = 0;
4578 cap->issue_seq = 0;
4579 cap->mseq = 0;
4580 cap->issued = cap->implemented;
4581
4582 kick_flushing_caps(in, session);
4583 }
4584 }
4585
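// Rebuild this realm's cached SnapContext: the union of prior-parent snaps,
// the current parent's snaps since parent_since, and our own snaps, with seq
// being the highest sequence seen. For example, parent snaps {2,5} plus
// my_snaps {3} yield snaps [5,3,2] (descending) and seq >= 5.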
4586 void SnapRealm::build_snap_context()
4587 {
4588 set<snapid_t> snaps;
4589 snapid_t max_seq = seq;
4590
4591 // start with snaps inherited from prior parents
4592 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4593 snaps.insert(prior_parent_snaps[i]);
4594
4595 // current parent's snaps
4596 if (pparent) {
4597 const SnapContext& psnapc = pparent->get_snap_context();
4598 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4599 if (psnapc.snaps[i] >= parent_since)
4600 snaps.insert(psnapc.snaps[i]);
4601 if (psnapc.seq > max_seq)
4602 max_seq = psnapc.seq;
4603 }
4604
4605 // my snaps
4606 for (unsigned i=0; i<my_snaps.size(); i++)
4607 snaps.insert(my_snaps[i]);
4608
4609 // ok!
4610 cached_snap_context.seq = max_seq;
4611 cached_snap_context.snaps.resize(0);
4612 cached_snap_context.snaps.reserve(snaps.size());
4613 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4614 cached_snap_context.snaps.push_back(*p);
4615 }
4616
4617 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4618 {
4619 list<SnapRealm*> q;
4620 q.push_back(realm);
4621
4622 while (!q.empty()) {
4623 realm = q.front();
4624 q.pop_front();
4625
4626 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4627 realm->invalidate_cache();
4628
4629 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4630 p != realm->pchildren.end();
4631 ++p)
4632 q.push_back(*p);
4633 }
4634 }
4635
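// SnapRealms are reference counted by hand: get_snap_realm() creates or pins
// a realm, get_snap_realm_maybe() only pins an existing one, and
// put_snap_realm() unpins, recursing up the parent chain and freeing the
// realm when its count drops to zero.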
4636 SnapRealm *Client::get_snap_realm(inodeno_t r)
4637 {
4638 SnapRealm *realm = snap_realms[r];
4639 if (!realm)
4640 snap_realms[r] = realm = new SnapRealm(r);
4641 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4642 realm->nref++;
4643 return realm;
4644 }
4645
4646 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4647 {
4648 if (snap_realms.count(r) == 0) {
4649 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4650 return NULL;
4651 }
4652 SnapRealm *realm = snap_realms[r];
4653 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4654 realm->nref++;
4655 return realm;
4656 }
4657
4658 void Client::put_snap_realm(SnapRealm *realm)
4659 {
4660 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4661 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4662 if (--realm->nref == 0) {
4663 snap_realms.erase(realm->ino);
4664 if (realm->pparent) {
4665 realm->pparent->pchildren.erase(realm);
4666 put_snap_realm(realm->pparent);
4667 }
4668 delete realm;
4669 }
4670 }
4671
4672 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4673 {
4674 if (realm->parent != parent) {
4675 ldout(cct, 10) << __func__ << " " << *realm
4676 << " " << realm->parent << " -> " << parent << dendl;
4677 realm->parent = parent;
4678 if (realm->pparent) {
4679 realm->pparent->pchildren.erase(realm);
4680 put_snap_realm(realm->pparent);
4681 }
4682 realm->pparent = get_snap_realm(parent);
4683 realm->pparent->pchildren.insert(realm);
4684 return true;
4685 }
4686 return false;
4687 }
4688
4689 static bool has_new_snaps(const SnapContext& old_snapc,
4690 const SnapContext& new_snapc)
4691 {
4692 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4693 }
4694
4695
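// Decode a snap trace (a sequence of SnapRealmInfo records from the MDS) and
// apply it: realms whose seq advanced get new snap lists and have their
// cached contexts (and their children's) invalidated; with 'flush' set,
// dirty inodes under an updated realm are queued for cap-snap writeback
// against the old snap context first.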
4696 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4697 {
4698 SnapRealm *first_realm = NULL;
4699 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4700
4701 map<SnapRealm*, SnapContext> dirty_realms;
4702
4703 auto p = bl.cbegin();
4704 while (!p.end()) {
4705 SnapRealmInfo info;
4706 decode(info, p);
4707 SnapRealm *realm = get_snap_realm(info.ino());
4708
4709 bool invalidate = false;
4710
4711 if (info.seq() > realm->seq) {
4712 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4713 << dendl;
4714
4715 if (flush) {
4716 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4717 // flush me + children
4718 list<SnapRealm*> q;
4719 q.push_back(realm);
4720 while (!q.empty()) {
4721 SnapRealm *realm = q.front();
4722 q.pop_front();
4723
4724 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4725 p != realm->pchildren.end();
4726 ++p)
4727 q.push_back(*p);
4728
4729 if (dirty_realms.count(realm) == 0) {
4730 realm->nref++;
4731 dirty_realms[realm] = realm->get_snap_context();
4732 }
4733 }
4734 }
4735
4736 // update
4737 realm->seq = info.seq();
4738 realm->created = info.created();
4739 realm->parent_since = info.parent_since();
4740 realm->prior_parent_snaps = info.prior_parent_snaps;
4741 realm->my_snaps = info.my_snaps;
4742 invalidate = true;
4743 }
4744
4745 // _always_ verify parent
4746 if (adjust_realm_parent(realm, info.parent()))
4747 invalidate = true;
4748
4749 if (invalidate) {
4750 invalidate_snaprealm_and_children(realm);
4751 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4752 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4753 } else {
4754 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4755 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4756 }
4757
4758 if (!first_realm)
4759 first_realm = realm;
4760 else
4761 put_snap_realm(realm);
4762 }
4763
4764 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4765 q != dirty_realms.end();
4766 ++q) {
4767 SnapRealm *realm = q->first;
4768 // are there new snaps?
4769 if (has_new_snaps(q->second, realm->get_snap_context())) {
4770 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4771 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4772 while (!r.end()) {
4773 Inode *in = *r;
4774 ++r;
4775 queue_cap_snap(in, q->second);
4776 }
4777 } else {
4778 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4779 }
4780 put_snap_realm(realm);
4781 }
4782
4783 if (realm_ret)
4784 *realm_ret = first_realm;
4785 else
4786 put_snap_realm(first_realm);
4787 }
4788
4789 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4790 {
4791 ldout(cct, 10) << __func__ << " " << *m << dendl;
4792 mds_rank_t mds = mds_rank_t(m->get_source().num());
4793 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4794 if (!session) {
4795 return;
4796 }
4797
4798 got_mds_push(session);
4799
4800 map<Inode*, SnapContext> to_move;
4801 SnapRealm *realm = 0;
4802
4803 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4804 ceph_assert(m->head.split);
4805 SnapRealmInfo info;
4806 auto p = m->bl.cbegin();
4807 decode(info, p);
4808 ceph_assert(info.ino() == m->head.split);
4809
4810 // flush, then move, the affected inodes.
4811 realm = get_snap_realm(info.ino());
4812 ldout(cct, 10) << " splitting off " << *realm << dendl;
4813 for (auto& ino : m->split_inos) {
4814 vinodeno_t vino(ino, CEPH_NOSNAP);
4815 if (inode_map.count(vino)) {
4816 Inode *in = inode_map[vino];
4817 if (!in->snaprealm || in->snaprealm == realm)
4818 continue;
4819 if (in->snaprealm->created > info.created()) {
4820 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4821 << *in->snaprealm << dendl;
4822 continue;
4823 }
4824 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4825
4826
4827 in->snaprealm_item.remove_myself();
4828 to_move[in] = in->snaprealm->get_snap_context();
4829 put_snap_realm(in->snaprealm);
4830 }
4831 }
4832
4833 // move child snaprealms, too
4834 for (auto& child_realm : m->split_realms) {
4835 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
4836 SnapRealm *child = get_snap_realm_maybe(child_realm);
4837 if (!child)
4838 continue;
4839 adjust_realm_parent(child, realm->ino);
4840 put_snap_realm(child);
4841 }
4842 }
4843
4844 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4845
4846 if (realm) {
4847 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4848 Inode *in = p->first;
4849 in->snaprealm = realm;
4850 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4851 realm->nref++;
4852 // queue for snap writeback
4853 if (has_new_snaps(p->second, realm->get_snap_context()))
4854 queue_cap_snap(in, p->second);
4855 }
4856 put_snap_realm(realm);
4857 }
4858 }
4859
4860 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4861 {
4862 mds_rank_t mds = mds_rank_t(m->get_source().num());
4863 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4864 if (!session) {
4865 return;
4866 }
4867
4868 got_mds_push(session);
4869
4870 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4871
4872 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4873 if (inode_map.count(vino)) {
4874 Inode *in = inode_map[vino];
4876
4877 if (in) {
4878 in->quota = m->quota;
4879 in->rstat = m->rstat;
4880 }
4881 }
4882 }
4883
4884 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4885 {
4886 mds_rank_t mds = mds_rank_t(m->get_source().num());
4887 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4888 if (!session) {
4889 return;
4890 }
4891
4892 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4893 // Pause RADOS operations until we see the required epoch
4894 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4895 }
4896
4897 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4898 // Record the barrier so that we will transmit it to MDS when releasing
4899 set_cap_epoch_barrier(m->osd_epoch_barrier);
4900 }
4901
4902 got_mds_push(session);
4903
4904 Inode *in;
4905 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4906 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4907 in = it->second;
4908 } else {
4909 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4910 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4911 session->enqueue_cap_release(
4912 m->get_ino(),
4913 m->get_cap_id(),
4914 m->get_seq(),
4915 m->get_mseq(),
4916 cap_epoch_barrier);
4917 } else {
4918 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4919 }
4920
4921 // in case the mds is waiting on e.g. a revocation
4922 flush_cap_releases();
4923 return;
4924 }
4925
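// EXPORT and FLUSHSNAP_ACK are handled entirely in their own handlers;
// IMPORT is processed first and then falls through to the per-cap dispatch
// below, since the import may have just installed the cap we look up next.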
4926 switch (m->get_op()) {
4927 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4928 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4929 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4930 }
4931
4932 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4933 Cap &cap = in->caps.at(mds);
4934
4935 switch (m->get_op()) {
4936 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4937 case CEPH_CAP_OP_IMPORT:
4938 case CEPH_CAP_OP_REVOKE:
4939 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4940 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4941 }
4942 } else {
4943 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4944 return;
4945 }
4946 }
4947
4948 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4949 {
4950 mds_rank_t mds = session->mds_num;
4951
4952 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4953 << " IMPORT from mds." << mds << dendl;
4954
4955 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4956 Cap *cap = NULL;
4957 UserPerm cap_perms;
4958 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
4959 cap = &it->second;
4960 cap_perms = cap->latest_perms;
4961 }
4962
4963 // add/update it
4964 SnapRealm *realm = NULL;
4965 update_snap_trace(m->snapbl, &realm);
4966
4967 int issued = m->get_caps();
4968 int wanted = m->get_wanted();
4969 add_update_cap(in, session, m->get_cap_id(),
4970 issued, wanted, m->get_seq(), m->get_mseq(),
4971 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
4972
4973 if (cap && cap->cap_id == m->peer.cap_id) {
4974 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4975 }
4976
4977 if (realm)
4978 put_snap_realm(realm);
4979
4980 if (in->auth_cap && in->auth_cap->session == session) {
4981 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
4982 in->requested_max_size > m->get_max_size()) {
4983 in->requested_max_size = 0;
4984 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
4985 }
4986 // reflush any/all caps (if we are now the auth_cap)
4987 kick_flushing_caps(in, session);
4988 }
4989 }
4990
4991 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4992 {
4993 mds_rank_t mds = session->mds_num;
4994
4995 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4996 << " EXPORT from mds." << mds << dendl;
4997
4998 auto it = in->caps.find(mds);
4999 if (it != in->caps.end()) {
5000 Cap &cap = it->second;
5001 if (cap.cap_id == m->get_cap_id()) {
5002 if (m->peer.cap_id) {
5003 const auto peer_mds = mds_rank_t(m->peer.mds);
5004 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
5005 auto it = in->caps.find(peer_mds);
5006 if (it != in->caps.end()) {
5007 Cap &tcap = it->second;
5008 if (tcap.cap_id == m->peer.cap_id &&
5009 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5010 tcap.cap_id = m->peer.cap_id;
5011 tcap.seq = m->peer.seq - 1;
5012 tcap.issue_seq = tcap.seq;
5013 tcap.issued |= cap.issued;
5014 tcap.implemented |= cap.issued;
5015 if (&cap == in->auth_cap)
5016 in->auth_cap = &tcap;
5017 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5018 adjust_session_flushing_caps(in, session, tsession);
5019 }
5020 } else {
5021 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
5022 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5023 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5024 cap.latest_perms);
5025 }
5026 } else {
5027 if (cap.wanted | cap.issued)
5028 in->flags |= I_CAP_DROPPED;
5029 }
5030
5031 remove_cap(&cap, false);
5032 }
5033 }
5034 }
5035
5036 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5037 {
5038 mds_rank_t mds = session->mds_num;
5039 ceph_assert(in->caps.count(mds));
5040
5041 ldout(cct, 10) << __func__ << " on ino " << *in
5042 << " size " << in->size << " -> " << m->get_size()
5043 << dendl;
5044
5045 int issued;
5046 in->caps_issued(&issued);
5047 issued |= in->caps_dirty();
5048 update_inode_file_size(in, issued, m->get_size(),
5049 m->get_truncate_seq(), m->get_truncate_size());
5050 }
5051
5052 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5053 {
5054 ceph_tid_t flush_ack_tid = m->get_client_tid();
5055 int dirty = m->get_dirty();
5056 int cleaned = 0;
5057 int flushed = 0;
5058
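// Walk the pending flush tids in order: entries with a zero cap set are
// cap-snap flushes (retired by handle_cap_flushsnap_ack, skip them here);
// everything with tid <= flush_ack_tid is considered acked and removed.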
5059 auto it = in->flushing_cap_tids.begin();
5060 if (it->first < flush_ack_tid) {
5061 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5062 << " got unexpected flush ack tid " << flush_ack_tid
5063 << " expected is " << it->first << dendl;
5064 }
5065 for (; it != in->flushing_cap_tids.end(); ) {
5066 if (!it->second) {
5067 // cap snap
5068 ++it;
5069 continue;
5070 }
5071 if (it->first == flush_ack_tid)
5072 cleaned = it->second;
5073 if (it->first <= flush_ack_tid) {
5074 session->flushing_caps_tids.erase(it->first);
5075 in->flushing_cap_tids.erase(it++);
5076 ++flushed;
5077 continue;
5078 }
5079 cleaned &= ~it->second;
5080 if (!cleaned)
5081 break;
5082 ++it;
5083 }
5084
5085 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5086 << " cleaned " << ccap_string(cleaned) << " on " << *in
5087 << " with " << ccap_string(dirty) << dendl;
5088
5089 if (flushed) {
5090 signal_cond_list(in->waitfor_caps);
5091 if (session->flushing_caps_tids.empty() ||
5092 *session->flushing_caps_tids.begin() > flush_ack_tid)
5093 sync_cond.notify_all();
5094 }
5095
5096 if (!dirty) {
5097 in->cap_dirtier_uid = -1;
5098 in->cap_dirtier_gid = -1;
5099 }
5100
5101 if (!cleaned) {
5102 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5103 } else {
5104 if (in->flushing_caps) {
5105 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5106 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5107 in->flushing_caps &= ~cleaned;
5108 if (in->flushing_caps == 0) {
5109 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5110 num_flushing_caps--;
5111 if (in->flushing_cap_tids.empty())
5112 in->flushing_cap_item.remove_myself();
5113 }
5114 if (!in->caps_dirty())
5115 put_inode(in);
5116 }
5117 }
5118 }
5119
5120
5121 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5122 {
5123 ceph_tid_t flush_ack_tid = m->get_client_tid();
5124 mds_rank_t mds = session->mds_num;
5125 ceph_assert(in->caps.count(mds));
5126 snapid_t follows = m->get_snap_follows();
5127
5128 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5129 auto& capsnap = it->second;
5130 if (flush_ack_tid != capsnap.flush_tid) {
5131 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5132 } else {
5133 InodeRef tmp_ref(in);
5134 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5135 << " on " << *in << dendl;
5136 session->flushing_caps_tids.erase(capsnap.flush_tid);
5137 in->flushing_cap_tids.erase(capsnap.flush_tid);
5138 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5139 in->flushing_cap_item.remove_myself();
5140 in->cap_snaps.erase(it);
5141
5142 signal_cond_list(in->waitfor_caps);
5143 if (session->flushing_caps_tids.empty() ||
5144 *session->flushing_caps_tids.begin() > flush_ack_tid)
5145 sync_cond.notify_all();
5146 }
5147 } else {
5148 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5149 << " on " << *in << dendl;
5150 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5151 }
5152 }
5153
5154 class C_Client_DentryInvalidate : public Context {
5155 private:
5156 Client *client;
5157 vinodeno_t dirino;
5158 vinodeno_t ino;
5159 string name;
5160 public:
5161 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5162 client(c), name(dn->name) {
5163 if (client->use_faked_inos()) {
5164 dirino.ino = dn->dir->parent_inode->faked_ino;
5165 if (del)
5166 ino.ino = dn->inode->faked_ino;
5167 } else {
5168 dirino = dn->dir->parent_inode->vino();
5169 if (del)
5170 ino = dn->inode->vino();
5171 }
5172 if (!del)
5173 ino.ino = inodeno_t();
5174 }
5175 void finish(int r) override {
5176 // _async_dentry_invalidate is responsible for its own locking
5177 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5178 client->_async_dentry_invalidate(dirino, ino, name);
5179 }
5180 };
5181
5182 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5183 {
5184 if (unmounting)
5185 return;
5186 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5187 << " in dir " << dirino << dendl;
5188 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5189 }
5190
5191 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5192 {
5193 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5194 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5195 }
5196
5197 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5198 {
5199 int ref = in->get_num_ref();
5200 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5201
5202 if (in->dir && !in->dir->dentries.empty()) {
5203 for (auto p = in->dir->dentries.begin();
5204 p != in->dir->dentries.end(); ) {
5205 Dentry *dn = p->second;
5206 ++p;
5207 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5208 * We don't need to invalidate dentries recursively, because invalidating
5209 * a directory dentry effectively invalidates the whole subtree. */
5211 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5212 _try_to_trim_inode(dn->inode.get(), false);
5213
5214 if (dn->lru_is_expireable())
5215 unlink(dn, true, false); // keep dir, drop dentry
5216 }
5217 if (in->dir->dentries.empty()) {
5218 close_dir(in->dir);
5219 --ref;
5220 }
5221 }
5222
5223 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5224 InodeRef snapdir = open_snapdir(in);
5225 _try_to_trim_inode(snapdir.get(), false);
5226 --ref;
5227 }
5228
5229 if (ref > 0) {
5230 auto q = in->dentries.begin();
5231 while (q != in->dentries.end()) {
5232 Dentry *dn = *q;
5233 ++q;
5234 if (in->ll_ref > 0 && sched_inval) {
5235 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5236 // so in->dentries doesn't always reflect the state of kernel's dcache.
5237 _schedule_invalidate_dentry_callback(dn, true);
5238 }
5239 unlink(dn, true, true);
5240 }
5241 }
5242 }
5243
5244 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5245 {
5246 mds_rank_t mds = session->mds_num;
5247 int used = get_caps_used(in);
5248 int wanted = in->caps_wanted();
5249
5250 const unsigned new_caps = m->get_caps();
5251 const bool was_stale = session->cap_gen > cap->gen;
5252 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5253 << " mds." << mds << " seq " << m->get_seq()
5254 << " caps now " << ccap_string(new_caps)
5255 << " was " << ccap_string(cap->issued)
5256 << (was_stale ? " (stale)" : "") << dendl;
5257
5258 if (was_stale)
5259 cap->issued = cap->implemented = CEPH_CAP_PIN;
5260 cap->seq = m->get_seq();
5261 cap->gen = session->cap_gen;
5262
5263 check_cap_issue(in, new_caps);
5264
5265 // update inode
5266 int issued;
5267 in->caps_issued(&issued);
5268 issued |= in->caps_dirty();
5269
5270 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5271 !(issued & CEPH_CAP_AUTH_EXCL)) {
5272 in->mode = m->head.mode;
5273 in->uid = m->head.uid;
5274 in->gid = m->head.gid;
5275 in->btime = m->btime;
5276 }
5277 bool deleted_inode = false;
5278 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5279 !(issued & CEPH_CAP_LINK_EXCL)) {
5280 in->nlink = m->head.nlink;
5281 if (in->nlink == 0 &&
5282 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5283 deleted_inode = true;
5284 }
5285 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5286 m->xattrbl.length() &&
5287 m->head.xattr_version > in->xattr_version) {
5288 auto p = m->xattrbl.cbegin();
5289 decode(in->xattrs, p);
5290 in->xattr_version = m->head.xattr_version;
5291 }
5292
5293 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5294 in->dirstat.nfiles = m->get_nfiles();
5295 in->dirstat.nsubdirs = m->get_nsubdirs();
5296 }
5297
5298 if (new_caps & CEPH_CAP_ANY_RD) {
5299 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5300 m->get_ctime(), m->get_mtime(), m->get_atime());
5301 }
5302
5303 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5304 in->layout = m->get_layout();
5305 update_inode_file_size(in, issued, m->get_size(),
5306 m->get_truncate_seq(), m->get_truncate_size());
5307 }
5308
5309 if (m->inline_version > in->inline_version) {
5310 in->inline_data = m->inline_data;
5311 in->inline_version = m->inline_version;
5312 }
5313
5314 /* always take a newer change attr */
5315 if (m->get_change_attr() > in->change_attr)
5316 in->change_attr = m->get_change_attr();
5317
5318 // max_size
5319 if (cap == in->auth_cap &&
5320 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5321 (m->get_max_size() != in->max_size)) {
5322 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5323 in->max_size = m->get_max_size();
5324 if (in->max_size > in->wanted_max_size) {
5325 in->wanted_max_size = 0;
5326 in->requested_max_size = 0;
5327 }
5328 }
5329
5330 bool check = false;
5331 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5332 (wanted & ~(cap->wanted | new_caps))) {
5333 // If the MDS is importing the cap, prior cap messages that update 'wanted'
5334 // may get dropped by the MDS (migrate seq mismatch).
5335 //
5336 // We don't send a cap message to update 'wanted' if what we want is
5337 // already issued. If the MDS revokes caps, the cap message releasing them
5338 // also tells the MDS what we want. But if caps were revoked forcibly
5339 // (stale session), we may not have told the MDS what we want.
5340 check = true;
5341 }
5342
5343
5344 // update caps
5345 auto revoked = cap->issued & ~new_caps;
5346 if (revoked) {
5347 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5348 cap->issued = new_caps;
5349 cap->implemented |= new_caps;
5350
5351 // recall delegations if we're losing caps necessary for them
5352 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5353 in->recall_deleg(false);
5354 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5355 in->recall_deleg(true);
5356
5357 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5358 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5359 !_flush(in, new C_Client_FlushComplete(this, in))) {
5360 // waiting for the flush to complete
5361 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5362 if (_release(in))
5363 check = true;
5364 } else {
5365 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5366 check = true;
5367 }
5368 } else if (cap->issued == new_caps) {
5369 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5370 } else {
5371 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5372 cap->issued = new_caps;
5373 cap->implemented |= new_caps;
5374
5375 if (cap == in->auth_cap) {
5376 // is a non-auth MDS revoking the newly granted caps?
5377 for (const auto &p : in->caps) {
5378 if (&p.second == cap)
5379 continue;
5380 if (p.second.implemented & ~p.second.issued & new_caps) {
5381 check = true;
5382 break;
5383 }
5384 }
5385 }
5386 }
5387
5388 if (check)
5389 check_caps(in, 0);
5390
5391 // wake up waiters
5392 if (new_caps)
5393 signal_cond_list(in->waitfor_caps);
5394
5395 // may drop inode's last ref
5396 if (deleted_inode)
5397 _try_to_trim_inode(in, true);
5398 }
5399
5400 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5401 {
5402 if (perms.uid() == 0)
5403 return 0;
5404
5405 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5406 int ret = _posix_acl_permission(in, perms, want);
5407 if (ret != -EAGAIN)
5408 return ret;
5409 }
5410
5411 // check permissions before doing anything else
5412 if (!in->check_mode(perms, want))
5413 return -EACCES;
5414 return 0;
5415 }
5416
5417 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5418 const UserPerm& perms)
5419 {
5420 int r = _getattr_for_perm(in, perms);
5421 if (r < 0)
5422 goto out;
5423
5424 r = 0;
5425 if (strncmp(name, "system.", 7) == 0) {
5426 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5427 r = -EPERM;
5428 } else {
5429 r = inode_permission(in, perms, want);
5430 }
5431 out:
5432 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5433 return r;
5434 }
5435
5436 ostream& operator<<(ostream &out, const UserPerm& perm) {
5437 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5438 return out;
5439 }
5440
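// The checks below roughly mirror POSIX setattr rules: only root or the
// owner may change ownership or mode (chgrp also requires membership in the
// target group, and setting a group we're not in drops S_ISGID); explicit
// timestamps require ownership, while "set to now" only needs write
// permission.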
5441 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5442 const UserPerm& perms)
5443 {
5444 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5445 int r = _getattr_for_perm(in, perms);
5446 if (r < 0)
5447 goto out;
5448
5449 if (mask & CEPH_SETATTR_SIZE) {
5450 r = inode_permission(in, perms, MAY_WRITE);
5451 if (r < 0)
5452 goto out;
5453 }
5454
5455 r = -EPERM;
5456 if (mask & CEPH_SETATTR_UID) {
5457 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5458 goto out;
5459 }
5460 if (mask & CEPH_SETATTR_GID) {
5461 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5462 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5463 goto out;
5464 }
5465
5466 if (mask & CEPH_SETATTR_MODE) {
5467 if (perms.uid() != 0 && perms.uid() != in->uid)
5468 goto out;
5469
5470 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5471 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5472 stx->stx_mode &= ~S_ISGID;
5473 }
5474
5475 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5476 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5477 if (perms.uid() != 0 && perms.uid() != in->uid) {
5478 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5479 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5480 check_mask |= CEPH_SETATTR_MTIME;
5481 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5482 check_mask |= CEPH_SETATTR_ATIME;
5483 if (check_mask & mask) {
5484 goto out;
5485 } else {
5486 r = inode_permission(in, perms, MAY_WRITE);
5487 if (r < 0)
5488 goto out;
5489 }
5490 }
5491 }
5492 r = 0;
5493 out:
5494 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5495 return r;
5496 }
5497
5498 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5499 {
5500 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5501 unsigned want = 0;
5502
5503 if ((flags & O_ACCMODE) == O_WRONLY)
5504 want = MAY_WRITE;
5505 else if ((flags & O_ACCMODE) == O_RDWR)
5506 want = MAY_READ | MAY_WRITE;
5507 else if ((flags & O_ACCMODE) == O_RDONLY)
5508 want = MAY_READ;
5509 if (flags & O_TRUNC)
5510 want |= MAY_WRITE;
5511
5512 int r = 0;
5513 switch (in->mode & S_IFMT) {
5514 case S_IFLNK:
5515 r = -ELOOP;
5516 goto out;
5517 case S_IFDIR:
5518 if (want & MAY_WRITE) {
5519 r = -EISDIR;
5520 goto out;
5521 }
5522 break;
5523 }
5524
5525 r = _getattr_for_perm(in, perms);
5526 if (r < 0)
5527 goto out;
5528
5529 r = inode_permission(in, perms, want);
5530 out:
5531 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5532 return r;
5533 }
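
// A worked example of the O_ACCMODE mapping above (values illustrative):
//
//   may_open(in, O_RDWR | O_TRUNC, perms)  // want = MAY_READ | MAY_WRITE
//   may_open(in, O_WRONLY, perms)          // want = MAY_WRITE
//   may_open(in, O_RDONLY, perms)          // want = MAY_READ; -ELOOP if a symlink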
5534
5535 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5536 {
5537 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5538 int r = _getattr_for_perm(dir, perms);
5539 if (r < 0)
5540 goto out;
5541
5542 r = inode_permission(dir, perms, MAY_EXEC);
5543 out:
5544 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5545 return r;
5546 }
5547
5548 int Client::may_create(Inode *dir, const UserPerm& perms)
5549 {
5550 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5551 int r = _getattr_for_perm(dir, perms);
5552 if (r < 0)
5553 goto out;
5554
5555 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5556 out:
5557 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5558 return r;
5559 }
5560
5561 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5562 {
5563 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5564 int r = _getattr_for_perm(dir, perms);
5565 if (r < 0)
5566 goto out;
5567
5568 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5569 if (r < 0)
5570 goto out;
5571
5572 /* 'name == NULL' means rmsnap */
5573 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5574 InodeRef otherin;
5575 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5576 if (r < 0)
5577 goto out;
5578 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5579 r = -EPERM;
5580 }
5581 out:
5582 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5583 return r;
5584 }
5585
5586 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5587 {
5588 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5589 int r = _getattr_for_perm(in, perms);
5590 if (r < 0)
5591 goto out;
5592
5593 if (perms.uid() == 0 || perms.uid() == in->uid) {
5594 r = 0;
5595 goto out;
5596 }
5597
5598 r = -EPERM;
5599 if (!S_ISREG(in->mode))
5600 goto out;
5601
5602 if (in->mode & S_ISUID)
5603 goto out;
5604
5605 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5606 goto out;
5607
5608 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5609 out:
5610 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5611 return r;
5612 }
5613
5614 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5615 {
5616 int mask = CEPH_STAT_CAP_MODE;
5617 bool force = false;
5618 if (acl_type != NO_ACL) {
5619 mask |= CEPH_STAT_CAP_XATTR;
5620 force = in->xattr_version == 0;
5621 }
5622 return _getattr(in, mask, perms, force);
5623 }
5624
5625 vinodeno_t Client::_get_vino(Inode *in)
5626 {
5627 /* The caller must hold the client lock */
5628 return vinodeno_t(in->ino, in->snapid);
5629 }
5630
5631 /**
5632 * Resolve an MDS spec to a list of MDS daemon GIDs.
5633 *
5634 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5635 * It may be '*' in which case it matches all GIDs.
5636 *
5637 * If no error is returned, the `targets` vector will be populated with at least
5638 * one MDS.
5639 */
5640 int Client::resolve_mds(
5641 const std::string &mds_spec,
5642 std::vector<mds_gid_t> *targets)
5643 {
5644 ceph_assert(fsmap);
5645 ceph_assert(targets != nullptr);
5646
5647 mds_role_t role;
5648 std::stringstream ss;
5649 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5650 if (role_r == 0) {
5651 // We got a role, resolve it to a GID
5652 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5653 << role << "'" << dendl;
5654 targets->push_back(
5655 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5656 return 0;
5657 }
5658
5659 std::string strtol_err;
5660 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5661 if (strtol_err.empty()) {
5662 // It is a possible GID
5663 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5664 if (fsmap->gid_exists(mds_gid)) {
5665 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5666 targets->push_back(mds_gid);
5667 } else {
5668 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5669 << dendl;
5670 return -ENOENT;
5671 }
5672 } else if (mds_spec == "*") {
5673 // It is a wildcard: use all MDSs
5674 const auto mds_info = fsmap->get_mds_info();
5675
5676 if (mds_info.empty()) {
5677 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5678 return -ENOENT;
5679 }
5680
5681 for (const auto& i : mds_info) {
5682 targets->push_back(i.first);
5683 }
5684 } else {
5685 // It did not parse as an integer, it is not a wildcard, it must be a name
5686 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5687 if (mds_gid == 0) {
5688 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5689
5690 lderr(cct) << "FSMap: " << *fsmap << dendl;
5691
5692 return -ENOENT;
5693 } else {
5694 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5695 << "' to GID " << mds_gid << dendl;
5696 targets->push_back(mds_gid);
5697 }
5698 }
5699
5700 return 0;
5701 }
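
// Example specs accepted above, in parsing order: role first, then numeric
// GID, then the wildcard, then daemon name. The concrete values here are
// illustrative, not from a real map:
//
//   resolve_mds("0", &targets);        // rank 0 of the default filesystem
//   resolve_mds("myfs:0", &targets);   // filesystem:rank role
//   resolve_mds("4242", &targets);     // a daemon GID, if it exists
//   resolve_mds("*", &targets);        // every MDS in the FSMap
//   resolve_mds("a", &targets);        // a daemon by name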
5702
5703
5704 /**
5705 * Authenticate with mon and establish global ID
5706 */
5707 int Client::authenticate()
5708 {
5709 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5710
5711 if (monclient->is_authenticated()) {
5712 return 0;
5713 }
5714
5715 client_lock.unlock();
5716 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5717 client_lock.lock();
5718 if (r < 0) {
5719 return r;
5720 }
5721
5722 whoami = monclient->get_global_id();
5723 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5724
5725 return 0;
5726 }
5727
5728 int Client::fetch_fsmap(bool user)
5729 {
5730 int r;
5731 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5732 // rather than MDSMap because no single MDSMap contains all the daemons, and
5733 // a `tell` can address any daemon.
5734 version_t fsmap_latest;
5735 do {
5736 C_SaferCond cond;
5737 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5738 client_lock.unlock();
5739 r = cond.wait();
5740 client_lock.lock();
5741 } while (r == -EAGAIN);
5742
5743 if (r < 0) {
5744 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5745 return r;
5746 }
5747
5748 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5749
5750 if (user) {
5751 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5752 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5753 monclient->renew_subs();
5754 wait_on_list(waiting_for_fsmap);
5755 }
5756 ceph_assert(fsmap_user);
5757 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5758 } else {
5759 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5760 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5761 monclient->renew_subs();
5762 wait_on_list(waiting_for_fsmap);
5763 }
5764 ceph_assert(fsmap);
5765 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5766 }
5767 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5768 << fsmap_latest << dendl;
5769 return 0;
5770 }
5771
5772 /**
5773 * Send a command to one or more MDS daemons.
5774 *
5775 * @param mds_spec one of ID, rank, GID, "*"
5776 */
5777 int Client::mds_command(
5778 const std::string &mds_spec,
5779 const vector<string>& cmd,
5780 const bufferlist& inbl,
5781 bufferlist *outbl,
5782 string *outs,
5783 Context *onfinish)
5784 {
5785 std::lock_guard lock(client_lock);
5786
5787 if (!initialized)
5788 return -ENOTCONN;
5789
5790 int r;
5791 r = authenticate();
5792 if (r < 0) {
5793 return r;
5794 }
5795
5796 r = fetch_fsmap(false);
5797 if (r < 0) {
5798 return r;
5799 }
5800
5801 // Look up MDS target(s) of the command
5802 std::vector<mds_gid_t> targets;
5803 r = resolve_mds(mds_spec, &targets);
5804 if (r < 0) {
5805 return r;
5806 }
5807
5808 // If daemons are laggy, we won't send them commands. If all
5809 // are laggy then we fail.
5810 std::vector<mds_gid_t> non_laggy;
5811 for (const auto gid : targets) {
5812 const auto info = fsmap->get_info_gid(gid);
5813 if (!info.laggy()) {
5814 non_laggy.push_back(gid);
5815 }
5816 }
5817 if (non_laggy.empty()) {
5818 *outs = "All targeted MDS daemons are laggy";
5819 return -ENOENT;
5820 }
5821
5822 if (metadata.empty()) {
5823 // We are called on an unmounted client, so metadata
5824 // won't be initialized yet.
5825 populate_metadata("");
5826 }
5827
5828 // Send commands to targets
5829 C_GatherBuilder gather(cct, onfinish);
5830 for (const auto target_gid : non_laggy) {
5831 const auto info = fsmap->get_info_gid(target_gid);
5832
5833 // Open a connection to the target MDS
5834 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5835
5836 // Generate MDSCommandOp state
5837 auto &op = command_table.start_command();
5838
5839 op.on_finish = gather.new_sub();
5840 op.cmd = cmd;
5841 op.outbl = outbl;
5842 op.outs = outs;
5843 op.inbl = inbl;
5844 op.mds_gid = target_gid;
5845 op.con = conn;
5846
5847 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5848 << " tid=" << op.tid << cmd << dendl;
5849
5850 // Construct and send MCommand
5851 auto m = op.get_message(monclient->get_fsid());
5852 conn->send_message2(std::move(m));
5853 }
5854 gather.activate();
5855
5856 return 0;
5857 }
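
// A minimal sketch of driving mds_command() synchronously. The `client`
// pointer and the command string are assumptions; any JSON command the MDS
// understands would do:
//
//   C_SaferCond cond;
//   bufferlist outbl;
//   std::string outs;
//   std::vector<std::string> cmd = {"{\"prefix\": \"session ls\"}"};
//   int r = client->mds_command("*", cmd, {}, &outbl, &outs, &cond);
//   if (r == 0)
//     r = cond.wait();  // fires once every targeted MDS has replied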
5858
5859 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5860 {
5861 ceph_tid_t const tid = m->get_tid();
5862
5863 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5864
5865 if (!command_table.exists(tid)) {
5866 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5867 return;
5868 }
5869
5870 auto &op = command_table.get_command(tid);
5871 if (op.outbl) {
5872 *op.outbl = m->get_data();
5873 }
5874 if (op.outs) {
5875 *op.outs = m->rs;
5876 }
5877
5878 if (op.on_finish) {
5879 op.on_finish->complete(m->r);
5880 }
5881
5882 command_table.erase(tid);
5883 }
5884
5885 // -------------------
5886 // MOUNT
5887
5888 int Client::subscribe_mdsmap(const std::string &fs_name)
5889 {
5890 int r = authenticate();
5891 if (r < 0) {
5892 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5893 return r;
5894 }
5895
5896 std::string resolved_fs_name;
5897 if (fs_name.empty()) {
5898 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5899 if (resolved_fs_name.empty())
5900 // Try the backwards compatibility fs name option
5901 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5902 } else {
5903 resolved_fs_name = fs_name;
5904 }
5905
5906 std::string want = "mdsmap";
5907 if (!resolved_fs_name.empty()) {
5908 r = fetch_fsmap(true);
5909 if (r < 0)
5910 return r;
5911 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5912 if (fscid == FS_CLUSTER_ID_NONE) {
5913 return -ENOENT;
5914 }
5915
5916 std::ostringstream oss;
5917 oss << want << "." << fscid;
5918 want = oss.str();
5919 }
5920 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5921
5922 monclient->sub_want(want, 0, 0);
5923 monclient->renew_subs();
5924
5925 return 0;
5926 }
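
// With a named filesystem the subscription key becomes "mdsmap.<fscid>",
// e.g. "mdsmap.1" for fscid 1 (value illustrative); otherwise it stays the
// bare "mdsmap".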
5927
5928 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5929 bool require_mds, const std::string &fs_name)
5930 {
5931 std::lock_guard lock(client_lock);
5932
5933 if (mounted) {
5934 ldout(cct, 5) << "already mounted" << dendl;
5935 return 0;
5936 }
5937
5938 unmounting = false;
5939
5940 int r = subscribe_mdsmap(fs_name);
5941 if (r < 0) {
5942 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
5943 return r;
5944 }
5945
5946 tick(); // start tick
5947
5948 if (require_mds) {
5949 while (1) {
5950 auto availability = mdsmap->is_cluster_available();
5951 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5952 // Error out
5953 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5954 return CEPH_FUSE_NO_MDS_UP;
5955 } else if (availability == MDSMap::AVAILABLE) {
5956 // Continue to mount
5957 break;
5958 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5959 // Else, wait. MDSMonitor will update the map to bring
5960 // us to a conclusion eventually.
5961 wait_on_list(waiting_for_mdsmap);
5962 } else {
5963 // Unexpected value!
5964 ceph_abort();
5965 }
5966 }
5967 }
5968
5969 populate_metadata(mount_root.empty() ? "/" : mount_root);
5970
5971 filepath fp(CEPH_INO_ROOT);
5972 if (!mount_root.empty()) {
5973 fp = filepath(mount_root.c_str());
5974 }
5975 while (true) {
5976 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5977 req->set_filepath(fp);
5978 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5979 int res = make_request(req, perms);
5980 if (res < 0) {
5981 if (res == -EACCES && root) {
5982 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5983 break;
5984 }
5985 return res;
5986 }
5987
5988 if (fp.depth())
5989 fp.pop_dentry();
5990 else
5991 break;
5992 }
5993
5994 ceph_assert(root);
5995 _ll_get(root);
5996
5997 mounted = true;
5998
5999 // trace?
6000 if (!cct->_conf->client_trace.empty()) {
6001 traceout.open(cct->_conf->client_trace.c_str());
6002 if (traceout.is_open()) {
6003 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6004 } else {
6005 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6006 }
6007 }
6008
6009 /*
6010 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6011 ldout(cct, 3) << "op: struct stat st;" << dendl;
6012 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6013 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6014 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6015 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6016 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6017 ldout(cct, 3) << "op: int fd;" << dendl;
6018 */
6019 return 0;
6020 }
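
// A minimal mount sketch, assuming an initialized Client* `client` and a
// filesystem named "cephfs" (both assumptions):
//
//   UserPerm perms(getuid(), getgid());
//   int r = client->mount("/", perms, false /* require_mds */, "cephfs");
//   if (r < 0)
//     return r;  // e.g. CEPH_FUSE_NO_MDS_UP when require_mds is set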
6021
6022 // UNMOUNT
6023
6024 void Client::_close_sessions()
6025 {
6026 while (!mds_sessions.empty()) {
6027 // send session closes!
6028 for (auto &p : mds_sessions) {
6029 if (p.second.state != MetaSession::STATE_CLOSING) {
6030 _close_mds_session(&p.second);
6031 }
6032 }
6033
6034 // wait for sessions to close
6035 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
6036 std::unique_lock l{client_lock, std::adopt_lock};
6037 mount_cond.wait(l);
6038 l.release();
6039 }
6040 }
6041
6042 void Client::flush_mdlog_sync()
6043 {
6044 if (mds_requests.empty())
6045 return;
6046 for (auto &p : mds_sessions) {
6047 flush_mdlog(&p.second);
6048 }
6049 }
6050
6051 void Client::flush_mdlog(MetaSession *session)
6052 {
6053 // Only send this to Luminous or newer MDS daemons; older daemons
6054 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6055 const uint64_t features = session->con->get_features();
6056 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6057 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6058 session->con->send_message2(std::move(m));
6059 }
6060 }
6061
6062
6063 void Client::_abort_mds_sessions(int err)
6064 {
6065 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6066 auto req = p->second;
6067 ++p;
6068 // unsafe requests will be removed during close session below.
6069 if (req->got_unsafe)
6070 continue;
6071
6072 req->abort(err);
6073 if (req->caller_cond) {
6074 req->kick = true;
6075 req->caller_cond->notify_all();
6076 }
6077 }
6078
6079 // Process aborts on any requests that were on this waitlist.
6080 // Any requests that were on a waiting_for_open session waitlist
6081 // will get kicked during close session below.
6082 signal_cond_list(waiting_for_mdsmap);
6083
6084 // Force-close all sessions
6085 while(!mds_sessions.empty()) {
6086 auto& session = mds_sessions.begin()->second;
6087 _closed_mds_session(&session);
6088 }
6089 }
6090
6091 void Client::_unmount(bool abort)
6092 {
6093 std::unique_lock lock{client_lock, std::adopt_lock};
6094 if (unmounting)
6095 return;
6096
6097 if (abort || blacklisted) {
6098 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
6099 } else {
6100 ldout(cct, 2) << "unmounting" << dendl;
6101 }
6102 unmounting = true;
6103
6104 deleg_timeout = 0;
6105
6106 if (abort) {
6107 // Abort all mds sessions
6108 _abort_mds_sessions(-ENOTCONN);
6109
6110 objecter->op_cancel_writes(-ENOTCONN);
6111 } else {
6112 // flush the mdlog for pending requests, if any
6113 flush_mdlog_sync();
6114 }
6115
6116 mount_cond.wait(lock, [this] {
6117 if (!mds_requests.empty()) {
6118 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6119 << dendl;
6120 }
6121 return mds_requests.empty();
6122 });
6123 if (tick_event)
6124 timer.cancel_event(tick_event);
6125 tick_event = 0;
6126
6127 cwd.reset();
6128
6129 // clean up any unclosed files
6130 while (!fd_map.empty()) {
6131 Fh *fh = fd_map.begin()->second;
6132 fd_map.erase(fd_map.begin());
6133 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6134 _release_fh(fh);
6135 }
6136
6137 while (!ll_unclosed_fh_set.empty()) {
6138 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6139 Fh *fh = *it;
6140 ll_unclosed_fh_set.erase(fh);
6141 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6142 _release_fh(fh);
6143 }
6144
6145 while (!opened_dirs.empty()) {
6146 dir_result_t *dirp = *opened_dirs.begin();
6147 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6148 _closedir(dirp);
6149 }
6150
6151 _ll_drop_pins();
6152
6153 mount_cond.wait(lock, [this] {
6154 if (unsafe_sync_write > 0) {
6155 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
6156 << dendl;
6157 }
6158 return unsafe_sync_write <= 0;
6159 });
6160
6161 if (cct->_conf->client_oc) {
6162 // flush/release all buffered data
6163 std::list<InodeRef> anchor;
6164 for (auto& p : inode_map) {
6165 Inode *in = p.second;
6166 if (!in) {
6167 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6168 ceph_assert(in);
6169 }
6170
6171 // prevent inode from getting freed
6172 anchor.emplace_back(in);
6173
6174 if (abort || blacklisted) {
6175 objectcacher->purge_set(&in->oset);
6176 } else if (!in->caps.empty()) {
6177 _release(in);
6178 _flush(in, new C_Client_FlushComplete(this, in));
6179 }
6180 }
6181 }
6182
6183 if (abort || blacklisted) {
6184 for (auto p = dirty_list.begin(); !p.end(); ) {
6185 Inode *in = *p;
6186 ++p;
6187 if (in->dirty_caps) {
6188 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6189 in->mark_caps_clean();
6190 put_inode(in);
6191 }
6192 }
6193 } else {
6194 flush_caps_sync();
6195 wait_sync_caps(last_flush_tid);
6196 }
6197
6198 // empty lru cache
6199 trim_cache();
6200
6201 while (lru.lru_get_size() > 0 ||
6202 !inode_map.empty()) {
6203 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6204 << "+" << inode_map.size() << " items"
6205 << ", waiting (for caps to release?)"
6206 << dendl;
6207 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6208 r == std::cv_status::timeout) {
6209 dump_cache(NULL);
6210 }
6211 }
6212 ceph_assert(lru.lru_get_size() == 0);
6213 ceph_assert(inode_map.empty());
6214
6215 // stop tracing
6216 if (!cct->_conf->client_trace.empty()) {
6217 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6218 traceout.close();
6219 }
6220
6221 _close_sessions();
6222
6223 mounted = false;
6224
6225 lock.release();
6226 ldout(cct, 2) << "unmounted." << dendl;
6227 }
6228
6229 void Client::unmount()
6230 {
6231 std::lock_guard lock(client_lock);
6232 _unmount(false);
6233 }
6234
6235 void Client::abort_conn()
6236 {
6237 std::lock_guard lock(client_lock);
6238 _unmount(true);
6239 }
6240
6241 void Client::flush_cap_releases()
6242 {
6243 // send any cap releases
6244 for (auto &p : mds_sessions) {
6245 auto &session = p.second;
6246 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6247 p.first)) {
6248 if (cct->_conf->client_inject_release_failure) {
6249 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6250 } else {
6251 session.con->send_message2(std::move(session.release));
6252 }
6253 session.release.reset();
6254 }
6255 }
6256 }
6257
6258 void Client::tick()
6259 {
6260 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6261 sleep(cct->_conf->client_debug_inject_tick_delay);
6262 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6263 cct->_conf.apply_changes(nullptr);
6264 }
6265
6266 ldout(cct, 21) << "tick" << dendl;
6267 tick_event = timer.add_event_after(
6268 cct->_conf->client_tick_interval,
6269 new LambdaContext([this](int) {
6270 // Called back via Timer, which takes client_lock for us
6271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6272 tick();
6273 }));
6274 utime_t now = ceph_clock_now();
6275
6276 if (!mounted && !mds_requests.empty()) {
6277 MetaRequest *req = mds_requests.begin()->second;
6278 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6279 req->abort(-ETIMEDOUT);
6280 if (req->caller_cond) {
6281 req->kick = true;
6282 req->caller_cond->notify_all();
6283 }
6284 signal_cond_list(waiting_for_mdsmap);
6285 for (auto &p : mds_sessions) {
6286 signal_context_list(p.second.waiting_for_open);
6287 }
6288 }
6289 }
6290
6291 if (mdsmap->get_epoch()) {
6292 // renew caps?
6293 utime_t el = now - last_cap_renew;
6294 if (el > mdsmap->get_session_timeout() / 3.0)
6295 renew_caps();
6296
6297 flush_cap_releases();
6298 }
6299
6300 // delayed caps
6301 xlist<Inode*>::iterator p = delayed_list.begin();
6302 while (!p.end()) {
6303 Inode *in = *p;
6304 ++p;
6305 if (in->hold_caps_until > now)
6306 break;
6307 delayed_list.pop_front();
6308 check_caps(in, CHECK_CAPS_NODELAY);
6309 }
6310
6311 trim_cache(true);
6312 }
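
// A worked example of the renewal cadence above, assuming default config:
// with mds_session_timeout = 60s, renew_caps() runs once more than
// 60 / 3 = 20 seconds have elapsed since last_cap_renew, re-checked on
// every client_tick_interval (1s) tick.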
6313
6314 void Client::renew_caps()
6315 {
6316 ldout(cct, 10) << "renew_caps()" << dendl;
6317 last_cap_renew = ceph_clock_now();
6318
6319 for (auto &p : mds_sessions) {
6320 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6321 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6322 renew_caps(&p.second);
6323 }
6324 }
6325
6326 void Client::renew_caps(MetaSession *session)
6327 {
6328 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6329 session->last_cap_renew_request = ceph_clock_now();
6330 uint64_t seq = ++session->cap_renew_seq;
6331 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6332 }
6333
6334
6335 // ===============================================================
6336 // high level (POSIXy) interface
6337
6338 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6339 InodeRef *target, const UserPerm& perms)
6340 {
6341 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6342 MetaRequest *req = new MetaRequest(op);
6343 filepath path;
6344 dir->make_nosnap_relative_path(path);
6345 path.push_dentry(name);
6346 req->set_filepath(path);
6347 req->set_inode(dir);
6348 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6349 mask |= DEBUG_GETATTR_CAPS;
6350 req->head.args.getattr.mask = mask;
6351
6352 ldout(cct, 10) << __func__ << " on " << path << dendl;
6353
6354 int r = make_request(req, perms, target);
6355 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6356 return r;
6357 }
6358
6359 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6360 const UserPerm& perms)
6361 {
6362 int r = 0;
6363 Dentry *dn = NULL;
6364
6365 if (dname == "..") {
6366 if (dir->dentries.empty()) {
6367 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6368 filepath path(dir->ino);
6369 req->set_filepath(path);
6370
6371 InodeRef tmptarget;
6372 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6373
6374 if (r == 0) {
6375 Inode *tempino = tmptarget.get();
6376 _ll_get(tempino);
6377 *target = tempino;
6378 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6379 } else {
6380 *target = dir;
6381 }
6382 }
6383 else
6384 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6385 goto done;
6386 }
6387
6388 if (dname == ".") {
6389 *target = dir;
6390 goto done;
6391 }
6392
6393 if (!dir->is_dir()) {
6394 r = -ENOTDIR;
6395 goto done;
6396 }
6397
6398 if (dname.length() > NAME_MAX) {
6399 r = -ENAMETOOLONG;
6400 goto done;
6401 }
6402
6403 if (dname == cct->_conf->client_snapdir &&
6404 dir->snapid == CEPH_NOSNAP) {
6405 *target = open_snapdir(dir);
6406 goto done;
6407 }
6408
6409 if (dir->dir &&
6410 dir->dir->dentries.count(dname)) {
6411 dn = dir->dir->dentries[dname];
6412
6413 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6414 << " seq " << dn->lease_seq
6415 << dendl;
6416
6417 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6418 // is dn lease valid?
6419 utime_t now = ceph_clock_now();
6420 if (dn->lease_mds >= 0 &&
6421 dn->lease_ttl > now &&
6422 mds_sessions.count(dn->lease_mds)) {
6423 MetaSession &s = mds_sessions.at(dn->lease_mds);
6424 if (s.cap_ttl > now &&
6425 s.cap_gen == dn->lease_gen) {
6426 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6427 // make trim_caps() behave.
6428 dir->try_touch_cap(dn->lease_mds);
6429 goto hit_dn;
6430 }
6431 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6432 << " vs lease_gen " << dn->lease_gen << dendl;
6433 }
6434 // dir shared caps?
6435 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6436 if (dn->cap_shared_gen == dir->shared_gen &&
6437 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6438 goto hit_dn;
6439 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6440 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6441 << *dir << " dn '" << dname << "'" << dendl;
6442 return -ENOENT;
6443 }
6444 }
6445 } else {
6446 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6447 }
6448 } else {
6449 // can we conclude ENOENT locally?
6450 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6451 (dir->flags & I_COMPLETE)) {
6452 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6453 return -ENOENT;
6454 }
6455 }
6456
6457 r = _do_lookup(dir, dname, mask, target, perms);
6458 goto done;
6459
6460 hit_dn:
6461 if (dn->inode) {
6462 *target = dn->inode;
6463 } else {
6464 r = -ENOENT;
6465 }
6466 touch_dn(dn);
6467
6468 done:
6469 if (r < 0)
6470 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6471 else
6472 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6473 return r;
6474 }
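
// To summarize the fast path above: a cached dentry is trusted when either
// (a) its MDS lease is live -- lease_ttl in the future and the session's
// cap_gen matching the dentry's lease_gen -- or (b) the directory holds Fs
// (FILE_SHARED) caps and the dentry's cap_shared_gen matches; with Fs plus
// I_COMPLETE, a missing name is answered -ENOENT without an RPC.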
6475
6476 int Client::get_or_create(Inode *dir, const char* name,
6477 Dentry **pdn, bool expect_null)
6478 {
6479 // lookup
6480 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6481 dir->open_dir();
6482 if (dir->dir->dentries.count(name)) {
6483 Dentry *dn = dir->dir->dentries[name];
6484
6485 // is dn lease valid?
6486 utime_t now = ceph_clock_now();
6487 if (dn->inode &&
6488 dn->lease_mds >= 0 &&
6489 dn->lease_ttl > now &&
6490 mds_sessions.count(dn->lease_mds)) {
6491 MetaSession &s = mds_sessions.at(dn->lease_mds);
6492 if (s.cap_ttl > now &&
6493 s.cap_gen == dn->lease_gen) {
6494 if (expect_null)
6495 return -EEXIST;
6496 }
6497 }
6498 *pdn = dn;
6499 } else {
6500 // otherwise link up a new one
6501 *pdn = link(dir->dir, name, NULL, NULL);
6502 }
6503
6504 // success
6505 return 0;
6506 }
6507
6508 int Client::path_walk(const filepath& origpath, InodeRef *end,
6509 const UserPerm& perms, bool followsym, int mask)
6510 {
6511 filepath path = origpath;
6512 InodeRef cur;
6513 if (origpath.absolute())
6514 cur = root;
6515 else
6516 cur = cwd;
6517 ceph_assert(cur);
6518
6519 ldout(cct, 10) << __func__ << " " << path << dendl;
6520
6521 int symlinks = 0;
6522
6523 unsigned i=0;
6524 while (i < path.depth() && cur) {
6525 int caps = 0;
6526 const string &dname = path[i];
6527 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6528 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6529 InodeRef next;
6530 if (cct->_conf->client_permissions) {
6531 int r = may_lookup(cur.get(), perms);
6532 if (r < 0)
6533 return r;
6534 caps = CEPH_CAP_AUTH_SHARED;
6535 }
6536
6537 /* Get extra requested caps on the last component */
6538 if (i == (path.depth() - 1))
6539 caps |= mask;
6540 int r = _lookup(cur.get(), dname, caps, &next, perms);
6541 if (r < 0)
6542 return r;
6543 // only follow a trailing symlink if followsym; always follow
6544 // intermediate ('directory') symlinks.
6545 if (next && next->is_symlink()) {
6546 symlinks++;
6547 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6548 if (symlinks > MAXSYMLINKS) {
6549 return -ELOOP;
6550 }
6551
6552 if (i < path.depth() - 1) {
6553 // dir symlink
6554 // replace consumed components of path with symlink dir target
6555 filepath resolved(next->symlink.c_str());
6556 resolved.append(path.postfixpath(i + 1));
6557 path = resolved;
6558 i = 0;
6559 if (next->symlink[0] == '/') {
6560 cur = root;
6561 }
6562 continue;
6563 } else if (followsym) {
6564 if (next->symlink[0] == '/') {
6565 path = next->symlink.c_str();
6566 i = 0;
6567 // reset position
6568 cur = root;
6569 } else {
6570 filepath more(next->symlink.c_str());
6571 // remove the symlink component from the path before appending
6572 // the target that the symlink points to; we remain at the same
6573 // position in the path.
6574 path.pop_dentry();
6575 path.append(more);
6576 }
6577 continue;
6578 }
6579 }
6580 cur.swap(next);
6581 i++;
6582 }
6583 if (!cur)
6584 return -ENOENT;
6585 if (end)
6586 end->swap(cur);
6587 return 0;
6588 }
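
// An illustrative trace of the symlink handling above (names are made up):
// walking "a/b/c" where "b" is a symlink to "/x", "b" is not the last
// component, so the consumed prefix is replaced -- the path becomes "/x/c",
// i resets to 0, and cur resets to root because the target is absolute. A
// trailing symlink is only followed when followsym is true, and at most
// MAXSYMLINKS resolutions are allowed before -ELOOP.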
6589
6590
6591 // namespace ops
6592
6593 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6594 {
6595 std::lock_guard lock(client_lock);
6596 tout(cct) << "link" << std::endl;
6597 tout(cct) << relexisting << std::endl;
6598 tout(cct) << relpath << std::endl;
6599
6600 if (unmounting)
6601 return -ENOTCONN;
6602
6603 filepath existing(relexisting);
6604
6605 InodeRef in, dir;
6606 int r = path_walk(existing, &in, perm, true);
6607 if (r < 0)
6608 return r;
6609 if (std::string(relpath) == "/") {
6610 r = -EEXIST;
6611 return r;
6612 }
6613 filepath path(relpath);
6614 string name = path.last_dentry();
6615 path.pop_dentry();
6616
6617 r = path_walk(path, &dir, perm, true);
6618 if (r < 0)
6619 return r;
6620 if (cct->_conf->client_permissions) {
6621 if (S_ISDIR(in->mode)) {
6622 r = -EPERM;
6623 return r;
6624 }
6625 r = may_hardlink(in.get(), perm);
6626 if (r < 0)
6627 return r;
6628 r = may_create(dir.get(), perm);
6629 if (r < 0)
6630 return r;
6631 }
6632 r = _link(in.get(), dir.get(), name.c_str(), perm);
6633 return r;
6634 }
6635
6636 int Client::unlink(const char *relpath, const UserPerm& perm)
6637 {
6638 std::lock_guard lock(client_lock);
6639 tout(cct) << __func__ << std::endl;
6640 tout(cct) << relpath << std::endl;
6641
6642 if (unmounting)
6643 return -ENOTCONN;
6644
6645 if (std::string(relpath) == "/")
6646 return -EISDIR;
6647
6648 filepath path(relpath);
6649 string name = path.last_dentry();
6650 path.pop_dentry();
6651 InodeRef dir;
6652 int r = path_walk(path, &dir, perm);
6653 if (r < 0)
6654 return r;
6655 if (cct->_conf->client_permissions) {
6656 r = may_delete(dir.get(), name.c_str(), perm);
6657 if (r < 0)
6658 return r;
6659 }
6660 return _unlink(dir.get(), name.c_str(), perm);
6661 }
6662
6663 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6664 {
6665 std::lock_guard lock(client_lock);
6666 tout(cct) << __func__ << std::endl;
6667 tout(cct) << relfrom << std::endl;
6668 tout(cct) << relto << std::endl;
6669
6670 if (unmounting)
6671 return -ENOTCONN;
6672
6673 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6674 return -EBUSY;
6675
6676 filepath from(relfrom);
6677 filepath to(relto);
6678 string fromname = from.last_dentry();
6679 from.pop_dentry();
6680 string toname = to.last_dentry();
6681 to.pop_dentry();
6682
6683 InodeRef fromdir, todir;
6684 int r = path_walk(from, &fromdir, perm);
6685 if (r < 0)
6686 goto out;
6687 r = path_walk(to, &todir, perm);
6688 if (r < 0)
6689 goto out;
6690
6691 if (cct->_conf->client_permissions) {
6692 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6693 if (r < 0)
6694 return r;
6695 r = may_delete(todir.get(), toname.c_str(), perm);
6696 if (r < 0 && r != -ENOENT)
6697 return r;
6698 }
6699 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6700 out:
6701 return r;
6702 }
6703
6704 // dirs
6705
6706 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6707 {
6708 std::lock_guard lock(client_lock);
6709 tout(cct) << __func__ << std::endl;
6710 tout(cct) << relpath << std::endl;
6711 tout(cct) << mode << std::endl;
6712 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6713
6714 if (unmounting)
6715 return -ENOTCONN;
6716
6717 if (std::string(relpath) == "/")
6718 return -EEXIST;
6719
6720 filepath path(relpath);
6721 string name = path.last_dentry();
6722 path.pop_dentry();
6723 InodeRef dir;
6724 int r = path_walk(path, &dir, perm);
6725 if (r < 0)
6726 return r;
6727 if (cct->_conf->client_permissions) {
6728 r = may_create(dir.get(), perm);
6729 if (r < 0)
6730 return r;
6731 }
6732 return _mkdir(dir.get(), name.c_str(), mode, perm);
6733 }
6734
6735 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6736 {
6737 std::lock_guard lock(client_lock);
6738 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6739 tout(cct) << __func__ << std::endl;
6740 tout(cct) << relpath << std::endl;
6741 tout(cct) << mode << std::endl;
6742
6743 if (unmounting)
6744 return -ENOTCONN;
6745
6746 // get through the existing parts of the path
6747 filepath path(relpath);
6748 unsigned int i;
6749 int r = 0, caps = 0;
6750 InodeRef cur, next;
6751 cur = cwd;
6752 for (i=0; i<path.depth(); ++i) {
6753 if (cct->_conf->client_permissions) {
6754 r = may_lookup(cur.get(), perms);
6755 if (r < 0)
6756 break;
6757 caps = CEPH_CAP_AUTH_SHARED;
6758 }
6759 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6760 if (r < 0)
6761 break;
6762 cur.swap(next);
6763 }
6764 if (r != -ENOENT) return r;
6765 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6766 // make a new directory at each remaining level
6767 for (; i<path.depth(); ++i) {
6768 if (cct->_conf->client_permissions) {
6769 r = may_create(cur.get(), perms);
6770 if (r < 0)
6771 return r;
6772 }
6773 // make the new dir
6774 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6775
6776 // check proper creation/existence
6777 if(-EEXIST == r && i < path.depth() - 1) {
6778 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6779 }
6780 if (r < 0)
6781 return r;
6782 // move to the new dir and continue
6783 cur.swap(next);
6784 ldout(cct, 20) << __func__ << ": successfully created directory "
6785 << filepath(cur->ino).get_path() << dendl;
6786 }
6787 return 0;
6788 }
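
// mkdirs() behaves like `mkdir -p` relative to the cwd. A minimal sketch
// (path and mode are illustrative):
//
//   int r = client->mkdirs("a/b/c", 0755, perms);
//   // existing prefix components are traversed, missing ones are created;
//   // a racing -EEXIST on an intermediate component falls back to lookup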
6789
6790 int Client::rmdir(const char *relpath, const UserPerm& perms)
6791 {
6792 std::lock_guard lock(client_lock);
6793 tout(cct) << __func__ << std::endl;
6794 tout(cct) << relpath << std::endl;
6795
6796 if (unmounting)
6797 return -ENOTCONN;
6798
6799 if (std::string(relpath) == "/")
6800 return -EBUSY;
6801
6802 filepath path(relpath);
6803 string name = path.last_dentry();
6804 path.pop_dentry();
6805 InodeRef dir;
6806 int r = path_walk(path, &dir, perms);
6807 if (r < 0)
6808 return r;
6809 if (cct->_conf->client_permissions) {
6810 int r = may_delete(dir.get(), name.c_str(), perms);
6811 if (r < 0)
6812 return r;
6813 }
6814 return _rmdir(dir.get(), name.c_str(), perms);
6815 }
6816
6817 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6818 {
6819 std::lock_guard lock(client_lock);
6820 tout(cct) << __func__ << std::endl;
6821 tout(cct) << relpath << std::endl;
6822 tout(cct) << mode << std::endl;
6823 tout(cct) << rdev << std::endl;
6824
6825 if (unmounting)
6826 return -ENOTCONN;
6827
6828 if (std::string(relpath) == "/")
6829 return -EEXIST;
6830
6831 filepath path(relpath);
6832 string name = path.last_dentry();
6833 path.pop_dentry();
6834 InodeRef dir;
6835 int r = path_walk(path, &dir, perms);
6836 if (r < 0)
6837 return r;
6838 if (cct->_conf->client_permissions) {
6839 int r = may_create(dir.get(), perms);
6840 if (r < 0)
6841 return r;
6842 }
6843 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6844 }
6845
6846 // symlinks
6847
6848 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6849 {
6850 std::lock_guard lock(client_lock);
6851 tout(cct) << __func__ << std::endl;
6852 tout(cct) << target << std::endl;
6853 tout(cct) << relpath << std::endl;
6854
6855 if (unmounting)
6856 return -ENOTCONN;
6857
6858 if (std::string(relpath) == "/")
6859 return -EEXIST;
6860
6861 filepath path(relpath);
6862 string name = path.last_dentry();
6863 path.pop_dentry();
6864 InodeRef dir;
6865 int r = path_walk(path, &dir, perms);
6866 if (r < 0)
6867 return r;
6868 if (cct->_conf->client_permissions) {
6869 int r = may_create(dir.get(), perms);
6870 if (r < 0)
6871 return r;
6872 }
6873 return _symlink(dir.get(), name.c_str(), target, perms);
6874 }
6875
6876 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6877 {
6878 std::lock_guard lock(client_lock);
6879 tout(cct) << __func__ << std::endl;
6880 tout(cct) << relpath << std::endl;
6881
6882 if (unmounting)
6883 return -ENOTCONN;
6884
6885 filepath path(relpath);
6886 InodeRef in;
6887 int r = path_walk(path, &in, perms, false);
6888 if (r < 0)
6889 return r;
6890
6891 return _readlink(in.get(), buf, size);
6892 }
6893
6894 int Client::_readlink(Inode *in, char *buf, size_t size)
6895 {
6896 if (!in->is_symlink())
6897 return -EINVAL;
6898
6899 // copy into buf (at most size bytes)
6900 int r = in->symlink.length();
6901 if (r > (int)size)
6902 r = size;
6903 memcpy(buf, in->symlink.c_str(), r);
6904 return r;
6905 }
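
// Like readlink(2), this copies at most `size` bytes and does not
// NUL-terminate. A sketch of a caller that wants a C string (buffer name
// assumed):
//
//   char buf[PATH_MAX];
//   int n = client->readlink("link", buf, sizeof(buf) - 1, perms);
//   if (n >= 0)
//     buf[n] = '\0';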
6906
6907
6908 // inode stuff
6909
6910 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6911 {
6912 bool yes = in->caps_issued_mask(mask, true);
6913
6914 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6915 if (yes && !force)
6916 return 0;
6917
6918 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6919 filepath path;
6920 in->make_nosnap_relative_path(path);
6921 req->set_filepath(path);
6922 req->set_inode(in);
6923 req->head.args.getattr.mask = mask;
6924
6925 int res = make_request(req, perms);
6926 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6927 return res;
6928 }
6929
6930 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6931 const UserPerm& perms, InodeRef *inp)
6932 {
6933 int issued = in->caps_issued();
6934
6935 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6936 ccap_string(issued) << dendl;
6937
6938 if (in->snapid != CEPH_NOSNAP) {
6939 return -EROFS;
6940 }
6941 if ((mask & CEPH_SETATTR_SIZE) &&
6942 (unsigned long)stx->stx_size > in->size &&
6943 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6944 perms)) {
6945 return -EDQUOT;
6946 }
6947
6948 // make the change locally?
6949 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6950 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6951 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6952 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6953 << in->cap_dirtier_gid << ", forcing sync setattr"
6954 << dendl;
6955 /*
6956 * This works because we implicitly flush the caps as part of the
6957 * request, so the cap update check will happen with the writeback
6958 * cap context, and then the setattr check will happen with the
6959 * caller's context.
6960 *
6961 * In reality this pattern is likely pretty rare (different users
6962 * setattr'ing the same file). If that turns out not to be the
6963 * case later, we can build a more complex pipelined cap writeback
6964 * infrastructure...
6965 */
6966 if (!mask)
6967 mask |= CEPH_SETATTR_CTIME;
6968 goto force_request;
6969 }
6970
6971 if (!mask) {
6972 // caller just needs us to bump the ctime
6973 in->ctime = ceph_clock_now();
6974 in->cap_dirtier_uid = perms.uid();
6975 in->cap_dirtier_gid = perms.gid();
6976 if (issued & CEPH_CAP_AUTH_EXCL)
6977 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6978 else if (issued & CEPH_CAP_FILE_EXCL)
6979 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6980 else if (issued & CEPH_CAP_XATTR_EXCL)
6981 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
6982 else
6983 mask |= CEPH_SETATTR_CTIME;
6984 }
6985
6986 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6987 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6988
6989 mask &= ~CEPH_SETATTR_KILL_SGUID;
6990
6991 if (mask & CEPH_SETATTR_UID) {
6992 in->ctime = ceph_clock_now();
6993 in->cap_dirtier_uid = perms.uid();
6994 in->cap_dirtier_gid = perms.gid();
6995 in->uid = stx->stx_uid;
6996 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6997 mask &= ~CEPH_SETATTR_UID;
6998 kill_sguid = true;
6999 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7000 }
7001 if (mask & CEPH_SETATTR_GID) {
7002 in->ctime = ceph_clock_now();
7003 in->cap_dirtier_uid = perms.uid();
7004 in->cap_dirtier_gid = perms.gid();
7005 in->gid = stx->stx_gid;
7006 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7007 mask &= ~CEPH_SETATTR_GID;
7008 kill_sguid = true;
7009 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7010 }
7011
7012 if (mask & CEPH_SETATTR_MODE) {
7013 in->ctime = ceph_clock_now();
7014 in->cap_dirtier_uid = perms.uid();
7015 in->cap_dirtier_gid = perms.gid();
7016 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7017 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7018 mask &= ~CEPH_SETATTR_MODE;
7019 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7020 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7021 /* Must squash any setuid/setgid bits with an ownership change */
7022 in->mode &= ~(S_ISUID|S_ISGID);
7023 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7024 }
7025
7026 if (mask & CEPH_SETATTR_BTIME) {
7027 in->ctime = ceph_clock_now();
7028 in->cap_dirtier_uid = perms.uid();
7029 in->cap_dirtier_gid = perms.gid();
7030 in->btime = utime_t(stx->stx_btime);
7031 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7032 mask &= ~CEPH_SETATTR_BTIME;
7033 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7034 }
7035 } else if (mask & CEPH_SETATTR_SIZE) {
7036 /* If we don't have Ax, we must ask the server to clear the setuid/setgid bits on truncate */
7037 mask |= CEPH_SETATTR_KILL_SGUID;
7038 }
7039
7040 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7041 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7042 if (mask & CEPH_SETATTR_MTIME)
7043 in->mtime = utime_t(stx->stx_mtime);
7044 if (mask & CEPH_SETATTR_ATIME)
7045 in->atime = utime_t(stx->stx_atime);
7046 in->ctime = ceph_clock_now();
7047 in->cap_dirtier_uid = perms.uid();
7048 in->cap_dirtier_gid = perms.gid();
7049 in->time_warp_seq++;
7050 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7051 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7052 }
7053 }
7054 if (!mask) {
7055 in->change_attr++;
7056 return 0;
7057 }
7058
7059 force_request:
7060 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7061
7062 filepath path;
7063
7064 in->make_nosnap_relative_path(path);
7065 req->set_filepath(path);
7066 req->set_inode(in);
7067
7068 if (mask & CEPH_SETATTR_KILL_SGUID) {
7069 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7070 }
7071 if (mask & CEPH_SETATTR_MODE) {
7072 req->head.args.setattr.mode = stx->stx_mode;
7073 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7074 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7075 }
7076 if (mask & CEPH_SETATTR_UID) {
7077 req->head.args.setattr.uid = stx->stx_uid;
7078 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7079 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7080 }
7081 if (mask & CEPH_SETATTR_GID) {
7082 req->head.args.setattr.gid = stx->stx_gid;
7083 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7084 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7085 }
7086 if (mask & CEPH_SETATTR_BTIME) {
7087 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7088 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7089 }
7090 if (mask & CEPH_SETATTR_MTIME) {
7091 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7092 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7093 CEPH_CAP_FILE_WR;
7094 }
7095 if (mask & CEPH_SETATTR_ATIME) {
7096 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7097 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7098 CEPH_CAP_FILE_WR;
7099 }
7100 if (mask & CEPH_SETATTR_SIZE) {
7101 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7102 req->head.args.setattr.size = stx->stx_size;
7103 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7104 } else { // too big!
7105 put_request(req);
7106 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7107 return -EFBIG;
7108 }
7109 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7110 CEPH_CAP_FILE_WR;
7111 }
7112 req->head.args.setattr.mask = mask;
7113
7114 req->regetattr_mask = mask;
7115
7116 int res = make_request(req, perms, inp);
7117 ldout(cct, 10) << "_setattr result=" << res << dendl;
7118 return res;
7119 }
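
// Summary of the split above: with CEPH_CAP_AUTH_EXCL (or CEPH_CAP_FILE_EXCL
// for mtime/atime) the change is applied locally and the cap is marked
// dirty; whatever remains in `mask` is sent to the MDS as a synchronous
// CEPH_MDS_OP_SETATTR. Truncates always go to the server, and when Ax is
// absent they also ask it to clear setuid/setgid via CEPH_SETATTR_KILL_SGUID.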
7120
7121 /* Note that we only care about attrs that setattr cares about */
7122 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7123 {
7124 stx->stx_size = st->st_size;
7125 stx->stx_mode = st->st_mode;
7126 stx->stx_uid = st->st_uid;
7127 stx->stx_gid = st->st_gid;
7128 #ifdef __APPLE__
7129 stx->stx_mtime = st->st_mtimespec;
7130 stx->stx_atime = st->st_atimespec;
7131 #else
7132 stx->stx_mtime = st->st_mtim;
7133 stx->stx_atime = st->st_atim;
7134 #endif
7135 }
7136
7137 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7138 const UserPerm& perms, InodeRef *inp)
7139 {
7140 int ret = _do_setattr(in, stx, mask, perms, inp);
7141 if (ret < 0)
7142 return ret;
7143 if (mask & CEPH_SETATTR_MODE)
7144 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7145 return ret;
7146 }
7147
7148 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7149 const UserPerm& perms)
7150 {
7151 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7152 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7153 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7154 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7155 if (cct->_conf->client_permissions) {
7156 int r = may_setattr(in.get(), stx, mask, perms);
7157 if (r < 0)
7158 return r;
7159 }
7160 return __setattrx(in.get(), stx, mask, perms);
7161 }
7162
7163 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7164 const UserPerm& perms)
7165 {
7166 struct ceph_statx stx;
7167
7168 stat_to_statx(attr, &stx);
7169 mask &= ~CEPH_SETATTR_BTIME;
7170
7171 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7172 mask &= ~CEPH_SETATTR_UID;
7173 }
7174 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7175 mask &= ~CEPH_SETATTR_GID;
7176 }
7177
7178 return _setattrx(in, &stx, mask, perms);
7179 }
7180
7181 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7182 const UserPerm& perms)
7183 {
7184 std::lock_guard lock(client_lock);
7185 tout(cct) << __func__ << std::endl;
7186 tout(cct) << relpath << std::endl;
7187 tout(cct) << mask << std::endl;
7188
7189 if (unmounting)
7190 return -ENOTCONN;
7191
7192 filepath path(relpath);
7193 InodeRef in;
7194 int r = path_walk(path, &in, perms);
7195 if (r < 0)
7196 return r;
7197 return _setattr(in, attr, mask, perms);
7198 }
7199
7200 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7201 const UserPerm& perms, int flags)
7202 {
7203 std::lock_guard lock(client_lock);
7204 tout(cct) << __func__ << std::endl;
7205 tout(cct) << relpath << std::endl;
7206 tout(cct) << mask << std::endl;
7207
7208 if (unmounting)
7209 return -ENOTCONN;
7210
7211 filepath path(relpath);
7212 InodeRef in;
7213 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7214 if (r < 0)
7215 return r;
7216 return _setattrx(in, stx, mask, perms);
7217 }
7218
7219 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7220 {
7221 std::lock_guard lock(client_lock);
7222 tout(cct) << __func__ << std::endl;
7223 tout(cct) << fd << std::endl;
7224 tout(cct) << mask << std::endl;
7225
7226 if (unmounting)
7227 return -ENOTCONN;
7228
7229 Fh *f = get_filehandle(fd);
7230 if (!f)
7231 return -EBADF;
7232 #if defined(__linux__) && defined(O_PATH)
7233 if (f->flags & O_PATH)
7234 return -EBADF;
7235 #endif
7236 return _setattr(f->inode, attr, mask, perms);
7237 }
7238
7239 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7240 {
7241 std::lock_guard lock(client_lock);
7242 tout(cct) << __func__ << std::endl;
7243 tout(cct) << fd << std::endl;
7244 tout(cct) << mask << std::endl;
7245
7246 if (unmounting)
7247 return -ENOTCONN;
7248
7249 Fh *f = get_filehandle(fd);
7250 if (!f)
7251 return -EBADF;
7252 #if defined(__linux__) && defined(O_PATH)
7253 if (f->flags & O_PATH)
7254 return -EBADF;
7255 #endif
7256 return _setattrx(f->inode, stx, mask, perms);
7257 }
7258
7259 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7260 frag_info_t *dirstat, int mask)
7261 {
7262 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7263 std::lock_guard lock(client_lock);
7264 tout(cct) << "stat" << std::endl;
7265 tout(cct) << relpath << std::endl;
7266
7267 if (unmounting)
7268 return -ENOTCONN;
7269
7270 filepath path(relpath);
7271 InodeRef in;
7272 int r = path_walk(path, &in, perms, true, mask);
7273 if (r < 0)
7274 return r;
7275 r = _getattr(in, mask, perms);
7276 if (r < 0) {
7277 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7278 return r;
7279 }
7280 fill_stat(in, stbuf, dirstat);
7281 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7282 return r;
7283 }
7284
7285 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7286 {
7287 unsigned mask = 0;
7288
7289 /* if AT_NO_ATTR_SYNC is set, we don't need any caps -- just use what's in cache */
7290 if (flags & AT_NO_ATTR_SYNC)
7291 goto out;
7292
7293 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7294 mask |= CEPH_CAP_PIN;
7295 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7296 mask |= CEPH_CAP_AUTH_SHARED;
7297 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7298 mask |= CEPH_CAP_LINK_SHARED;
7299 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7300 mask |= CEPH_CAP_FILE_SHARED;
7301 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7302 mask |= CEPH_CAP_XATTR_SHARED;
7303 out:
7304 return mask;
7305 }
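
// A worked example of the want->caps mapping above:
// statx(..., want = CEPH_STATX_MTIME | CEPH_STATX_UID) yields
// CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_SHARED, while
// passing AT_NO_ATTR_SYNC in flags yields mask 0 (serve from cache).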
7306
7307 int Client::statx(const char *relpath, struct ceph_statx *stx,
7308 const UserPerm& perms,
7309 unsigned int want, unsigned int flags)
7310 {
7311 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7312 std::lock_guard lock(client_lock);
7313 tout(cct) << "statx" << std::endl;
7314 tout(cct) << relpath << std::endl;
7315
7316 if (unmounting)
7317 return -ENOTCONN;
7318
7319 filepath path(relpath);
7320 InodeRef in;
7321
7322 unsigned mask = statx_to_mask(flags, want);
7323
7324 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7325 if (r < 0)
7326 return r;
7327
7328 r = _getattr(in, mask, perms);
7329 if (r < 0) {
7330 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7331 return r;
7332 }
7333
7334 fill_statx(in, mask, stx);
7335 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7336 return r;
7337 }
7338
7339 int Client::lstat(const char *relpath, struct stat *stbuf,
7340 const UserPerm& perms, frag_info_t *dirstat, int mask)
7341 {
7342 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7343 std::lock_guard lock(client_lock);
7344 tout(cct) << __func__ << std::endl;
7345 tout(cct) << relpath << std::endl;
7346
7347 if (unmounting)
7348 return -ENOTCONN;
7349
7350 filepath path(relpath);
7351 InodeRef in;
7352 // don't follow symlinks
7353 int r = path_walk(path, &in, perms, false, mask);
7354 if (r < 0)
7355 return r;
7356 r = _getattr(in, mask, perms);
7357 if (r < 0) {
7358 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7359 return r;
7360 }
7361 fill_stat(in, stbuf, dirstat);
7362 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7363 return r;
7364 }
7365
7366 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7367 {
7368 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7369 << " mode 0" << oct << in->mode << dec
7370 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7371 memset(st, 0, sizeof(struct stat));
7372 if (use_faked_inos())
7373 st->st_ino = in->faked_ino;
7374 else
7375 st->st_ino = in->ino;
7376 st->st_dev = in->snapid;
7377 st->st_mode = in->mode;
7378 st->st_rdev = in->rdev;
7379 if (in->is_dir()) {
7380 switch (in->nlink) {
7381 case 0:
7382 st->st_nlink = 0; /* dir is unlinked */
7383 break;
7384 case 1:
7385 st->st_nlink = 1 /* parent dentry */
7386 + 1 /* <dir>/. */
7387 + in->dirstat.nsubdirs; /* each subdir's <subdir>/.. entry */
7388 break;
7389 default:
7390 ceph_abort();
7391 }
7392 } else {
7393 st->st_nlink = in->nlink;
7394 }
7395 st->st_uid = in->uid;
7396 st->st_gid = in->gid;
7397 if (in->ctime > in->mtime) {
7398 stat_set_ctime_sec(st, in->ctime.sec());
7399 stat_set_ctime_nsec(st, in->ctime.nsec());
7400 } else {
7401 stat_set_ctime_sec(st, in->mtime.sec());
7402 stat_set_ctime_nsec(st, in->mtime.nsec());
7403 }
7404 stat_set_atime_sec(st, in->atime.sec());
7405 stat_set_atime_nsec(st, in->atime.nsec());
7406 stat_set_mtime_sec(st, in->mtime.sec());
7407 stat_set_mtime_nsec(st, in->mtime.nsec());
7408 if (in->is_dir()) {
7409 if (cct->_conf->client_dirsize_rbytes)
7410 st->st_size = in->rstat.rbytes;
7411 else
7412 st->st_size = in->dirstat.size();
7413 st->st_blocks = 1;
7414 } else {
7415 st->st_size = in->size;
7416 st->st_blocks = (in->size + 511) >> 9;
7417 }
7418 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7419
7420 if (dirstat)
7421 *dirstat = in->dirstat;
7422 if (rstat)
7423 *rstat = in->rstat;
7424
7425 return in->caps_issued();
7426 }
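// Editorial note: st_blocks above is reported in 512-byte units, so a 1 KiB
// file yields (1024 + 511) >> 9 == 2 blocks, matching stat(2) conventions;
// directories are pinned to a single block regardless of their size.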
7427
7428 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7429 {
7430 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7431 << " mode 0" << oct << in->mode << dec
7432 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7433 memset(stx, 0, sizeof(struct ceph_statx));
7434
7435 /*
7436 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7437 * so that all bits are set.
7438 */
7439 if (!mask)
7440 mask = ~0;
7441
7442 /* These are always considered to be available */
7443 stx->stx_dev = in->snapid;
7444 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7445
7446 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7447 stx->stx_mode = S_IFMT & in->mode;
7448 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7449 stx->stx_rdev = in->rdev;
7450 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7451
7452 if (mask & CEPH_CAP_AUTH_SHARED) {
7453 stx->stx_uid = in->uid;
7454 stx->stx_gid = in->gid;
7455 stx->stx_mode = in->mode;
7456 in->btime.to_timespec(&stx->stx_btime);
7457 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7458 }
7459
7460 if (mask & CEPH_CAP_LINK_SHARED) {
7461 if (in->is_dir()) {
7462 switch (in->nlink) {
7463 case 0:
7464 stx->stx_nlink = 0; /* dir is unlinked */
7465 break;
7466 case 1:
7467 stx->stx_nlink = 1 /* parent dentry */
7468 + 1 /* <dir>/. */
7469 + in->dirstat.nsubdirs; /* each subdir adds a <subdir>/.. entry */
7470 break;
7471 default:
7472 ceph_abort();
7473 }
7474 } else {
7475 stx->stx_nlink = in->nlink;
7476 }
7477 stx->stx_mask |= CEPH_STATX_NLINK;
7478 }
7479
7480 if (mask & CEPH_CAP_FILE_SHARED) {
7481
7482 in->atime.to_timespec(&stx->stx_atime);
7483 in->mtime.to_timespec(&stx->stx_mtime);
7484
7485 if (in->is_dir()) {
7486 if (cct->_conf->client_dirsize_rbytes)
7487 stx->stx_size = in->rstat.rbytes;
7488 else
7489 stx->stx_size = in->dirstat.size();
7490 stx->stx_blocks = 1;
7491 } else {
7492 stx->stx_size = in->size;
7493 stx->stx_blocks = (in->size + 511) >> 9;
7494 }
7495 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7496 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7497 }
7498
7499 /* Change time and change_attr both require all shared caps to view */
7500 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7501 stx->stx_version = in->change_attr;
7502 if (in->ctime > in->mtime)
7503 in->ctime.to_timespec(&stx->stx_ctime);
7504 else
7505 in->mtime.to_timespec(&stx->stx_ctime);
7506 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7507 }
7508
7509 }
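// Editorial note: callers can tell which fields were actually synchronized
// by testing stx_mask on return -- e.g. CEPH_STATX_CTIME appears only when
// every shared cap was available, per the CEPH_STAT_CAP_INODE_ALL check
// above.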
7510
7511 void Client::touch_dn(Dentry *dn)
7512 {
7513 lru.lru_touch(dn);
7514 }
7515
7516 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7517 {
7518 std::lock_guard lock(client_lock);
7519 tout(cct) << __func__ << std::endl;
7520 tout(cct) << relpath << std::endl;
7521 tout(cct) << mode << std::endl;
7522
7523 if (unmounting)
7524 return -ENOTCONN;
7525
7526 filepath path(relpath);
7527 InodeRef in;
7528 int r = path_walk(path, &in, perms);
7529 if (r < 0)
7530 return r;
7531 struct stat attr;
7532 attr.st_mode = mode;
7533 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7534 }
7535
7536 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7537 {
7538 std::lock_guard lock(client_lock);
7539 tout(cct) << __func__ << std::endl;
7540 tout(cct) << fd << std::endl;
7541 tout(cct) << mode << std::endl;
7542
7543 if (unmounting)
7544 return -ENOTCONN;
7545
7546 Fh *f = get_filehandle(fd);
7547 if (!f)
7548 return -EBADF;
7549 #if defined(__linux__) && defined(O_PATH)
7550 if (f->flags & O_PATH)
7551 return -EBADF;
7552 #endif
7553 struct stat attr;
7554 attr.st_mode = mode;
7555 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7556 }
7557
7558 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7559 {
7560 std::lock_guard lock(client_lock);
7561 tout(cct) << __func__ << std::endl;
7562 tout(cct) << relpath << std::endl;
7563 tout(cct) << mode << std::endl;
7564
7565 if (unmounting)
7566 return -ENOTCONN;
7567
7568 filepath path(relpath);
7569 InodeRef in;
7570 // don't follow symlinks
7571 int r = path_walk(path, &in, perms, false);
7572 if (r < 0)
7573 return r;
7574 struct stat attr;
7575 attr.st_mode = mode;
7576 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7577 }
7578
7579 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7580 const UserPerm& perms)
7581 {
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7584 tout(cct) << relpath << std::endl;
7585 tout(cct) << new_uid << std::endl;
7586 tout(cct) << new_gid << std::endl;
7587
7588 if (unmounting)
7589 return -ENOTCONN;
7590
7591 filepath path(relpath);
7592 InodeRef in;
7593 int r = path_walk(path, &in, perms);
7594 if (r < 0)
7595 return r;
7596 struct stat attr;
7597 attr.st_uid = new_uid;
7598 attr.st_gid = new_gid;
7599 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7600 }
7601
7602 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7603 {
7604 std::lock_guard lock(client_lock);
7605 tout(cct) << __func__ << std::endl;
7606 tout(cct) << fd << std::endl;
7607 tout(cct) << new_uid << std::endl;
7608 tout(cct) << new_gid << std::endl;
7609
7610 if (unmounting)
7611 return -ENOTCONN;
7612
7613 Fh *f = get_filehandle(fd);
7614 if (!f)
7615 return -EBADF;
7616 #if defined(__linux__) && defined(O_PATH)
7617 if (f->flags & O_PATH)
7618 return -EBADF;
7619 #endif
7620 struct stat attr;
7621 attr.st_uid = new_uid;
7622 attr.st_gid = new_gid;
7623 int mask = 0;
7624 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7625 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7626 return _setattr(f->inode, &attr, mask, perms);
7627 }
7628
7629 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7630 const UserPerm& perms)
7631 {
7632 std::lock_guard lock(client_lock);
7633 tout(cct) << __func__ << std::endl;
7634 tout(cct) << relpath << std::endl;
7635 tout(cct) << new_uid << std::endl;
7636 tout(cct) << new_gid << std::endl;
7637
7638 if (unmounting)
7639 return -ENOTCONN;
7640
7641 filepath path(relpath);
7642 InodeRef in;
7643 // don't follow symlinks
7644 int r = path_walk(path, &in, perms, false);
7645 if (r < 0)
7646 return r;
7647 struct stat attr;
7648 attr.st_uid = new_uid;
7649 attr.st_gid = new_gid;
7650 int mask = 0;
7651 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7652 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7653 return _setattr(in, &attr, mask, perms);
7654 }
7655
7656 static void attr_set_atime_and_mtime(struct stat *attr,
7657 const utime_t &atime,
7658 const utime_t &mtime)
7659 {
7660 stat_set_atime_sec(attr, atime.tv.tv_sec);
7661 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7662 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7663 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7664 }
7665
7666 // for [l]utime(), invoke the timeval variant, as the timespec
7667 // variants are not yet implemented. for futime[s](), invoke
7668 // the timespec variant.
7669 int Client::utime(const char *relpath, struct utimbuf *buf,
7670 const UserPerm& perms)
7671 {
7672 struct timeval tv[2];
7673 tv[0].tv_sec = buf->actime;
7674 tv[0].tv_usec = 0;
7675 tv[1].tv_sec = buf->modtime;
7676 tv[1].tv_usec = 0;
7677
7678 return utimes(relpath, tv, perms);
7679 }
7680
7681 int Client::lutime(const char *relpath, struct utimbuf *buf,
7682 const UserPerm& perms)
7683 {
7684 struct timeval tv[2];
7685 tv[0].tv_sec = buf->actime;
7686 tv[0].tv_usec = 0;
7687 tv[1].tv_sec = buf->modtime;
7688 tv[1].tv_usec = 0;
7689
7690 return lutimes(relpath, tv, perms);
7691 }
7692
7693 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7694 {
7695 struct timespec ts[2];
7696 ts[0].tv_sec = buf->actime;
7697 ts[0].tv_nsec = 0;
7698 ts[1].tv_sec = buf->modtime;
7699 ts[1].tv_nsec = 0;
7700
7701 return futimens(fd, ts, perms);
7702 }
7703
7704 int Client::utimes(const char *relpath, struct timeval times[2],
7705 const UserPerm& perms)
7706 {
7707 std::lock_guard lock(client_lock);
7708 tout(cct) << __func__ << std::endl;
7709 tout(cct) << relpath << std::endl;
7710 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7711 << std::endl;
7712 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7713 << std::endl;
7714
7715 if (unmounting)
7716 return -ENOTCONN;
7717
7718 filepath path(relpath);
7719 InodeRef in;
7720 int r = path_walk(path, &in, perms);
7721 if (r < 0)
7722 return r;
7723 struct stat attr;
7724 utime_t atime(times[0]);
7725 utime_t mtime(times[1]);
7726
7727 attr_set_atime_and_mtime(&attr, atime, mtime);
7728 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7729 }
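// Illustrative caller sketch (editorial; the helper and path are
// hypothetical, the entry point is the one defined above): set both
// timestamps to the epoch via the timeval-based variant.
#if 0
static int touch_epoch_example(Client *client, const UserPerm& perms)
{
  struct timeval tv[2] = {{0, 0}, {0, 0}};      // [0]=atime, [1]=mtime
  return client->utimes("/some/file", tv, perms);
}
#endif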
7730
7731 int Client::lutimes(const char *relpath, struct timeval times[2],
7732 const UserPerm& perms)
7733 {
7734 std::lock_guard lock(client_lock);
7735 tout(cct) << __func__ << std::endl;
7736 tout(cct) << relpath << std::endl;
7737 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7738 << std::endl;
7739 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7740 << std::endl;
7741
7742 if (unmounting)
7743 return -ENOTCONN;
7744
7745 filepath path(relpath);
7746 InodeRef in;
7747 int r = path_walk(path, &in, perms, false);
7748 if (r < 0)
7749 return r;
7750 struct stat attr;
7751 utime_t atime(times[0]);
7752 utime_t mtime(times[1]);
7753
7754 attr_set_atime_and_mtime(&attr, atime, mtime);
7755 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7756 }
7757
7758 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7759 {
7760 struct timespec ts[2];
7761 ts[0].tv_sec = times[0].tv_sec;
7762 ts[0].tv_nsec = times[0].tv_usec * 1000;
7763 ts[1].tv_sec = times[1].tv_sec;
7764 ts[1].tv_nsec = times[1].tv_usec * 1000;
7765
7766 return futimens(fd, ts, perms);
7767 }
7768
7769 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7770 {
7771 std::lock_guard lock(client_lock);
7772 tout(cct) << __func__ << std::endl;
7773 tout(cct) << fd << std::endl;
7774 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7775 << std::endl;
7776 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7777 << std::endl;
7778
7779 if (unmounting)
7780 return -ENOTCONN;
7781
7782 Fh *f = get_filehandle(fd);
7783 if (!f)
7784 return -EBADF;
7785 #if defined(__linux__) && defined(O_PATH)
7786 if (f->flags & O_PATH)
7787 return -EBADF;
7788 #endif
7789 struct stat attr;
7790 utime_t atime(times[0]);
7791 utime_t mtime(times[1]);
7792
7793 attr_set_atime_and_mtime(&attr, atime, mtime);
7794 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7795 }
7796
7797 int Client::flock(int fd, int operation, uint64_t owner)
7798 {
7799 std::lock_guard lock(client_lock);
7800 tout(cct) << __func__ << std::endl;
7801 tout(cct) << fd << std::endl;
7802 tout(cct) << operation << std::endl;
7803 tout(cct) << owner << std::endl;
7804
7805 if (unmounting)
7806 return -ENOTCONN;
7807
7808 Fh *f = get_filehandle(fd);
7809 if (!f)
7810 return -EBADF;
7811
7812 return _flock(f, operation, owner);
7813 }
7814
7815 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7816 {
7817 std::lock_guard lock(client_lock);
7818 tout(cct) << __func__ << std::endl;
7819 tout(cct) << relpath << std::endl;
7820
7821 if (unmounting)
7822 return -ENOTCONN;
7823
7824 filepath path(relpath);
7825 InodeRef in;
7826 int r = path_walk(path, &in, perms, true);
7827 if (r < 0)
7828 return r;
7829 if (cct->_conf->client_permissions) {
7830 int r = may_open(in.get(), O_RDONLY, perms);
7831 if (r < 0)
7832 return r;
7833 }
7834 r = _opendir(in.get(), dirpp, perms);
7835 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
7836 if (r != -ENOTDIR)
7837 tout(cct) << (unsigned long)*dirpp << std::endl;
7838 return r;
7839 }
7840
7841 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7842 {
7843 if (!in->is_dir())
7844 return -ENOTDIR;
7845 *dirpp = new dir_result_t(in, perms);
7846 opened_dirs.insert(*dirpp);
7847 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7848 return 0;
7849 }
7850
7851
7852 int Client::closedir(dir_result_t *dir)
7853 {
7854 std::lock_guard lock(client_lock);
7855 tout(cct) << __func__ << std::endl;
7856 tout(cct) << (unsigned long)dir << std::endl;
7857
7858 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7859 _closedir(dir);
7860 return 0;
7861 }
7862
7863 void Client::_closedir(dir_result_t *dirp)
7864 {
7865 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
7866 if (dirp->inode) {
7867 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7868 dirp->inode.reset();
7869 }
7870 _readdir_drop_dirp_buffer(dirp);
7871 opened_dirs.erase(dirp);
7872 delete dirp;
7873 }
7874
7875 void Client::rewinddir(dir_result_t *dirp)
7876 {
7877 std::lock_guard lock(client_lock);
7878 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
7879
7880 if (unmounting)
7881 return;
7882
7883 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7884 _readdir_drop_dirp_buffer(d);
7885 d->reset();
7886 }
7887
7888 loff_t Client::telldir(dir_result_t *dirp)
7889 {
7890 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7891 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7892 return d->offset;
7893 }
7894
7895 void Client::seekdir(dir_result_t *dirp, loff_t offset)
7896 {
7897 std::lock_guard lock(client_lock);
7898
7899 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7900
7901 if (unmounting)
7902 return;
7903
7904 if (offset == dirp->offset)
7905 return;
7906
7907 if (offset > dirp->offset)
7908 dirp->release_count = 0; // bump if we do a forward seek
7909 else
7910 dirp->ordered_count = 0; // disable filling readdir cache
7911
7912 if (dirp->hash_order()) {
7913 if (dirp->offset > offset) {
7914 _readdir_drop_dirp_buffer(dirp);
7915 dirp->reset();
7916 }
7917 } else {
7918 if (offset == 0 ||
7919 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7920 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7921 _readdir_drop_dirp_buffer(dirp);
7922 dirp->reset();
7923 }
7924 }
7925
7926 dirp->offset = offset;
7927 }
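// Editorial note on the fpos encoding used above: a directory offset packs
// the fragment (or hash) into the high bits and a per-frag position into
// the low bits; dir_result_t::fpos_high()/fpos_low() undo what make_fpos()
// builds. Positions 0 and 1 are reserved for "." and "..", which is why
// fresh frags start at position 2 elsewhere in this file.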
7928
7929
7930 //struct dirent {
7931 // ino_t d_ino; /* inode number */
7932 // off_t d_off; /* offset to the next dirent */
7933 // unsigned short d_reclen; /* length of this record */
7934 // unsigned char d_type; /* type of file */
7935 // char d_name[256]; /* filename */
7936 //};
7937 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7938 {
7939 strncpy(de->d_name, name, 255);
7940 de->d_name[255] = '\0';
7941 #ifndef __CYGWIN__
7942 de->d_ino = ino;
7943 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7944 de->d_off = next_off;
7945 #endif
7946 de->d_reclen = 1;
7947 de->d_type = IFTODT(type);
7948 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7949 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7950 #endif
7951 }
7952
7953 void Client::_readdir_next_frag(dir_result_t *dirp)
7954 {
7955 frag_t fg = dirp->buffer_frag;
7956
7957 if (fg.is_rightmost()) {
7958 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7959 dirp->set_end();
7960 return;
7961 }
7962
7963 // advance
7964 fg = fg.next();
7965 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7966
7967 if (dirp->hash_order()) {
7968 // keep last_name
7969 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7970 if (dirp->offset < new_offset) // don't decrease offset
7971 dirp->offset = new_offset;
7972 } else {
7973 dirp->last_name.clear();
7974 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7975 _readdir_rechoose_frag(dirp);
7976 }
7977 }
7978
7979 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7980 {
7981 ceph_assert(dirp->inode);
7982
7983 if (dirp->hash_order())
7984 return;
7985
7986 frag_t cur = frag_t(dirp->offset_high());
7987 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7988 if (fg != cur) {
7989 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7990 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7991 dirp->last_name.clear();
7992 dirp->next_offset = 2;
7993 }
7994 }
7995
7996 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7997 {
7998 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7999 dirp->buffer.clear();
8000 }
8001
8002 int Client::_readdir_get_frag(dir_result_t *dirp)
8003 {
8004 ceph_assert(dirp);
8005 ceph_assert(dirp->inode);
8006
8007 // get the current frag.
8008 frag_t fg;
8009 if (dirp->hash_order())
8010 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8011 else
8012 fg = frag_t(dirp->offset_high());
8013
8014 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8015 << " offset " << hex << dirp->offset << dec << dendl;
8016
8017 int op = CEPH_MDS_OP_READDIR;
8018 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8019 op = CEPH_MDS_OP_LSSNAP;
8020
8021 InodeRef& diri = dirp->inode;
8022
8023 MetaRequest *req = new MetaRequest(op);
8024 filepath path;
8025 diri->make_nosnap_relative_path(path);
8026 req->set_filepath(path);
8027 req->set_inode(diri.get());
8028 req->head.args.readdir.frag = fg;
8029 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8030 if (dirp->last_name.length()) {
8031 req->path2.set_path(dirp->last_name);
8032 } else if (dirp->hash_order()) {
8033 req->head.args.readdir.offset_hash = dirp->offset_high();
8034 }
8035 req->dirp = dirp;
8036
8037 bufferlist dirbl;
8038 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8039
8040 if (res == -EAGAIN) {
8041 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8042 _readdir_rechoose_frag(dirp);
8043 return _readdir_get_frag(dirp);
8044 }
8045
8046 if (res == 0) {
8047 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8048 << " size " << dirp->buffer.size() << dendl;
8049 } else {
8050 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8051 dirp->set_end();
8052 }
8053
8054 return res;
8055 }
8056
8057 struct dentry_off_lt {
8058 bool operator()(const Dentry* dn, int64_t off) const {
8059 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8060 }
8061 };
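// Editorial note: this comparator lets std::lower_bound binary-search the
// per-directory readdir_cache (kept ordered by dentry offset) below, using
// dir_result_t::fpos_cmp() so offsets compare in fpos order rather than as
// raw integers.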
8062
8063 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8064 int caps, bool getref)
8065 {
8066 ceph_assert(ceph_mutex_is_locked(client_lock));
8067 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8068 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8069 << dendl;
8070 Dir *dir = dirp->inode->dir;
8071
8072 if (!dir) {
8073 ldout(cct, 10) << " dir is empty" << dendl;
8074 dirp->set_end();
8075 return 0;
8076 }
8077
8078 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8079 dir->readdir_cache.end(),
8080 dirp->offset, dentry_off_lt());
8081
8082 string dn_name;
8083 while (true) {
8084 if (!dirp->inode->is_complete_and_ordered())
8085 return -EAGAIN;
8086 if (pd == dir->readdir_cache.end())
8087 break;
8088 Dentry *dn = *pd;
8089 if (dn->inode == NULL) {
8090 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8091 ++pd;
8092 continue;
8093 }
8094 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8095 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8096 ++pd;
8097 continue;
8098 }
8099
8100 int idx = pd - dir->readdir_cache.begin();
8101 int r = _getattr(dn->inode, caps, dirp->perms);
8102 if (r < 0)
8103 return r;
8104
8105 // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8106 pd = dir->readdir_cache.begin() + idx;
8107 if (pd >= dir->readdir_cache.end() || *pd != dn)
8108 return -EAGAIN;
8109
8110 struct ceph_statx stx;
8111 struct dirent de;
8112 fill_statx(dn->inode, caps, &stx);
8113
8114 uint64_t next_off = dn->offset + 1;
8115 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8116 ++pd;
8117 if (pd == dir->readdir_cache.end())
8118 next_off = dir_result_t::END;
8119
8120 Inode *in = NULL;
8121 if (getref) {
8122 in = dn->inode.get();
8123 _ll_get(in);
8124 }
8125
8126 dn_name = dn->name; // fill in name while we have lock
8127
8128 client_lock.unlock();
8129 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8130 client_lock.lock();
8131 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8132 << " = " << r << dendl;
8133 if (r < 0) {
8134 return r;
8135 }
8136
8137 dirp->offset = next_off;
8138 if (dirp->at_end())
8139 dirp->next_offset = 2;
8140 else
8141 dirp->next_offset = dirp->offset_low();
8142 dirp->last_name = dn_name; // we successfully returned this one; update!
8143 dirp->release_count = 0; // last_name no longer match cache index
8144 if (r > 0)
8145 return r;
8146 }
8147
8148 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8149 dirp->set_end();
8150 return 0;
8151 }
8152
8153 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8154 unsigned want, unsigned flags, bool getref)
8155 {
8156 int caps = statx_to_mask(flags, want);
8157
8158 std::lock_guard lock(client_lock);
8159
8160 if (unmounting)
8161 return -ENOTCONN;
8162
8163 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8164
8165 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8166 << dec << " at_end=" << dirp->at_end()
8167 << " hash_order=" << dirp->hash_order() << dendl;
8168
8169 struct dirent de;
8170 struct ceph_statx stx;
8171 memset(&de, 0, sizeof(de));
8172 memset(&stx, 0, sizeof(stx));
8173
8174 InodeRef& diri = dirp->inode;
8175
8176 if (dirp->at_end())
8177 return 0;
8178
8179 if (dirp->offset == 0) {
8180 ldout(cct, 15) << " including ." << dendl;
8181 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8182 uint64_t next_off = 1;
8183
8184 int r;
8185 r = _getattr(diri, caps, dirp->perms);
8186 if (r < 0)
8187 return r;
8188
8189 fill_statx(diri, caps, &stx);
8190 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8191
8192 Inode *inode = NULL;
8193 if (getref) {
8194 inode = diri.get();
8195 _ll_get(inode);
8196 }
8197
8198 client_lock.unlock();
8199 r = cb(p, &de, &stx, next_off, inode);
8200 client_lock.lock();
8201 if (r < 0)
8202 return r;
8203
8204 dirp->offset = next_off;
8205 if (r > 0)
8206 return r;
8207 }
8208 if (dirp->offset == 1) {
8209 ldout(cct, 15) << " including .." << dendl;
8210 uint64_t next_off = 2;
8211 InodeRef in;
8212 if (diri->dentries.empty())
8213 in = diri;
8214 else
8215 in = diri->get_first_parent()->dir->parent_inode;
8216
8217 int r;
8218 r = _getattr(in, caps, dirp->perms);
8219 if (r < 0)
8220 return r;
8221
8222 fill_statx(in, caps, &stx);
8223 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8224
8225 Inode *inode = NULL;
8226 if (getref) {
8227 inode = in.get();
8228 _ll_get(inode);
8229 }
8230
8231 client_lock.unlock();
8232 r = cb(p, &de, &stx, next_off, inode);
8233 client_lock.lock();
8234 if (r < 0)
8235 return r;
8236
8237 dirp->offset = next_off;
8238 if (r > 0)
8239 return r;
8240 }
8241
8242 // can we read from our cache?
8243 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8244 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8245 << dirp->inode->is_complete_and_ordered()
8246 << " issued " << ccap_string(dirp->inode->caps_issued())
8247 << dendl;
8248 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8249 dirp->inode->is_complete_and_ordered() &&
8250 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8251 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8252 if (err != -EAGAIN)
8253 return err;
8254 }
8255
8256 while (1) {
8257 if (dirp->at_end())
8258 return 0;
8259
8260 bool check_caps = true;
8261 if (!dirp->is_cached()) {
8262 int r = _readdir_get_frag(dirp);
8263 if (r)
8264 return r;
8265 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8266 // different from the requested one (our dirfragtree was outdated).
8267 check_caps = false;
8268 }
8269 frag_t fg = dirp->buffer_frag;
8270
8271 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8272 << " offset " << hex << dirp->offset << dendl;
8273
8274 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8275 dirp->offset, dir_result_t::dentry_off_lt());
8276 it != dirp->buffer.end();
8277 ++it) {
8278 dir_result_t::dentry &entry = *it;
8279
8280 uint64_t next_off = entry.offset + 1;
8281
8282 int r;
8283 if (check_caps) {
8284 r = _getattr(entry.inode, caps, dirp->perms);
8285 if (r < 0)
8286 return r;
8287 }
8288
8289 fill_statx(entry.inode, caps, &stx);
8290 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8291
8292 Inode *inode = NULL;
8293 if (getref) {
8294 inode = entry.inode.get();
8295 _ll_get(inode);
8296 }
8297
8298 client_lock.unlock();
8299 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8300 client_lock.lock();
8301
8302 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8303 << " = " << r << dendl;
8304 if (r < 0)
8305 return r;
8306
8307 dirp->offset = next_off;
8308 if (r > 0)
8309 return r;
8310 }
8311
8312 if (dirp->next_offset > 2) {
8313 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8314 _readdir_drop_dirp_buffer(dirp);
8315 continue; // more!
8316 }
8317
8318 if (!fg.is_rightmost()) {
8319 // next frag!
8320 _readdir_next_frag(dirp);
8321 continue;
8322 }
8323
8324 if (diri->shared_gen == dirp->start_shared_gen &&
8325 diri->dir_release_count == dirp->release_count) {
8326 if (diri->dir_ordered_count == dirp->ordered_count) {
8327 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8328 if (diri->dir) {
8329 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8330 diri->dir->readdir_cache.resize(dirp->cache_index);
8331 }
8332 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8333 } else {
8334 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8335 diri->flags |= I_COMPLETE;
8336 }
8337 }
8338
8339 dirp->set_end();
8340 return 0;
8341 }
8342 ceph_abort();
8343 return 0;
8344 }
8345
8346
8347 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8348 {
8349 return readdirplus_r(d, de, 0, 0, 0, NULL);
8350 }
8351
8352 /*
8353 * readdirplus_r
8354 *
8355 * returns
8356 * 1 if we got a dirent
8357 * 0 for end of directory
8358 * <0 on error
8359 */
8360
8361 struct single_readdir {
8362 struct dirent *de;
8363 struct ceph_statx *stx;
8364 Inode *inode;
8365 bool full;
8366 };
8367
8368 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8369 struct ceph_statx *stx, off_t off,
8370 Inode *in)
8371 {
8372 single_readdir *c = static_cast<single_readdir *>(p);
8373
8374 if (c->full)
8375 return -1; // already filled this dirent
8376
8377 *c->de = *de;
8378 if (c->stx)
8379 *c->stx = *stx;
8380 c->inode = in;
8381 c->full = true;
8382 return 1;
8383 }
8384
8385 struct dirent *Client::readdir(dir_result_t *d)
8386 {
8387 int ret;
8388 static struct dirent de;
8389 single_readdir sr;
8390 sr.de = &de;
8391 sr.stx = NULL;
8392 sr.inode = NULL;
8393 sr.full = false;
8394
8395 // our callback fills the dirent and sets sr.full=true on first
8396 // call, and returns -1 the second time around.
8397 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8398 if (ret < -1) {
8399 errno = -ret; // this sucks.
8400 return (dirent *) NULL;
8401 }
8402 if (sr.full) {
8403 return &de;
8404 }
8405 return (dirent *) NULL;
8406 }
8407
8408 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8409 struct ceph_statx *stx, unsigned want,
8410 unsigned flags, Inode **out)
8411 {
8412 single_readdir sr;
8413 sr.de = de;
8414 sr.stx = stx;
8415 sr.inode = NULL;
8416 sr.full = false;
8417
8418 // our callback fills the dirent and sets sr.full=true on first
8419 // call, and returns -1 the second time around.
8420 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8421 if (r < -1)
8422 return r;
8423 if (out)
8424 *out = sr.inode;
8425 if (sr.full)
8426 return 1;
8427 return 0;
8428 }
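// Hedged usage sketch (editorial): draining a directory with the plus
// variant above. The entry points are the ones defined in this file; the
// path is hypothetical.
#if 0
static int list_dir_example(Client *client, const UserPerm& perms)
{
  dir_result_t *dirp;
  int r = client->opendir("/some/dir", &dirp, perms);
  if (r < 0)
    return r;
  struct dirent de;
  struct ceph_statx stx;
  while ((r = client->readdirplus_r(dirp, &de, &stx,
                                    CEPH_STATX_INO, 0, NULL)) == 1) {
    // de.d_name is valid; stx fields are gated by stx.stx_mask
  }
  client->closedir(dirp);
  return r;     // 0 at end of directory, <0 on error
}
#endif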
8429
8430
8431 /* getdents */
8432 struct getdents_result {
8433 char *buf;
8434 int buflen;
8435 int pos;
8436 bool fullent;
8437 };
8438
8439 static int _readdir_getdent_cb(void *p, struct dirent *de,
8440 struct ceph_statx *stx, off_t off, Inode *in)
8441 {
8442 struct getdents_result *c = static_cast<getdents_result *>(p);
8443
8444 int dlen;
8445 if (c->fullent)
8446 dlen = sizeof(*de);
8447 else
8448 dlen = strlen(de->d_name) + 1;
8449
8450 if (c->pos + dlen > c->buflen)
8451 return -1; // doesn't fit
8452
8453 if (c->fullent) {
8454 memcpy(c->buf + c->pos, de, sizeof(*de));
8455 } else {
8456 memcpy(c->buf + c->pos, de->d_name, dlen);
8457 }
8458 c->pos += dlen;
8459 return 0;
8460 }
8461
8462 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8463 {
8464 getdents_result gr;
8465 gr.buf = buf;
8466 gr.buflen = buflen;
8467 gr.fullent = fullent;
8468 gr.pos = 0;
8469
8470 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8471
8472 if (r < 0) { // some error
8473 if (r == -1) { // buffer ran out of space
8474 if (gr.pos) { // but we got some entries already!
8475 return gr.pos;
8476 } // or we need a larger buffer
8477 return -ERANGE;
8478 } else { // actual error, return it
8479 return r;
8480 }
8481 }
8482 return gr.pos;
8483 }
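// Editorial note on the error mapping above: -ERANGE means the very first
// entry already exceeded the buffer; once at least one entry fits, a full
// buffer simply ends the batch and the bytes consumed so far are returned.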
8484
8485
8486 /* getdir */
8487 struct getdir_result {
8488 list<string> *contents;
8489 int num;
8490 };
8491
8492 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8493 {
8494 getdir_result *r = static_cast<getdir_result *>(p);
8495
8496 r->contents->push_back(de->d_name);
8497 r->num++;
8498 return 0;
8499 }
8500
8501 int Client::getdir(const char *relpath, list<string>& contents,
8502 const UserPerm& perms)
8503 {
8504 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8505 {
8506 std::lock_guard lock(client_lock);
8507 tout(cct) << "getdir" << std::endl;
8508 tout(cct) << relpath << std::endl;
8509 }
8510
8511 dir_result_t *d;
8512 int r = opendir(relpath, &d, perms);
8513 if (r < 0)
8514 return r;
8515
8516 getdir_result gr;
8517 gr.contents = &contents;
8518 gr.num = 0;
8519 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8520
8521 closedir(d);
8522
8523 if (r < 0)
8524 return r;
8525 return gr.num;
8526 }
8527
8528
8529 /****** file i/o **********/
8530 int Client::open(const char *relpath, int flags, const UserPerm& perms,
8531 mode_t mode, int stripe_unit, int stripe_count,
8532 int object_size, const char *data_pool)
8533 {
8534 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
8535 std::lock_guard lock(client_lock);
8536 tout(cct) << "open" << std::endl;
8537 tout(cct) << relpath << std::endl;
8538 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8539
8540 if (unmounting)
8541 return -ENOTCONN;
8542
8543 Fh *fh = NULL;
8544
8545 #if defined(__linux__) && defined(O_PATH)
8546 /* When O_PATH is specified, flags other than O_DIRECTORY
8547 * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
8548 * function in the kernel (fs/open.c). */
8549 if (flags & O_PATH)
8550 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8551 #endif
8552
8553 filepath path(relpath);
8554 InodeRef in;
8555 bool created = false;
8556 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
8557 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8558 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8559
8560 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8561 return -EEXIST;
8562
8563 #if defined(__linux__) && defined(O_PATH)
8564 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8565 #else
8566 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8567 #endif
8568 return -ELOOP;
8569
8570 if (r == -ENOENT && (flags & O_CREAT)) {
8571 filepath dirpath = path;
8572 string dname = dirpath.last_dentry();
8573 dirpath.pop_dentry();
8574 InodeRef dir;
8575 r = path_walk(dirpath, &dir, perms, true,
8576 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8577 if (r < 0)
8578 goto out;
8579 if (cct->_conf->client_permissions) {
8580 r = may_create(dir.get(), perms);
8581 if (r < 0)
8582 goto out;
8583 }
8584 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8585 stripe_count, object_size, data_pool, &created, perms);
8586 }
8587 if (r < 0)
8588 goto out;
8589
8590 if (!created) {
8591 // posix says we can only check permissions of existing files
8592 if (cct->_conf->client_permissions) {
8593 r = may_open(in.get(), flags, perms);
8594 if (r < 0)
8595 goto out;
8596 }
8597 }
8598
8599 if (!fh)
8600 r = _open(in.get(), flags, mode, &fh, perms);
8601 if (r >= 0) {
8602 // allocate an integer file descriptor
8603 ceph_assert(fh);
8604 r = get_fd();
8605 ceph_assert(fd_map.count(r) == 0);
8606 fd_map[r] = fh;
8607 }
8608
8609 out:
8610 tout(cct) << r << std::endl;
8611 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8612 return r;
8613 }
8614
8615 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8616 {
8617 /* Use default file striping parameters */
8618 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8619 }
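// Minimal open/read/close round trip (editorial sketch; the path and buffer
// size are hypothetical, the entry points are defined in this file).
#if 0
static int read_file_example(Client *client, const UserPerm& perms)
{
  int fd = client->open("/some/file", O_RDONLY, perms, 0);
  if (fd < 0)
    return fd;
  char buf[4096];
  int r = client->read(fd, buf, sizeof(buf), 0);        // read from offset 0
  int r2 = client->close(fd);
  return r < 0 ? r : r2;
}
#endif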
8620
8621 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8622 const UserPerm& perms)
8623 {
8624 std::lock_guard lock(client_lock);
8625 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8626
8627 if (unmounting)
8628 return -ENOTCONN;
8629
8630 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8631 filepath path(ino);
8632 req->set_filepath(path);
8633
8634 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8635 char f[30];
8636 sprintf(f, "%u", h);
8637 filepath path2(dirino);
8638 path2.push_dentry(string(f));
8639 req->set_filepath2(path2);
8640
8641 int r = make_request(req, perms, NULL, NULL,
8642 rand() % mdsmap->get_num_in_mds());
8643 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8644 return r;
8645 }
8646
8647
8648 /**
8649 * Load inode into local cache.
8650 *
8651 * If the inode pointer is non-NULL, also take a reference on
8652 * the resulting Inode object in one operation, so that the caller
8653 * can safely assume the inode will still be there after return.
8654 */
8655 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8656 {
8657 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8658
8659 if (unmounting)
8660 return -ENOTCONN;
8661
8662 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8663 filepath path(ino);
8664 req->set_filepath(path);
8665
8666 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8667 if (r == 0 && inode != NULL) {
8668 vinodeno_t vino(ino, CEPH_NOSNAP);
8669 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8670 ceph_assert(p != inode_map.end());
8671 *inode = p->second;
8672 _ll_get(*inode);
8673 }
8674 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8675 return r;
8676 }
8677
8678 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8679 {
8680 std::lock_guard lock(client_lock);
8681 return _lookup_ino(ino, perms, inode);
8682 }
8683
8684 /**
8685 * Find the parent inode of `ino` and insert it into
8686 * our cache. Conditionally also set `parent` to a referenced
8687 * Inode* if the caller provides a non-NULL value.
8688 */
8689 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8690 {
8691 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
8692
8693 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8694 filepath path(ino->ino);
8695 req->set_filepath(path);
8696
8697 InodeRef target;
8698 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8699 // Give caller a reference to the parent ino if they provided a pointer.
8700 if (parent != NULL) {
8701 if (r == 0) {
8702 *parent = target.get();
8703 _ll_get(*parent);
8704 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
8705 } else {
8706 *parent = NULL;
8707 }
8708 }
8709 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8710 return r;
8711 }
8712
8713 /**
8714 * Populate the parent dentry for `ino`, provided it is
8715 * a child of `parent`.
8716 */
8717 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8718 {
8719 ceph_assert(parent->is_dir());
8720 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
8721
8722 if (unmounting)
8723 return -ENOTCONN;
8724
8725 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8726 req->set_filepath2(filepath(parent->ino));
8727 req->set_filepath(filepath(ino->ino));
8728 req->set_inode(ino);
8729
8730 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8731 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8732 return r;
8733 }
8734
8735 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8736 {
8737 std::lock_guard lock(client_lock);
8738 return _lookup_name(ino, parent, perms);
8739 }
8740
8741 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8742 {
8743 ceph_assert(in);
8744 Fh *f = new Fh(in, flags, cmode, perms);
8745
8746 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8747
8748 if (in->snapid != CEPH_NOSNAP) {
8749 in->snap_cap_refs++;
8750 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8751 << ccap_string(in->caps_issued()) << dendl;
8752 }
8753
8754 const auto& conf = cct->_conf;
8755 f->readahead.set_trigger_requests(1);
8756 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8757 uint64_t max_readahead = Readahead::NO_LIMIT;
8758 if (conf->client_readahead_max_bytes) {
8759 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8760 }
8761 if (conf->client_readahead_max_periods) {
8762 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8763 }
8764 f->readahead.set_max_readahead_size(max_readahead);
8765 vector<uint64_t> alignments;
8766 alignments.push_back(in->layout.get_period());
8767 alignments.push_back(in->layout.stripe_unit);
8768 f->readahead.set_alignments(alignments);
8769
8770 return f;
8771 }
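// Editorial worked example of the readahead clamp above: with
// client_readahead_max_bytes=0 and client_readahead_max_periods=4, a file
// whose layout period is 4 MiB gets max_readahead = min(NO_LIMIT, 4 * 4 MiB)
// = 16 MiB, with readahead extents aligned to the period and stripe unit.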
8772
8773 int Client::_release_fh(Fh *f)
8774 {
8775 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8776 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8777 Inode *in = f->inode.get();
8778 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
8779
8780 in->unset_deleg(f);
8781
8782 if (in->snapid == CEPH_NOSNAP) {
8783 if (in->put_open_ref(f->mode)) {
8784 _flush(in, new C_Client_FlushComplete(this, in));
8785 check_caps(in, 0);
8786 }
8787 } else {
8788 ceph_assert(in->snap_cap_refs > 0);
8789 in->snap_cap_refs--;
8790 }
8791
8792 _release_filelocks(f);
8793
8794 // Finally, read any async err (i.e. from flushes)
8795 int err = f->take_async_err();
8796 if (err != 0) {
8797 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
8798 << cpp_strerror(err) << dendl;
8799 } else {
8800 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
8801 }
8802
8803 _put_fh(f);
8804
8805 return err;
8806 }
8807
8808 void Client::_put_fh(Fh *f)
8809 {
8810 int left = f->put();
8811 if (!left) {
8812 delete f;
8813 }
8814 }
8815
8816 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8817 const UserPerm& perms)
8818 {
8819 if (in->snapid != CEPH_NOSNAP &&
8820 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8821 return -EROFS;
8822 }
8823
8824 // use normalized flags to generate cmode
8825 int cflags = ceph_flags_sys2wire(flags);
8826 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8827 cflags |= CEPH_O_LAZY;
8828
8829 int cmode = ceph_flags_to_mode(cflags);
8830 int want = ceph_caps_for_mode(cmode);
8831 int result = 0;
8832
8833 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
8834
8835 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
8836 // update wanted?
8837 check_caps(in, CHECK_CAPS_NODELAY);
8838 } else {
8839
8840 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8841 filepath path;
8842 in->make_nosnap_relative_path(path);
8843 req->set_filepath(path);
8844 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
8845 req->head.args.open.mode = mode;
8846 req->head.args.open.pool = -1;
8847 if (cct->_conf->client_debug_getattr_caps)
8848 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8849 else
8850 req->head.args.open.mask = 0;
8851 req->head.args.open.old_size = in->size; // for O_TRUNC
8852 req->set_inode(in);
8853 result = make_request(req, perms);
8854
8855 /*
8856 * NFS expects that delegations will be broken on a conflicting open,
8857 * not just when there is actual conflicting access to the file. SMB leases
8858 * and oplocks also have similar semantics.
8859 *
8860 * Ensure that clients that have delegations enabled will wait on minimal
8861 * caps during open, just to ensure that other clients holding delegations
8862 * return theirs first.
8863 */
8864 if (deleg_timeout && result == 0) {
8865 int need = 0, have;
8866
8867 if (cmode & CEPH_FILE_MODE_WR)
8868 need |= CEPH_CAP_FILE_WR;
8869 if (cmode & CEPH_FILE_MODE_RD)
8870 need |= CEPH_CAP_FILE_RD;
8871
8872 result = get_caps(in, need, want, &have, -1);
8873 if (result < 0) {
8874 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
8875 " . Denying open: " <<
8876 cpp_strerror(result) << dendl;
8877 in->put_open_ref(cmode);
8878 } else {
8879 put_cap_ref(in, need);
8880 }
8881 }
8882 }
8883
8884 // success?
8885 if (result >= 0) {
8886 if (fhp)
8887 *fhp = _create_fh(in, flags, cmode, perms);
8888 } else {
8889 in->put_open_ref(cmode);
8890 }
8891
8892 trim_cache();
8893
8894 return result;
8895 }
8896
8897 int Client::_renew_caps(Inode *in)
8898 {
8899 int wanted = in->caps_file_wanted();
8900 if (in->is_any_caps() &&
8901 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8902 check_caps(in, CHECK_CAPS_NODELAY);
8903 return 0;
8904 }
8905
8906 int flags = 0;
8907 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8908 flags = O_RDWR;
8909 else if (wanted & CEPH_CAP_FILE_RD)
8910 flags = O_RDONLY;
8911 else if (wanted & CEPH_CAP_FILE_WR)
8912 flags = O_WRONLY;
8913
8914 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8915 filepath path;
8916 in->make_nosnap_relative_path(path);
8917 req->set_filepath(path);
8918 req->head.args.open.flags = flags;
8919 req->head.args.open.pool = -1;
8920 if (cct->_conf->client_debug_getattr_caps)
8921 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8922 else
8923 req->head.args.open.mask = 0;
8924 req->set_inode(in);
8925
8926 // duplicate in case the Cap goes away; not sure whether that race is a real concern
8927 const UserPerm *pperm = in->get_best_perms();
8928 UserPerm perms;
8929 if (pperm != NULL)
8930 perms = *pperm;
8931 int ret = make_request(req, perms);
8932 return ret;
8933 }
8934
8935 int Client::close(int fd)
8936 {
8937 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8938 std::lock_guard lock(client_lock);
8939 tout(cct) << "close" << std::endl;
8940 tout(cct) << fd << std::endl;
8941
8942 if (unmounting)
8943 return -ENOTCONN;
8944
8945 Fh *fh = get_filehandle(fd);
8946 if (!fh)
8947 return -EBADF;
8948 int err = _release_fh(fh);
8949 fd_map.erase(fd);
8950 put_fd(fd);
8951 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8952 return err;
8953 }
8954
8955
8956 // ------------
8957 // read, write
8958
8959 loff_t Client::lseek(int fd, loff_t offset, int whence)
8960 {
8961 std::lock_guard lock(client_lock);
8962 tout(cct) << "lseek" << std::endl;
8963 tout(cct) << fd << std::endl;
8964 tout(cct) << offset << std::endl;
8965 tout(cct) << whence << std::endl;
8966
8967 if (unmounting)
8968 return -ENOTCONN;
8969
8970 Fh *f = get_filehandle(fd);
8971 if (!f)
8972 return -EBADF;
8973 #if defined(__linux__) && defined(O_PATH)
8974 if (f->flags & O_PATH)
8975 return -EBADF;
8976 #endif
8977 return _lseek(f, offset, whence);
8978 }
8979
8980 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8981 {
8982 Inode *in = f->inode.get();
8983 bool whence_check = false;
8984 loff_t pos = -1;
8985
8986 switch (whence) {
8987 case SEEK_END:
8988 whence_check = true;
8989 break;
8990
8991 #ifdef SEEK_DATA
8992 case SEEK_DATA:
8993 whence_check = true;
8994 break;
8995 #endif
8996
8997 #ifdef SEEK_HOLE
8998 case SEEK_HOLE:
8999 whence_check = true;
9000 break;
9001 #endif
9002 }
9003
9004 if (whence_check) {
9005 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9006 if (r < 0)
9007 return r;
9008 }
9009
9010 switch (whence) {
9011 case SEEK_SET:
9012 pos = offset;
9013 break;
9014
9015 case SEEK_CUR:
9016 pos = f->pos + offset;
9017 break;
9018
9019 case SEEK_END:
9020 pos = in->size + offset;
9021 break;
9022
9023 #ifdef SEEK_DATA
9024 case SEEK_DATA:
9025 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9026 return -ENXIO;
9027 pos = offset;
9028 break;
9029 #endif
9030
9031 #ifdef SEEK_HOLE
9032 case SEEK_HOLE:
9033 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9034 return -ENXIO;
9035 pos = in->size;
9036 break;
9037 #endif
9038
9039 default:
9040 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9041 return -EINVAL;
9042 }
9043
9044 if (pos < 0) {
9045 return -EINVAL;
9046 } else {
9047 f->pos = pos;
9048 }
9049
9050 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9051 return f->pos;
9052 }
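// Editorial note: this client keeps no extent map, so SEEK_DATA degenerates
// to "offset itself if it lies before EOF" and SEEK_HOLE to "EOF" -- a
// conservative but valid reading of the Linux semantics (data is assumed
// everywhere before EOF, with the lone hole at the end).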
9053
9054
9055 void Client::lock_fh_pos(Fh *f)
9056 {
9057 ldout(cct, 10) << __func__ << " " << f << dendl;
9058
9059 if (f->pos_locked || !f->pos_waiters.empty()) {
9060 ceph::condition_variable cond;
9061 f->pos_waiters.push_back(&cond);
9062 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9063 std::unique_lock l{client_lock, std::adopt_lock};
9064 cond.wait(l, [f, me=&cond] {
9065 return !f->pos_locked && f->pos_waiters.front() == me;
9066 });
9067 l.release();
9068 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9069 ceph_assert(f->pos_waiters.front() == &cond);
9070 f->pos_waiters.pop_front();
9071 }
9072
9073 f->pos_locked = true;
9074 }
9075
9076 void Client::unlock_fh_pos(Fh *f)
9077 {
9078 ldout(cct, 10) << __func__ << " " << f << dendl;
9079 f->pos_locked = false;
9080 }
9081
9082 int Client::uninline_data(Inode *in, Context *onfinish)
9083 {
9084 if (!in->inline_data.length()) {
9085 onfinish->complete(0);
9086 return 0;
9087 }
9088
9089 char oid_buf[32];
9090 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9091 object_t oid = oid_buf;
9092
9093 ObjectOperation create_ops;
9094 create_ops.create(false);
9095
9096 objecter->mutate(oid,
9097 OSDMap::file_to_object_locator(in->layout),
9098 create_ops,
9099 in->snaprealm->get_snap_context(),
9100 ceph::real_clock::now(),
9101 0,
9102 NULL);
9103
9104 bufferlist inline_version_bl;
9105 encode(in->inline_version, inline_version_bl);
9106
9107 ObjectOperation uninline_ops;
9108 uninline_ops.cmpxattr("inline_version",
9109 CEPH_OSD_CMPXATTR_OP_GT,
9110 CEPH_OSD_CMPXATTR_MODE_U64,
9111 inline_version_bl);
9112 bufferlist inline_data = in->inline_data;
9113 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9114 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9115
9116 objecter->mutate(oid,
9117 OSDMap::file_to_object_locator(in->layout),
9118 uninline_ops,
9119 in->snaprealm->get_snap_context(),
9120 ceph::real_clock::now(),
9121 0,
9122 onfinish);
9123
9124 return 0;
9125 }
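// Editorial note: the two mutates above are deliberately split -- first an
// idempotent create of the backing object, then a write guarded by a
// cmpxattr on "inline_version" so a stale uninline cannot clobber data
// already migrated under a newer version; the xattr is then stamped with
// the version that was written out.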
9126
9127 //
9128
9129 // blocking osd interface
9130
9131 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9132 {
9133 std::lock_guard lock(client_lock);
9134 tout(cct) << "read" << std::endl;
9135 tout(cct) << fd << std::endl;
9136 tout(cct) << size << std::endl;
9137 tout(cct) << offset << std::endl;
9138
9139 if (unmounting)
9140 return -ENOTCONN;
9141
9142 Fh *f = get_filehandle(fd);
9143 if (!f)
9144 return -EBADF;
9145 #if defined(__linux__) && defined(O_PATH)
9146 if (f->flags & O_PATH)
9147 return -EBADF;
9148 #endif
9149 bufferlist bl;
9150 /* We can't return a byte count larger than INT_MAX; clamp size to that */
9151 size = std::min(size, (loff_t)INT_MAX);
9152 int r = _read(f, offset, size, &bl);
9153 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9154 if (r >= 0) {
9155 bl.begin().copy(bl.length(), buf);
9156 r = bl.length();
9157 }
9158 return r;
9159 }
9160
9161 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9162 {
9163 if (iovcnt < 0)
9164 return -EINVAL;
9165 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9166 }
9167
9168 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9169 {
9170 int want, have = 0;
9171 bool movepos = false;
9172 std::unique_ptr<C_SaferCond> onuninline;
9173 int64_t r = 0;
9174 const auto& conf = cct->_conf;
9175 Inode *in = f->inode.get();
9176 utime_t lat;
9177 utime_t start = ceph_clock_now();
9178
9179 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9180 return -EBADF;
9181 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9182
9183 if (offset < 0) {
9184 lock_fh_pos(f);
9185 offset = f->pos;
9186 movepos = true;
9187 }
9188 loff_t start_pos = offset;
9189
9190 if (in->inline_version == 0) {
9191 r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9192 if (r < 0) {
9193 goto done;
9194 }
9195 ceph_assert(in->inline_version > 0);
9196 }
9197
9198 retry:
9199 if (f->mode & CEPH_FILE_MODE_LAZY)
9200 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9201 else
9202 want = CEPH_CAP_FILE_CACHE;
9203 r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
9204 if (r < 0) {
9205 goto done;
9206 }
9207 if (f->flags & O_DIRECT)
9208 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9209
9210 if (in->inline_version < CEPH_INLINE_NONE) {
9211 if (!(have & CEPH_CAP_FILE_CACHE)) {
9212 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9213 uninline_data(in, onuninline.get());
9214 } else {
9215 uint32_t len = in->inline_data.length();
9216 uint64_t endoff = offset + size;
9217 if (endoff > in->size)
9218 endoff = in->size;
9219
9220 if (offset < len) {
9221 if (endoff <= len) {
9222 bl->substr_of(in->inline_data, offset, endoff - offset);
9223 } else {
9224 bl->substr_of(in->inline_data, offset, len - offset);
9225 bl->append_zero(endoff - len);
9226 }
9227 r = endoff - offset;
9228 } else if ((uint64_t)offset < endoff) {
9229 bl->append_zero(endoff - offset);
9230 r = endoff - offset;
9231 } else {
9232 r = 0;
9233 }
9234 goto success;
9235 }
9236 }
9237
9238 if (!conf->client_debug_force_sync_read &&
9239 conf->client_oc &&
9240 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9241
9242 if (f->flags & O_RSYNC) {
9243 _flush_range(in, offset, size);
9244 }
9245 r = _read_async(f, offset, size, bl);
9246 if (r < 0)
9247 goto done;
9248 } else {
9249 if (f->flags & O_DIRECT)
9250 _flush_range(in, offset, size);
9251
9252 bool checkeof = false;
9253 r = _read_sync(f, offset, size, bl, &checkeof);
9254 if (r < 0)
9255 goto done;
9256 if (checkeof) {
9257 offset += r;
9258 size -= r;
9259
9260 put_cap_ref(in, CEPH_CAP_FILE_RD);
9261 have = 0;
9262 // reverify size
9263 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9264 if (r < 0)
9265 goto done;
9266
9267 // eof? short read.
9268 if ((uint64_t)offset < in->size)
9269 goto retry;
9270 }
9271 }
9272
9273 success:
9274 ceph_assert(r >= 0);
9275 if (movepos) {
9276 // adjust fd pos
9277 f->pos = start_pos + r;
9278 }
9279
9280 lat = ceph_clock_now();
9281 lat -= start;
9282 logger->tinc(l_c_read, lat);
9283
9284 done:
9285 // done!
9286
9287 if (onuninline) {
9288 client_lock.unlock();
9289 int ret = onuninline->wait();
9290 client_lock.lock();
9291 if (ret >= 0 || ret == -ECANCELED) {
9292 in->inline_data.clear();
9293 in->inline_version = CEPH_INLINE_NONE;
9294 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9295 check_caps(in, 0);
9296 } else
9297 r = ret;
9298 }
9299 if (have) {
9300 put_cap_ref(in, CEPH_CAP_FILE_RD);
9301 }
9302 if (movepos) {
9303 unlock_fh_pos(f);
9304 }
9305 return r;
9306 }
9307
9308 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9309 client(c), f(f) {
9310 f->get();
9311 f->readahead.inc_pending();
9312 }
9313
9314 Client::C_Readahead::~C_Readahead() {
9315 f->readahead.dec_pending();
9316 client->_put_fh(f);
9317 }
9318
9319 void Client::C_Readahead::finish(int r) {
9320 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9321 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9322 }
9323
9324 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9325 {
9326 const auto& conf = cct->_conf;
9327 Inode *in = f->inode.get();
9328
9329 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9330
9331 // trim read based on file size?
9332 if (off >= in->size)
9333 return 0;
9334 if (len == 0)
9335 return 0;
9336 if (off + len > in->size) {
9337 len = in->size - off;
9338 }
9339
9340 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9341 << " max_bytes=" << f->readahead.get_max_readahead_size()
9342 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9343
9344 // read (and possibly block)
9345 int r = 0;
9346 C_SaferCond onfinish("Client::_read_async flock");
9347 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9348 off, len, bl, 0, &onfinish);
9349 if (r == 0) {
9350 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9351 client_lock.unlock();
9352 r = onfinish.wait();
9353 client_lock.lock();
9354 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9355 }
9356
9357 if (f->readahead.get_min_readahead_size() > 0) {
9358 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9359 if (readahead_extent.second > 0) {
9360 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9361 << " (caller wants " << off << "~" << len << ")" << dendl;
9362 Context *onfinish2 = new C_Readahead(this, f);
9363 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9364 readahead_extent.first, readahead_extent.second,
9365 NULL, 0, onfinish2);
9366 if (r2 == 0) {
9367 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9368 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9369 } else {
9370 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9371 delete onfinish2;
9372 }
9373 }
9374 }
9375
9376 return r;
9377 }
9378
9379 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9380 bool *checkeof)
9381 {
9382 Inode *in = f->inode.get();
9383 uint64_t pos = off;
9384 int left = len;
9385 int read = 0;
9386
9387 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9388
9389 while (left > 0) {
9390 C_SaferCond onfinish("Client::_read_sync flock");
9391 bufferlist tbl;
9392
9393 int wanted = left;
9394 filer->read_trunc(in->ino, &in->layout, in->snapid,
9395 pos, left, &tbl, 0,
9396 in->truncate_size, in->truncate_seq,
9397 &onfinish);
9398 client_lock.unlock();
9399 int r = onfinish.wait();
9400 client_lock.lock();
9401
9402 // if we get ENOENT from OSD, assume 0 bytes returned
9403 if (r == -ENOENT)
9404 r = 0;
9405 if (r < 0)
9406 return r;
9407 if (tbl.length()) {
9408 r = tbl.length();
9409
9410 read += r;
9411 pos += r;
9412 left -= r;
9413 bl->claim_append(tbl);
9414 }
9415 // short read?
9416 if (r >= 0 && r < wanted) {
9417 if (pos < in->size) {
9418 // zero up to known EOF
9419 int64_t some = in->size - pos;
9420 if (some > left)
9421 some = left;
9422 auto z = buffer::ptr_node::create(some);
9423 z->zero();
9424 bl->push_back(std::move(z));
9425 read += some;
9426 pos += some;
9427 left -= some;
9428 if (left == 0)
9429 return read;
9430 }
9431
9432 *checkeof = true;
9433 return read;
9434 }
9435 }
9436 return read;
9437 }
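
// Worked example of the short-read handling above (illustrative numbers):
// suppose in->size is 8192 but the OSD read returns only the first 4096
// bytes (a missing backing object even comes back as ENOENT, which we fold
// into a 0-byte read). Since pos is still below the locally known size,
// the remaining 4096 bytes are appended as zeros so the caller sees the
// hole as zeros, matching sparse-file semantics. Only once pos reaches the
// known EOF is *checkeof set, telling the caller to re-verify the size
// with the MDS and possibly retry.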
9438
9439
9440 /*
9441 * we keep count of uncommitted sync writes on the inode, so that
9442 * fsync can DTRT (do the right thing) and wait for them.
9443 */
9444 void Client::_sync_write_commit(Inode *in)
9445 {
9446 ceph_assert(unsafe_sync_write > 0);
9447 unsafe_sync_write--;
9448
9449 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9450
9451 ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
9452 if (unsafe_sync_write == 0 && unmounting) {
9453 ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
9454 mount_cond.notify_all();
9455 }
9456 }
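
// Explanatory sketch of the pairing (not new behavior): the sync-write
// path in _write() does
//
//   unsafe_sync_write++;
//   get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
//
// before issuing the OSD write, and _sync_write_commit() undoes exactly
// both once the write is safe, waking the unmount waiter when the count
// of in-flight sync writes drops to zero.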
9457
9458 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9459 {
9460 std::lock_guard lock(client_lock);
9461 tout(cct) << "write" << std::endl;
9462 tout(cct) << fd << std::endl;
9463 tout(cct) << size << std::endl;
9464 tout(cct) << offset << std::endl;
9465
9466 if (unmounting)
9467 return -ENOTCONN;
9468
9469 Fh *fh = get_filehandle(fd);
9470 if (!fh)
9471 return -EBADF;
9472 #if defined(__linux__) && defined(O_PATH)
9473 if (fh->flags & O_PATH)
9474 return -EBADF;
9475 #endif
9476 /* We can't return a byte count larger than INT_MAX, so clamp size to that */
9477 size = std::min(size, (loff_t)INT_MAX);
9478 int r = _write(fh, offset, size, buf, NULL, false);
9479 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9480 return r;
9481 }
9482
9483 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9484 {
9485 if (iovcnt < 0)
9486 return -EINVAL;
9487 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9488 }
9489
9490 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9491 unsigned iovcnt, int64_t offset, bool write,
9492 bool clamp_to_int)
9493 {
9494 #if defined(__linux__) && defined(O_PATH)
9495 if (fh->flags & O_PATH)
9496 return -EBADF;
9497 #endif
9498 loff_t totallen = 0;
9499 for (unsigned i = 0; i < iovcnt; i++) {
9500 totallen += iov[i].iov_len;
9501 }
9502
9503 /*
9504 * Some of the API functions take 64-bit size values, but only return
9505 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9506 * we don't do I/Os larger than the values we can return.
9507 */
9508 if (clamp_to_int) {
9509 totallen = std::min(totallen, (loff_t)INT_MAX);
9510 }
9511 if (write) {
9512 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9513 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9514 return w;
9515 } else {
9516 bufferlist bl;
9517 int64_t r = _read(fh, offset, totallen, &bl);
9518 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9519 if (r <= 0)
9520 return r;
9521
9522 auto iter = bl.cbegin();
9523 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9524 /*
9525 * Copy only as much as the bufferlist actually holds, to handle the
9526 * case where it has less data than the iovs can accept.
9527 */
9528 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
9529 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
9530 resid -= round_size;
9531 /* iter is self-updating */
9532 }
9533 return r;
9534 }
9535 }
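
// Explanatory note on the readv scatter loop above: the single contiguous
// bufferlist of r bytes is copied across the iovecs in order, stopping
// early if the data runs out. E.g. with r = 10 and two 8-byte iovs:
//
//   j = 0: round_size = min(10, 8) = 8, resid = 2
//   j = 1: round_size = min(2, 8)  = 2, resid = 0  -> done
//
// so a short read fills the earlier iovecs first, exactly like preadv(2).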
9536
9537 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9538 {
9539 std::lock_guard lock(client_lock);
9540 tout(cct) << fd << std::endl;
9541 tout(cct) << offset << std::endl;
9542
9543 if (unmounting)
9544 return -ENOTCONN;
9545
9546 Fh *fh = get_filehandle(fd);
9547 if (!fh)
9548 return -EBADF;
9549 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9550 }
9551
9552 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9553 const struct iovec *iov, int iovcnt)
9554 {
9555 uint64_t fpos = 0;
9556
9557 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9558 return -EFBIG;
9559
9560 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9561 Inode *in = f->inode.get();
9562
9563 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9564 return -ENOSPC;
9565 }
9566
9567 ceph_assert(in->snapid == CEPH_NOSNAP);
9568
9569 // was Fh opened as writeable?
9570 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9571 return -EBADF;
9572
9573 // use/adjust fd pos?
9574 if (offset < 0) {
9575 lock_fh_pos(f);
9576 /*
9577 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9578 * change out from under us.
9579 */
9580 if (f->flags & O_APPEND) {
9581 auto r = _lseek(f, 0, SEEK_END);
9582 if (r < 0) {
9583 unlock_fh_pos(f);
9584 return r;
9585 }
9586 }
9587 offset = f->pos;
9588 fpos = offset+size;
9589 unlock_fh_pos(f);
9590 }
9591
9592 // check quota
9593 uint64_t endoff = offset + size;
9594 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9595 f->actor_perms)) {
9596 return -EDQUOT;
9597 }
9598
9599 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9600
9601 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9602
9603 // time it.
9604 utime_t start = ceph_clock_now();
9605
9606 if (in->inline_version == 0) {
9607 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9608 if (r < 0)
9609 return r;
9610 ceph_assert(in->inline_version > 0);
9611 }
9612
9613 // copy into a fresh buffer (since our write may be resubmitted or complete asynchronously)
9614 bufferlist bl;
9615 if (buf) {
9616 if (size > 0)
9617 bl.append(buf, size);
9618 } else if (iov) {
9619 for (int i = 0; i < iovcnt; i++) {
9620 if (iov[i].iov_len > 0) {
9621 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9622 }
9623 }
9624 }
9625
9626 utime_t lat;
9627 uint64_t totalwritten;
9628 int want, have;
9629 if (f->mode & CEPH_FILE_MODE_LAZY)
9630 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
9631 else
9632 want = CEPH_CAP_FILE_BUFFER;
9633 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
9634 if (r < 0)
9635 return r;
9636
9637 /* clear the setuid/setgid bits, if any */
9638 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9639 struct ceph_statx stx = { 0 };
9640
9641 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9642 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9643 if (r < 0)
9644 return r;
9645 } else {
9646 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9647 }
9648
9649 if (f->flags & O_DIRECT)
9650 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
9651
9652 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9653
9654 std::unique_ptr<C_SaferCond> onuninline = nullptr;
9655
9656 if (in->inline_version < CEPH_INLINE_NONE) {
9657 if (endoff > cct->_conf->client_max_inline_size ||
9658 endoff > CEPH_INLINE_MAX_SIZE ||
9659 !(have & CEPH_CAP_FILE_BUFFER)) {
9660 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9661 uninline_data(in, onuninline.get());
9662 } else {
9663 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9664
9665 uint32_t len = in->inline_data.length();
9666
9667 if (endoff < len)
9668 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
9669
9670 if (offset < len)
9671 in->inline_data.splice(offset, len - offset);
9672 else if (offset > len)
9673 in->inline_data.append_zero(offset - len);
9674
9675 in->inline_data.append(bl);
9676 in->inline_version++;
9677
9678 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9679
9680 goto success;
9681 }
9682 }
9683
9684 if (cct->_conf->client_oc &&
9685 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
9686 // do buffered write
9687 if (!in->oset.dirty_or_tx)
9688 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9689
9690 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9691
9692 // async, caching, non-blocking.
9693 r = objectcacher->file_write(&in->oset, &in->layout,
9694 in->snaprealm->get_snap_context(),
9695 offset, size, bl, ceph::real_clock::now(),
9696 0);
9697 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9698
9699 if (r < 0)
9700 goto done;
9701
9702 // flush cached write if O_SYNC is set on file fh
9703 // O_DSYNC == O_SYNC on linux < 2.6.33
9704 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9705 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9706 _flush_range(in, offset, size);
9707 }
9708 } else {
9709 if (f->flags & O_DIRECT)
9710 _flush_range(in, offset, size);
9711
9712 // simple, non-atomic sync write
9713 C_SaferCond onfinish("Client::_write flock");
9714 unsafe_sync_write++;
9715 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9716
9717 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9718 offset, size, bl, ceph::real_clock::now(), 0,
9719 in->truncate_size, in->truncate_seq,
9720 &onfinish);
9721 client_lock.unlock();
9722 onfinish.wait();
9723 client_lock.lock();
9724 _sync_write_commit(in);
9725 }
9726
9727 // if we get here, the write was successful; update client metadata
9728 success:
9729 // time
9730 lat = ceph_clock_now();
9731 lat -= start;
9732 logger->tinc(l_c_wrlat, lat);
9733
9734 if (fpos) {
9735 lock_fh_pos(f);
9736 f->pos = fpos;
9737 unlock_fh_pos(f);
9738 }
9739 totalwritten = size;
9740 r = (int64_t)totalwritten;
9741
9742 // extend file?
9743 if (totalwritten + offset > in->size) {
9744 in->size = totalwritten + offset;
9745 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9746
9747 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9748 check_caps(in, CHECK_CAPS_NODELAY);
9749 } else if (is_max_size_approaching(in)) {
9750 check_caps(in, 0);
9751 }
9752
9753 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9754 } else {
9755 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9756 }
9757
9758 // mtime
9759 in->mtime = in->ctime = ceph_clock_now();
9760 in->change_attr++;
9761 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9762
9763 done:
9764
9765 if (nullptr != onuninline) {
9766 client_lock.unlock();
9767 int uninline_ret = onuninline->wait();
9768 client_lock.lock();
9769
9770 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9771 in->inline_data.clear();
9772 in->inline_version = CEPH_INLINE_NONE;
9773 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9774 check_caps(in, 0);
9775 } else
9776 r = uninline_ret;
9777 }
9778
9779 put_cap_ref(in, CEPH_CAP_FILE_WR);
9780 return r;
9781 }
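
// Explanatory summary of the three write paths in _write() above (a
// restatement, not new behavior):
//  1. inline: if the inode still carries inline data and the write fits
//     under client_max_inline_size, the bytes are patched in memory while
//     holding CEPH_CAP_FILE_BUFFER; otherwise uninline_data() kicks the
//     data out to RADOS first and we fall through.
//  2. buffered: with client_oc enabled and BUFFER/LAZYIO caps held, the
//     ObjectCacher absorbs the write; O_SYNC/O_DSYNC then force a
//     synchronous _flush_range() of just the written extent.
//  3. sync: otherwise filer->write_trunc() goes straight to the OSDs and
//     we block on the completion with client_lock dropped.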
9782
9783 int Client::_flush(Fh *f)
9784 {
9785 Inode *in = f->inode.get();
9786 int err = f->take_async_err();
9787 if (err != 0) {
9788 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9789 << cpp_strerror(err) << dendl;
9790 } else {
9791 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9792 }
9793
9794 return err;
9795 }
9796
9797 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9798 {
9799 struct ceph_statx stx;
9800 stx.stx_size = length;
9801 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9802 }
9803
9804 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9805 {
9806 std::lock_guard lock(client_lock);
9807 tout(cct) << __func__ << std::endl;
9808 tout(cct) << fd << std::endl;
9809 tout(cct) << length << std::endl;
9810
9811 if (unmounting)
9812 return -ENOTCONN;
9813
9814 Fh *f = get_filehandle(fd);
9815 if (!f)
9816 return -EBADF;
9817 #if defined(__linux__) && defined(O_PATH)
9818 if (f->flags & O_PATH)
9819 return -EBADF;
9820 #endif
9821 struct stat attr;
9822 attr.st_size = length;
9823 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9824 }
9825
9826 int Client::fsync(int fd, bool syncdataonly)
9827 {
9828 std::lock_guard lock(client_lock);
9829 tout(cct) << "fsync" << std::endl;
9830 tout(cct) << fd << std::endl;
9831 tout(cct) << syncdataonly << std::endl;
9832
9833 if (unmounting)
9834 return -ENOTCONN;
9835
9836 Fh *f = get_filehandle(fd);
9837 if (!f)
9838 return -EBADF;
9839 #if defined(__linux__) && defined(O_PATH)
9840 if (f->flags & O_PATH)
9841 return -EBADF;
9842 #endif
9843 int r = _fsync(f, syncdataonly);
9844 if (r == 0) {
9845 // The IOs in this fsync were okay, but maybe something happened
9846 // in the background that we should be reporting?
9847 r = f->take_async_err();
9848 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9849 << ") = 0, async_err = " << r << dendl;
9850 } else {
9851 // Assume that an error we encountered during fsync, even reported
9852 // synchronously, would also have applied the error to the Fh, and we
9853 // should clear it here to avoid returning the same error again on next
9854 // call.
9855 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9856 << r << dendl;
9857 f->take_async_err();
9858 }
9859 return r;
9860 }
9861
9862 int Client::_fsync(Inode *in, bool syncdataonly)
9863 {
9864 int r = 0;
9865 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
9866 ceph_tid_t flush_tid = 0;
9867 InodeRef tmp_ref;
9868 utime_t lat;
9869 utime_t start = ceph_clock_now();
9870
9871 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9872
9873 if (cct->_conf->client_oc) {
9874 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
9875 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
9876 _flush(in, object_cacher_completion.get());
9877 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9878 }
9879
9880 if (!syncdataonly && in->dirty_caps) {
9881 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9882 if (in->flushing_caps)
9883 flush_tid = last_flush_tid;
9884 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9885
9886 if (!syncdataonly && !in->unsafe_ops.empty()) {
9887 flush_mdlog_sync();
9888
9889 MetaRequest *req = in->unsafe_ops.back();
9890 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9891
9892 req->get();
9893 wait_on_list(req->waitfor_safe);
9894 put_request(req);
9895 }
9896
9897 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
9898 client_lock.unlock();
9899 ldout(cct, 15) << "waiting on data to flush" << dendl;
9900 r = object_cacher_completion->wait();
9901 client_lock.lock();
9902 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9903 } else {
9904 // FIXME: this can starve
9905 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9906 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9907 << " uncommitted, waiting" << dendl;
9908 wait_on_list(in->waitfor_commit);
9909 }
9910 }
9911
9912 if (!r) {
9913 if (flush_tid > 0)
9914 wait_sync_caps(in, flush_tid);
9915
9916 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9917 } else {
9918 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
9919 << cpp_strerror(-r) << dendl;
9920 }
9921
9922 lat = ceph_clock_now();
9923 lat -= start;
9924 logger->tinc(l_c_fsync, lat);
9925
9926 return r;
9927 }
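
// Hedged sketch of the ordering above: the data flush is kicked off first
// (via the ObjectCacher completion, or by waiting out FILE_BUFFER refs
// when client_oc is off), dirty caps and unsafe MDS requests are flushed
// unless syncdataonly, and only then do we block on the data completion
// and finally wait_sync_caps() up to flush_tid. A data-only sync:
//
//   int r = _fsync(in, /*syncdataonly=*/true);  // skips the cap/MDS waits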
9928
9929 int Client::_fsync(Fh *f, bool syncdataonly)
9930 {
9931 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9932 return _fsync(f->inode.get(), syncdataonly);
9933 }
9934
9935 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9936 {
9937 std::lock_guard lock(client_lock);
9938 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9939 tout(cct) << fd << std::endl;
9940
9941 if (unmounting)
9942 return -ENOTCONN;
9943
9944 Fh *f = get_filehandle(fd);
9945 if (!f)
9946 return -EBADF;
9947 int r = _getattr(f->inode, mask, perms);
9948 if (r < 0)
9949 return r;
9950 fill_stat(f->inode, stbuf, NULL);
9951 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9952 return r;
9953 }
9954
9955 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9956 unsigned int want, unsigned int flags)
9957 {
9958 std::lock_guard lock(client_lock);
9959 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9960 tout(cct) << fd << std::endl;
9961
9962 if (unmounting)
9963 return -ENOTCONN;
9964
9965 Fh *f = get_filehandle(fd);
9966 if (!f)
9967 return -EBADF;
9968
9969 unsigned mask = statx_to_mask(flags, want);
9970
9971 int r = 0;
9972 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9973 r = _getattr(f->inode, mask, perms);
9974 if (r < 0) {
9975 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9976 return r;
9977 }
9978 }
9979
9980 fill_statx(f->inode, mask, stx);
9981 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9982 return r;
9983 }
9984
9985 // not written yet, but I want to link!
9986
9987 int Client::chdir(const char *relpath, std::string &new_cwd,
9988 const UserPerm& perms)
9989 {
9990 std::lock_guard lock(client_lock);
9991 tout(cct) << "chdir" << std::endl;
9992 tout(cct) << relpath << std::endl;
9993
9994 if (unmounting)
9995 return -ENOTCONN;
9996
9997 filepath path(relpath);
9998 InodeRef in;
9999 int r = path_walk(path, &in, perms);
10000 if (r < 0)
10001 return r;
10002
10003 if (!in->is_dir())
10004 return -ENOTDIR;
10005
10006 if (cwd != in)
10007 cwd.swap(in);
10008 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10009
10010 _getcwd(new_cwd, perms);
10011 return 0;
10012 }
10013
10014 void Client::_getcwd(string& dir, const UserPerm& perms)
10015 {
10016 filepath path;
10017 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10018
10019 Inode *in = cwd.get();
10020 while (in != root) {
10021 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10022
10023 // The cwd or an ancestor is unlinked
10024 if (in->dentries.empty()) {
10025 return;
10026 }
10027
10028 Dentry *dn = in->get_first_parent();
10029
10030
10031 if (!dn) {
10032 // look it up
10033 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10034 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10035 filepath path(in->ino);
10036 req->set_filepath(path);
10037 req->set_inode(in);
10038 int res = make_request(req, perms);
10039 if (res < 0)
10040 break;
10041
10042 // start over
10043 path = filepath();
10044 in = cwd.get();
10045 continue;
10046 }
10047 path.push_front_dentry(dn->name);
10048 in = dn->dir->parent_inode;
10049 }
10050 dir = "/";
10051 dir += path.get_path();
10052 }
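
// Explanatory sketch: _getcwd() walks dentries from cwd up toward root,
// prepending each name. If some ancestor has no cached dentry it issues a
// CEPH_MDS_OP_LOOKUPNAME so the MDS fills one in, then restarts from cwd.
// E.g. for a cwd of /a/b/c the loop pushes "c", then "b", then "a", and
// the result is "/" + path.get_path() == "/a/b/c".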
10053
10054 void Client::getcwd(string& dir, const UserPerm& perms)
10055 {
10056 std::lock_guard l(client_lock);
10057 if (!unmounting)
10058 _getcwd(dir, perms);
10059 }
10060
10061 int Client::statfs(const char *path, struct statvfs *stbuf,
10062 const UserPerm& perms)
10063 {
10064 std::lock_guard l(client_lock);
10065 tout(cct) << __func__ << std::endl;
10066 unsigned long int total_files_on_fs;
10067
10068 if (unmounting)
10069 return -ENOTCONN;
10070
10071 ceph_statfs stats;
10072 C_SaferCond cond;
10073
10074 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10075 if (data_pools.size() == 1) {
10076 objecter->get_fs_stats(stats, data_pools[0], &cond);
10077 } else {
10078 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10079 }
10080
10081 client_lock.unlock();
10082 int rval = cond.wait();
10083 ceph_assert(root);
10084 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10085 client_lock.lock();
10086
10087 if (rval < 0) {
10088 ldout(cct, 1) << "underlying call to statfs returned error: "
10089 << cpp_strerror(rval)
10090 << dendl;
10091 return rval;
10092 }
10093
10094 memset(stbuf, 0, sizeof(*stbuf));
10095
10096 /*
10097 * we're going to set a block size of 4MB so we can represent larger
10098 * FSes without overflowing. Additionally convert the space
10099 * measurements from KB to bytes while making them in terms of
10100 * blocks. We use 4MB only because it is big enough, and because it
10101 * actually *is* the (ceph) default block size.
10102 */
10103 const int CEPH_BLOCK_SHIFT = 22;
10104 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10105 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10106 stbuf->f_files = total_files_on_fs;
10107 stbuf->f_ffree = 0;
10108 stbuf->f_favail = -1;
10109 stbuf->f_fsid = -1; // ??
10110 stbuf->f_flag = 0; // ??
10111 stbuf->f_namemax = NAME_MAX;
10112
10113 // Usually quota_root will == root_ancestor, but if the mount root has no
10114 // quota but we can see a parent of it that does have a quota, we'll
10115 // respect that one instead.
10116 ceph_assert(root != nullptr);
10117 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10118
10119 // get_quota_root should always give us something
10120 // because client quotas are always enabled
10121 ceph_assert(quota_root != nullptr);
10122
10123 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10124
10125 // Skip the getattr if any sessions are stale, as we don't want to
10126 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10127 // is unhealthy.
10128 if (!_any_stale_sessions()) {
10129 int r = _getattr(quota_root, 0, perms, true);
10130 if (r != 0) {
10131 // Ignore return value: error getting latest inode metadata is not a good
10132 // reason to break "df".
10133 lderr(cct) << "Error in getattr on quota root 0x"
10134 << std::hex << quota_root->ino << std::dec
10135 << " statfs result may be outdated" << dendl;
10136 }
10137 }
10138
10139 // Special case: if there is a size quota set on the Inode acting
10140 // as the root for this client mount, then report the quota status
10141 // as the filesystem statistics.
10142 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10143 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10144 // It is possible for a quota to be exceeded: arithmetic here must
10145 // handle the case where used > total.
10146 const fsblkcnt_t free = total > used ? total - used : 0;
10147
10148 stbuf->f_blocks = total;
10149 stbuf->f_bfree = free;
10150 stbuf->f_bavail = free;
10151 } else {
10152 // General case: report the cluster statistics returned from RADOS. Because
10153 // multiple pools may be used within one filesystem namespace via
10154 // layouts, this is the most correct thing we can do.
10155 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10156 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10157 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10158 }
10159
10160 return rval;
10161 }
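
// Worked example of the quota -> statvfs conversion above (illustrative
// numbers only): with quota.max_bytes = 10 GiB and rstat.rbytes = 1 GiB,
//
//   f_frsize = 1 << 22        -> 4 MiB blocks
//   f_blocks = 10 GiB >> 22   -> 2560
//   f_bfree  = 2560 - 256     -> 2304
//
// In the general case stats.kb is shifted by (CEPH_BLOCK_SHIFT - 10) to
// convert the KiB-denominated pool statistics into the same 4 MiB blocks.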
10162
10163 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10164 struct flock *fl, uint64_t owner, bool removing)
10165 {
10166 ldout(cct, 10) << __func__ << " ino " << in->ino
10167 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10168 << " type " << fl->l_type << " owner " << owner
10169 << " " << fl->l_start << "~" << fl->l_len << dendl;
10170
10171 int lock_cmd;
10172 if (F_RDLCK == fl->l_type)
10173 lock_cmd = CEPH_LOCK_SHARED;
10174 else if (F_WRLCK == fl->l_type)
10175 lock_cmd = CEPH_LOCK_EXCL;
10176 else if (F_UNLCK == fl->l_type)
10177 lock_cmd = CEPH_LOCK_UNLOCK;
10178 else
10179 return -EIO;
10180
10181 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10182 sleep = 0;
10183
10184 /*
10185 * Set the most significant bit, so that the MDS knows the 'owner'
10186 * field alone is sufficient to identify the owner of the lock. (old
10187 * code used both 'owner' and 'pid')
10188 */
10189 owner |= (1ULL << 63);
10190
10191 MetaRequest *req = new MetaRequest(op);
10192 filepath path;
10193 in->make_nosnap_relative_path(path);
10194 req->set_filepath(path);
10195 req->set_inode(in);
10196
10197 req->head.args.filelock_change.rule = lock_type;
10198 req->head.args.filelock_change.type = lock_cmd;
10199 req->head.args.filelock_change.owner = owner;
10200 req->head.args.filelock_change.pid = fl->l_pid;
10201 req->head.args.filelock_change.start = fl->l_start;
10202 req->head.args.filelock_change.length = fl->l_len;
10203 req->head.args.filelock_change.wait = sleep;
10204
10205 int ret;
10206 bufferlist bl;
10207
10208 if (sleep && switch_interrupt_cb) {
10209 // enable interrupt
10210 switch_interrupt_cb(callback_handle, req->get());
10211 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10212 // disable interrupt
10213 switch_interrupt_cb(callback_handle, NULL);
10214 if (ret == 0 && req->aborted()) {
10215 // effect of this lock request has been revoked by the 'lock intr' request
10216 ret = req->get_abort_code();
10217 }
10218 put_request(req);
10219 } else {
10220 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10221 }
10222
10223 if (ret == 0) {
10224 if (op == CEPH_MDS_OP_GETFILELOCK) {
10225 ceph_filelock filelock;
10226 auto p = bl.cbegin();
10227 decode(filelock, p);
10228
10229 if (CEPH_LOCK_SHARED == filelock.type)
10230 fl->l_type = F_RDLCK;
10231 else if (CEPH_LOCK_EXCL == filelock.type)
10232 fl->l_type = F_WRLCK;
10233 else
10234 fl->l_type = F_UNLCK;
10235
10236 fl->l_whence = SEEK_SET;
10237 fl->l_start = filelock.start;
10238 fl->l_len = filelock.length;
10239 fl->l_pid = filelock.pid;
10240 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10241 ceph_lock_state_t *lock_state;
10242 if (lock_type == CEPH_LOCK_FCNTL) {
10243 if (!in->fcntl_locks)
10244 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10245 lock_state = in->fcntl_locks.get();
10246 } else if (lock_type == CEPH_LOCK_FLOCK) {
10247 if (!in->flock_locks)
10248 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10249 lock_state = in->flock_locks.get();
10250 } else {
10251 ceph_abort();
10252 return -EINVAL;
10253 }
10254 _update_lock_state(fl, owner, lock_state);
10255
10256 if (!removing) {
10257 if (lock_type == CEPH_LOCK_FCNTL) {
10258 if (!fh->fcntl_locks)
10259 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10260 lock_state = fh->fcntl_locks.get();
10261 } else {
10262 if (!fh->flock_locks)
10263 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10264 lock_state = fh->flock_locks.get();
10265 }
10266 _update_lock_state(fl, owner, lock_state);
10267 }
10268 } else
10269 ceph_abort();
10270 }
10271 return ret;
10272 }
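
// Explanatory example: because bit 63 is forced on above, a hypothetical
// caller-supplied owner cookie of 0x1234 travels to the MDS as
//
//   0x1234ULL | (1ULL << 63) == 0x8000000000001234ULL
//
// which signals that 'owner' alone identifies the lock holder, rather
// than the (owner, pid) pair that older clients relied on.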
10273
10274 int Client::_interrupt_filelock(MetaRequest *req)
10275 {
10276 // Set abort code, but do not kick. The abort code prevents the request
10277 // from being re-sent.
10278 req->abort(-EINTR);
10279 if (req->mds < 0)
10280 return 0; // haven't sent the request
10281
10282 Inode *in = req->inode();
10283
10284 int lock_type;
10285 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10286 lock_type = CEPH_LOCK_FLOCK_INTR;
10287 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10288 lock_type = CEPH_LOCK_FCNTL_INTR;
10289 else {
10290 ceph_abort();
10291 return -EINVAL;
10292 }
10293
10294 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10295 filepath path;
10296 in->make_nosnap_relative_path(path);
10297 intr_req->set_filepath(path);
10298 intr_req->set_inode(in);
10299 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10300 intr_req->head.args.filelock_change.rule = lock_type;
10301 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10302
10303 UserPerm perms(req->get_uid(), req->get_gid());
10304 return make_request(intr_req, perms, NULL, NULL, -1);
10305 }
10306
10307 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10308 {
10309 if (!in->fcntl_locks && !in->flock_locks)
10310 return;
10311
10312 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10313 encode(nr_fcntl_locks, bl);
10314 if (nr_fcntl_locks) {
10315 auto &lock_state = in->fcntl_locks;
10316 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10317 p != lock_state->held_locks.end();
10318 ++p)
10319 encode(p->second, bl);
10320 }
10321
10322 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10323 encode(nr_flock_locks, bl);
10324 if (nr_flock_locks) {
10325 auto &lock_state = in->flock_locks;
10326 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10327 p != lock_state->held_locks.end();
10328 ++p)
10329 encode(p->second, bl);
10330 }
10331
10332 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10333 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10334 }
10335
10336 void Client::_release_filelocks(Fh *fh)
10337 {
10338 if (!fh->fcntl_locks && !fh->flock_locks)
10339 return;
10340
10341 Inode *in = fh->inode.get();
10342 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10343
10344 list<pair<int, ceph_filelock> > to_release;
10345
10346 if (fh->fcntl_locks) {
10347 auto &lock_state = fh->fcntl_locks;
10348 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10349 p != lock_state->held_locks.end();
10350 ++p)
10351 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10352 lock_state.reset();
10353 }
10354 if (fh->flock_locks) {
10355 auto &lock_state = fh->flock_locks;
10356 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10357 p != lock_state->held_locks.end();
10358 ++p)
10359 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10360 lock_state.reset();
10361 }
10362
10363 if (to_release.empty())
10364 return;
10365
10366 // mds has already released filelocks if session was closed.
10367 if (in->caps.empty())
10368 return;
10369
10370 struct flock fl;
10371 memset(&fl, 0, sizeof(fl));
10372 fl.l_whence = SEEK_SET;
10373 fl.l_type = F_UNLCK;
10374
10375 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10376 p != to_release.end();
10377 ++p) {
10378 fl.l_start = p->second.start;
10379 fl.l_len = p->second.length;
10380 fl.l_pid = p->second.pid;
10381 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10382 p->second.owner, true);
10383 }
10384 }
10385
10386 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10387 ceph_lock_state_t *lock_state)
10388 {
10389 int lock_cmd;
10390 if (F_RDLCK == fl->l_type)
10391 lock_cmd = CEPH_LOCK_SHARED;
10392 else if (F_WRLCK == fl->l_type)
10393 lock_cmd = CEPH_LOCK_EXCL;
10394 else
10395 lock_cmd = CEPH_LOCK_UNLOCK;
10396
10397 ceph_filelock filelock;
10398 filelock.start = fl->l_start;
10399 filelock.length = fl->l_len;
10400 filelock.client = 0;
10401 // see comment in _do_filelock()
10402 filelock.owner = owner | (1ULL << 63);
10403 filelock.pid = fl->l_pid;
10404 filelock.type = lock_cmd;
10405
10406 if (filelock.type == CEPH_LOCK_UNLOCK) {
10407 list<ceph_filelock> activated_locks;
10408 lock_state->remove_lock(filelock, activated_locks);
10409 } else {
10410 bool r = lock_state->add_lock(filelock, false, false, NULL);
10411 ceph_assert(r);
10412 }
10413 }
10414
10415 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10416 {
10417 Inode *in = fh->inode.get();
10418 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10419 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10420 return ret;
10421 }
10422
10423 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10424 {
10425 Inode *in = fh->inode.get();
10426 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10427 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10428 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10429 return ret;
10430 }
10431
10432 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10433 {
10434 Inode *in = fh->inode.get();
10435 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10436
10437 int sleep = !(cmd & LOCK_NB);
10438 cmd &= ~LOCK_NB;
10439
10440 int type;
10441 switch (cmd) {
10442 case LOCK_SH:
10443 type = F_RDLCK;
10444 break;
10445 case LOCK_EX:
10446 type = F_WRLCK;
10447 break;
10448 case LOCK_UN:
10449 type = F_UNLCK;
10450 break;
10451 default:
10452 return -EINVAL;
10453 }
10454
10455 struct flock fl;
10456 memset(&fl, 0, sizeof(fl));
10457 fl.l_type = type;
10458 fl.l_whence = SEEK_SET;
10459
10460 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10461 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10462 return ret;
10463 }
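
// Illustrative mapping for the flock() emulation above: a hypothetical
// call like _flock(fh, LOCK_EX | LOCK_NB, owner) becomes
//
//   sleep = 0;               // LOCK_NB -> don't block in the MDS
//   fl.l_type = F_WRLCK;     // LOCK_EX -> exclusive
//   fl.l_whence = SEEK_SET;  // start = len = 0 covers the whole file
//
// and is sent as a CEPH_LOCK_FLOCK CEPH_MDS_OP_SETFILELOCK request.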
10464
10465 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10466 {
10467 /* Since the only thing this does is wrap a call to statfs, and
10468 statfs takes a lock, it doesn't seem we have a need to split it
10469 out. */
10470 return statfs(0, stbuf, perms);
10471 }
10472
10473 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
10474 {
10475 if (!args)
10476 return;
10477 std::lock_guard l(client_lock);
10478 ldout(cct, 10) << __func__ << " cb " << args->handle
10479 << " invalidate_ino_cb " << args->ino_cb
10480 << " invalidate_dentry_cb " << args->dentry_cb
10481 << " switch_interrupt_cb " << args->switch_intr_cb
10482 << " remount_cb " << args->remount_cb
10483 << dendl;
10484 callback_handle = args->handle;
10485 if (args->ino_cb) {
10486 ino_invalidate_cb = args->ino_cb;
10487 async_ino_invalidator.start();
10488 }
10489 if (args->dentry_cb) {
10490 dentry_invalidate_cb = args->dentry_cb;
10491 async_dentry_invalidator.start();
10492 }
10493 if (args->switch_intr_cb) {
10494 switch_interrupt_cb = args->switch_intr_cb;
10495 interrupt_finisher.start();
10496 }
10497 if (args->remount_cb) {
10498 remount_cb = args->remount_cb;
10499 remount_finisher.start();
10500 }
10501 if (args->ino_release_cb) {
10502 ino_release_cb = args->ino_release_cb;
10503 async_ino_releasor.start();
10504 }
10505 if (args->umask_cb)
10506 umask_cb = args->umask_cb;
10507 }
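
// Explanatory note: each callback registered above gets a dedicated
// Finisher thread started alongside it, so invalidations, interrupts and
// remounts can run without holding client_lock. A hypothetical
// registration from a libcephfs-style consumer:
//
//   struct ceph_client_callback_args args = {};
//   args.handle = myctx;               // opaque pointer handed back to us
//   args.ino_cb = my_invalidate_ino;   // hypothetical user callback
//   client->ll_register_callbacks(&args);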
10508
10509 int Client::test_dentry_handling(bool can_invalidate)
10510 {
10511 int r = 0;
10512
10513 can_invalidate_dentries = can_invalidate;
10514
10515 if (can_invalidate_dentries) {
10516 ceph_assert(dentry_invalidate_cb);
10517 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10518 r = 0;
10519 } else {
10520 ceph_assert(remount_cb);
10521 ldout(cct, 1) << "using remount_cb" << dendl;
10522 r = _do_remount(false);
10523 }
10524
10525 return r;
10526 }
10527
10528 int Client::_sync_fs()
10529 {
10530 ldout(cct, 10) << __func__ << dendl;
10531
10532 // flush file data
10533 std::unique_ptr<C_SaferCond> cond = nullptr;
10534 if (cct->_conf->client_oc) {
10535 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10536 objectcacher->flush_all(cond.get());
10537 }
10538
10539 // flush caps
10540 flush_caps_sync();
10541 ceph_tid_t flush_tid = last_flush_tid;
10542
10543 // wait for unsafe mds requests
10544 wait_unsafe_requests();
10545
10546 wait_sync_caps(flush_tid);
10547
10548 if (nullptr != cond) {
10549 client_lock.unlock();
10550 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10551 cond->wait();
10552 ldout(cct, 15) << __func__ << " flush finished" << dendl;
10553 client_lock.lock();
10554 }
10555
10556 return 0;
10557 }
10558
10559 int Client::sync_fs()
10560 {
10561 std::lock_guard l(client_lock);
10562
10563 if (unmounting)
10564 return -ENOTCONN;
10565
10566 return _sync_fs();
10567 }
10568
10569 int64_t Client::drop_caches()
10570 {
10571 std::lock_guard l(client_lock);
10572 return objectcacher->release_all();
10573 }
10574
10575 int Client::_lazyio(Fh *fh, int enable)
10576 {
10577 Inode *in = fh->inode.get();
10578 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10579
10580 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10581 return 0;
10582
10583 int orig_mode = fh->mode;
10584 if (enable) {
10585 fh->mode |= CEPH_FILE_MODE_LAZY;
10586 in->get_open_ref(fh->mode);
10587 in->put_open_ref(orig_mode);
10588 check_caps(in, CHECK_CAPS_NODELAY);
10589 } else {
10590 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10591 in->get_open_ref(fh->mode);
10592 in->put_open_ref(orig_mode);
10593 check_caps(in, 0);
10594 }
10595
10596 return 0;
10597 }
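
// Explanatory note: toggling CEPH_FILE_MODE_LAZY swaps the inode's open
// ref from the old mode to the new one before check_caps(), so the MDS
// sees a consistent wanted-caps set. A hypothetical round trip:
//
//   _lazyio(fh, 1);  // relaxed coherency; use lazyio_propagate/synchronize
//   _lazyio(fh, 0);  // back to fully coherent caching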
10598
10599 int Client::lazyio(int fd, int enable)
10600 {
10601 std::lock_guard l(client_lock);
10602 Fh *f = get_filehandle(fd);
10603 if (!f)
10604 return -EBADF;
10605
10606 return _lazyio(f, enable);
10607 }
10608
10609 int Client::ll_lazyio(Fh *fh, int enable)
10610 {
10611 std::lock_guard lock(client_lock);
10612 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10613 tout(cct) << __func__ << std::endl;
10614
10615 return _lazyio(fh, enable);
10616 }
10617
10618 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10619 {
10620 std::lock_guard l(client_lock);
10621 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10622 << ", " << offset << ", " << count << ")" << dendl;
10623
10624 Fh *f = get_filehandle(fd);
10625 if (!f)
10626 return -EBADF;
10627
10628 // for now
10629 _fsync(f, true);
10630
10631 return 0;
10632 }
10633
10634 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10635 {
10636 std::lock_guard l(client_lock);
10637 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10638 << ", " << offset << ", " << count << ")" << dendl;
10639
10640 Fh *f = get_filehandle(fd);
10641 if (!f)
10642 return -EBADF;
10643 Inode *in = f->inode.get();
10644
10645 _fsync(f, true);
10646 if (_release(in)) {
10647 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10648 if (r < 0)
10649 return r;
10650 }
10651 return 0;
10652 }
10653
10654
10655 // =============================
10656 // snaps
10657
10658 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10659 {
10660 std::lock_guard l(client_lock);
10661
10662 if (unmounting)
10663 return -ENOTCONN;
10664
10665 filepath path(relpath);
10666 InodeRef in;
10667 int r = path_walk(path, &in, perm);
10668 if (r < 0)
10669 return r;
10670 if (cct->_conf->client_permissions) {
10671 r = may_create(in.get(), perm);
10672 if (r < 0)
10673 return r;
10674 }
10675 Inode *snapdir = open_snapdir(in.get());
10676 return _mkdir(snapdir, name, 0, perm);
10677 }
10678
10679 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10680 {
10681 std::lock_guard l(client_lock);
10682
10683 if (unmounting)
10684 return -ENOTCONN;
10685
10686 filepath path(relpath);
10687 InodeRef in;
10688 int r = path_walk(path, &in, perms);
10689 if (r < 0)
10690 return r;
10691 if (cct->_conf->client_permissions) {
10692 r = may_delete(in.get(), NULL, perms);
10693 if (r < 0)
10694 return r;
10695 }
10696 Inode *snapdir = open_snapdir(in.get());
10697 return _rmdir(snapdir, name, perms);
10698 }
10699
10700 // =============================
10701 // expose caps
10702
10703 int Client::get_caps_issued(int fd) {
10704
10705 std::lock_guard lock(client_lock);
10706
10707 if (unmounting)
10708 return -ENOTCONN;
10709
10710 Fh *f = get_filehandle(fd);
10711 if (!f)
10712 return -EBADF;
10713
10714 return f->inode->caps_issued();
10715 }
10716
10717 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10718 {
10719 std::lock_guard lock(client_lock);
10720
10721 if (unmounting)
10722 return -ENOTCONN;
10723
10724 filepath p(path);
10725 InodeRef in;
10726 int r = path_walk(p, &in, perms, true);
10727 if (r < 0)
10728 return r;
10729 return in->caps_issued();
10730 }
10731
10732 // =========================================
10733 // low level
10734
10735 Inode *Client::open_snapdir(Inode *diri)
10736 {
10737 Inode *in;
10738 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10739 if (!inode_map.count(vino)) {
10740 in = new Inode(this, vino, &diri->layout);
10741
10742 in->ino = diri->ino;
10743 in->snapid = CEPH_SNAPDIR;
10744 in->mode = diri->mode;
10745 in->uid = diri->uid;
10746 in->gid = diri->gid;
10747 in->nlink = 1;
10748 in->mtime = diri->mtime;
10749 in->ctime = diri->ctime;
10750 in->btime = diri->btime;
10751 in->size = diri->size;
10752 in->change_attr = diri->change_attr;
10753
10754 in->dirfragtree.clear();
10755 in->snapdir_parent = diri;
10756 diri->flags |= I_SNAPDIR_OPEN;
10757 inode_map[vino] = in;
10758 if (use_faked_inos())
10759 _assign_faked_ino(in);
10760 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10761 } else {
10762 in = inode_map[vino];
10763 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10764 }
10765 return in;
10766 }
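
// Explanatory sketch: the synthetic ".snap" directory shares the parent's
// ino but uses snapid CEPH_SNAPDIR, so it occupies its own vinodeno_t slot
// in inode_map, e.g. for a directory with ino 0x100:
//
//   vinodeno_t vino(0x100, CEPH_SNAPDIR);  // distinct from (0x100, NOSNAP)
//
// Its attributes are cloned from the live directory; no MDS round trip is
// needed to open it.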
10767
10768 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10769 Inode **out, const UserPerm& perms)
10770 {
10771 std::lock_guard lock(client_lock);
10772 vinodeno_t vparent = _get_vino(parent);
10773 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10774 tout(cct) << __func__ << std::endl;
10775 tout(cct) << name << std::endl;
10776
10777 if (unmounting)
10778 return -ENOTCONN;
10779
10780 int r = 0;
10781 if (!fuse_default_permissions) {
10782 if (strcmp(name, ".") && strcmp(name, "..")) {
10783 r = may_lookup(parent, perms);
10784 if (r < 0)
10785 return r;
10786 }
10787 }
10788
10789 string dname(name);
10790 InodeRef in;
10791
10792 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10793 if (r < 0) {
10794 attr->st_ino = 0;
10795 goto out;
10796 }
10797
10798 ceph_assert(in);
10799 fill_stat(in, attr);
10800 _ll_get(in.get());
10801
10802 out:
10803 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10804 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10805 tout(cct) << attr->st_ino << std::endl;
10806 *out = in.get();
10807 return r;
10808 }
10809
10810 int Client::ll_lookup_inode(
10811 struct inodeno_t ino,
10812 const UserPerm& perms,
10813 Inode **inode)
10814 {
10815 ceph_assert(inode != NULL);
10816 std::lock_guard lock(client_lock);
10817 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10818
10819 if (unmounting)
10820 return -ENOTCONN;
10821
10822 // Step 1: get the inode and a reference in *inode
10823 int r = _lookup_ino(ino, perms, inode);
10824 if (r)
10825 return r;
10826
10827 ceph_assert(*inode != NULL);
10828
10829 if (!(*inode)->dentries.empty()) {
10830 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10831 return 0;
10832 }
10833
10834 if ((*inode)->is_root()) {
10835 ldout(cct, 8) << "ino is root, no parent" << dendl;
10836 return 0;
10837 }
10838
10839 // Step 2: request the parent inode, so that we can look up the name
10840 Inode *parent;
10841 r = _lookup_parent(*inode, perms, &parent);
10842 if (r) {
10843 _ll_forget(*inode, 1);
10844 return r;
10845 }
10846
10847 ceph_assert(parent != NULL);
10848
10849 // Step 3: finally, get the name (dentry) of the requested inode
10850 r = _lookup_name(*inode, parent, perms);
10851 if (r) {
10852 // Unexpected error
10853 _ll_forget(parent, 1);
10854 _ll_forget(*inode, 1);
10855 return r;
10856 }
10857
10858 _ll_forget(parent, 1);
10859 return 0;
10860 }
10861
10862 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10863 struct ceph_statx *stx, unsigned want, unsigned flags,
10864 const UserPerm& perms)
10865 {
10866 std::lock_guard lock(client_lock);
10867 vinodeno_t vparent = _get_vino(parent);
10868 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10869 tout(cct) << "ll_lookupx" << std::endl;
10870 tout(cct) << name << std::endl;
10871
10872 if (unmounting)
10873 return -ENOTCONN;
10874
10875 int r = 0;
10876 if (!fuse_default_permissions) {
10877 r = may_lookup(parent, perms);
10878 if (r < 0)
10879 return r;
10880 }
10881
10882 string dname(name);
10883 InodeRef in;
10884
10885 unsigned mask = statx_to_mask(flags, want);
10886 r = _lookup(parent, dname, mask, &in, perms);
10887 if (r < 0) {
10888 stx->stx_ino = 0;
10889 stx->stx_mask = 0;
10890 } else {
10891 ceph_assert(in);
10892 fill_statx(in, mask, stx);
10893 _ll_get(in.get());
10894 }
10895
10896 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10897 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10898 tout(cct) << stx->stx_ino << std::endl;
10899 *out = in.get();
10900 return r;
10901 }
10902
10903 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10904 unsigned int want, unsigned int flags, const UserPerm& perms)
10905 {
10906 std::lock_guard lock(client_lock);
10907
10908 if (unmounting)
10909 return -ENOTCONN;
10910
10911 filepath fp(name, 0);
10912 InodeRef in;
10913 int rc;
10914 unsigned mask = statx_to_mask(flags, want);
10915
10916 ldout(cct, 3) << __func__ << " " << name << dendl;
10917 tout(cct) << __func__ << std::endl;
10918 tout(cct) << name << std::endl;
10919
10920 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10921 if (rc < 0) {
10922 /* zero out mask, just in case... */
10923 stx->stx_mask = 0;
10924 stx->stx_ino = 0;
10925 *out = NULL;
10926 return rc;
10927 } else {
10928 ceph_assert(in);
10929 fill_statx(in, mask, stx);
10930 _ll_get(in.get());
10931 *out = in.get();
10932 return 0;
10933 }
10934 }
10935
10936 void Client::_ll_get(Inode *in)
10937 {
10938 if (in->ll_ref == 0) {
10939 in->get();
10940 if (in->is_dir() && !in->dentries.empty()) {
10941 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10942 in->get_first_parent()->get(); // pin dentry
10943 }
10944 if (in->snapid != CEPH_NOSNAP)
10945 ll_snap_ref[in->snapid]++;
10946 }
10947 in->ll_get();
10948 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10949 }
10950
10951 int Client::_ll_put(Inode *in, uint64_t num)
10952 {
10953 in->ll_put(num);
10954 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10955 if (in->ll_ref == 0) {
10956 if (in->is_dir() && !in->dentries.empty()) {
10957 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10958 in->get_first_parent()->put(); // unpin dentry
10959 }
10960 if (in->snapid != CEPH_NOSNAP) {
10961 auto p = ll_snap_ref.find(in->snapid);
10962 ceph_assert(p != ll_snap_ref.end());
10963 ceph_assert(p->second > 0);
10964 if (--p->second == 0)
10965 ll_snap_ref.erase(p);
10966 }
10967 put_inode(in);
10968 return 0;
10969 } else {
10970 return in->ll_ref;
10971 }
10972 }
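
// Hedged sketch of the low-level refcount pairing above: every Inode*
// handed out through the ll_* interface takes one _ll_get(), and forgets
// return them, possibly in bulk:
//
//   _ll_get(in);     // 0 -> 1 also pins the parent dentry and snap ref
//   ...
//   _ll_put(in, 1);  // back to 0: unpin dentry, drop snap ref, and
//                    // put_inode() may free the inode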
10973
10974 void Client::_ll_drop_pins()
10975 {
10976 ldout(cct, 10) << __func__ << dendl;
10977 std::set<InodeRef> to_be_put; // entries are destructed one by one as this set goes out of scope
10978 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10979 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10980 it != inode_map.end();
10981 it = next) {
10982 Inode *in = it->second;
10983 next = it;
10984 ++next;
10985 if (in->ll_ref){
10986 to_be_put.insert(in);
10987 _ll_put(in, in->ll_ref);
10988 }
10989 }
10990 }
10991
10992 bool Client::_ll_forget(Inode *in, uint64_t count)
10993 {
10994 inodeno_t ino = in->ino;
10995
10996 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10997 tout(cct) << __func__ << std::endl;
10998 tout(cct) << ino.val << std::endl;
10999 tout(cct) << count << std::endl;
11000
11001 // Ignore forget if we're no longer mounted
11002 if (unmounting)
11003 return true;
11004
11005 if (ino == 1) return true; // ignore forget on root.
11006
11007 bool last = false;
11008 if (in->ll_ref < count) {
11009 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11010 << ", which only has ll_ref=" << in->ll_ref << dendl;
11011 _ll_put(in, in->ll_ref);
11012 last = true;
11013 } else {
11014 if (_ll_put(in, count) == 0)
11015 last = true;
11016 }
11017
11018 return last;
11019 }
11020
11021 bool Client::ll_forget(Inode *in, uint64_t count)
11022 {
11023 std::lock_guard lock(client_lock);
11024 return _ll_forget(in, count);
11025 }
11026
11027 bool Client::ll_put(Inode *in)
11028 {
11029 /* ll_forget already takes the lock */
11030 return ll_forget(in, 1);
11031 }
11032
11033 int Client::ll_get_snap_ref(snapid_t snap)
11034 {
11035 std::lock_guard lock(client_lock);
11036 auto p = ll_snap_ref.find(snap);
11037 if (p != ll_snap_ref.end())
11038 return p->second;
11039 return 0;
11040 }
11041
11042 snapid_t Client::ll_get_snapid(Inode *in)
11043 {
11044 std::lock_guard lock(client_lock);
11045 return in->snapid;
11046 }
11047
11048 Inode *Client::ll_get_inode(ino_t ino)
11049 {
11050 std::lock_guard lock(client_lock);
11051
11052 if (unmounting)
11053 return NULL;
11054
11055 vinodeno_t vino = _map_faked_ino(ino);
11056 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11057 if (p == inode_map.end())
11058 return NULL;
11059 Inode *in = p->second;
11060 _ll_get(in);
11061 return in;
11062 }
11063
11064 Inode *Client::ll_get_inode(vinodeno_t vino)
11065 {
11066 std::lock_guard lock(client_lock);
11067
11068 if (unmounting)
11069 return NULL;
11070
11071 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11072 if (p == inode_map.end())
11073 return NULL;
11074 Inode *in = p->second;
11075 _ll_get(in);
11076 return in;
11077 }
11078
11079 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11080 {
11081 vinodeno_t vino = _get_vino(in);
11082
11083 ldout(cct, 8) << __func__ << " " << vino << dendl;
11084 tout(cct) << __func__ << std::endl;
11085 tout(cct) << vino.ino.val << std::endl;
11086
11087 if (vino.snapid < CEPH_NOSNAP)
11088 return 0;
11089 else
11090 return _getattr(in, caps, perms);
11091 }
11092
11093 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11094 {
11095 std::lock_guard lock(client_lock);
11096
11097 if (unmounting)
11098 return -ENOTCONN;
11099
11100 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11101
11102 if (res == 0)
11103 fill_stat(in, attr);
11104 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11105 return res;
11106 }
11107
11108 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11109 unsigned int flags, const UserPerm& perms)
11110 {
11111 std::lock_guard lock(client_lock);
11112
11113 if (unmounting)
11114 return -ENOTCONN;
11115
11116 int res = 0;
11117 unsigned mask = statx_to_mask(flags, want);
11118
11119 if (mask && !in->caps_issued_mask(mask, true))
11120 res = _ll_getattr(in, mask, perms);
11121
11122 if (res == 0)
11123 fill_statx(in, mask, stx);
11124 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11125 return res;
11126 }
11127
11128 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11129 const UserPerm& perms, InodeRef *inp)
11130 {
11131 vinodeno_t vino = _get_vino(in);
11132
11133 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11134 << dendl;
11135 tout(cct) << __func__ << std::endl;
11136 tout(cct) << vino.ino.val << std::endl;
11137 tout(cct) << stx->stx_mode << std::endl;
11138 tout(cct) << stx->stx_uid << std::endl;
11139 tout(cct) << stx->stx_gid << std::endl;
11140 tout(cct) << stx->stx_size << std::endl;
11141 tout(cct) << stx->stx_mtime << std::endl;
11142 tout(cct) << stx->stx_atime << std::endl;
11143 tout(cct) << stx->stx_btime << std::endl;
11144 tout(cct) << mask << std::endl;
11145
11146 if (!fuse_default_permissions) {
11147 int res = may_setattr(in, stx, mask, perms);
11148 if (res < 0)
11149 return res;
11150 }
11151
11152 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11153
11154 return __setattrx(in, stx, mask, perms, inp);
11155 }
11156
11157 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11158 const UserPerm& perms)
11159 {
11160 std::lock_guard lock(client_lock);
11161
11162 if (unmounting)
11163 return -ENOTCONN;
11164
11165 InodeRef target(in);
11166 int res = _ll_setattrx(in, stx, mask, perms, &target);
11167 if (res == 0) {
11168 ceph_assert(in == target.get());
11169 fill_statx(in, in->caps_issued(), stx);
11170 }
11171
11172 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11173 return res;
11174 }
11175
11176 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11177 const UserPerm& perms)
11178 {
11179 struct ceph_statx stx;
11180 stat_to_statx(attr, &stx);
11181
11182 std::lock_guard lock(client_lock);
11183
11184 if (unmounting)
11185 return -ENOTCONN;
11186
11187 InodeRef target(in);
11188 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11189 if (res == 0) {
11190 ceph_assert(in == target.get());
11191 fill_stat(in, attr);
11192 }
11193
11194 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11195 return res;
11196 }
11197
11198
11199 // ----------
11200 // xattrs
11201
11202 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11203 const UserPerm& perms)
11204 {
11205 std::lock_guard lock(client_lock);
11206
11207 if (unmounting)
11208 return -ENOTCONN;
11209
11210 InodeRef in;
11211 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11212 if (r < 0)
11213 return r;
11214 return _getxattr(in, name, value, size, perms);
11215 }
11216
11217 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11218 const UserPerm& perms)
11219 {
11220 std::lock_guard lock(client_lock);
11221
11222 if (unmounting)
11223 return -ENOTCONN;
11224
11225 InodeRef in;
11226 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11227 if (r < 0)
11228 return r;
11229 return _getxattr(in, name, value, size, perms);
11230 }
11231
11232 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11233 const UserPerm& perms)
11234 {
11235 std::lock_guard lock(client_lock);
11236
11237 if (unmounting)
11238 return -ENOTCONN;
11239
11240 Fh *f = get_filehandle(fd);
11241 if (!f)
11242 return -EBADF;
11243 return _getxattr(f->inode, name, value, size, perms);
11244 }
11245
11246 int Client::listxattr(const char *path, char *list, size_t size,
11247 const UserPerm& perms)
11248 {
11249 std::lock_guard lock(client_lock);
11250
11251 if (unmounting)
11252 return -ENOTCONN;
11253
11254 InodeRef in;
11255 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11256 if (r < 0)
11257 return r;
11258 return Client::_listxattr(in.get(), list, size, perms);
11259 }
11260
11261 int Client::llistxattr(const char *path, char *list, size_t size,
11262 const UserPerm& perms)
11263 {
11264 std::lock_guard lock(client_lock);
11265
11266 if (unmounting)
11267 return -ENOTCONN;
11268
11269 InodeRef in;
11270 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11271 if (r < 0)
11272 return r;
11273 return Client::_listxattr(in.get(), list, size, perms);
11274 }
11275
11276 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11277 {
11278 std::lock_guard lock(client_lock);
11279
11280 if (unmounting)
11281 return -ENOTCONN;
11282
11283 Fh *f = get_filehandle(fd);
11284 if (!f)
11285 return -EBADF;
11286 return Client::_listxattr(f->inode.get(), list, size, perms);
11287 }
11288
11289 int Client::removexattr(const char *path, const char *name,
11290 const UserPerm& perms)
11291 {
11292 std::lock_guard lock(client_lock);
11293
11294 if (unmounting)
11295 return -ENOTCONN;
11296
11297 InodeRef in;
11298 int r = Client::path_walk(path, &in, perms, true);
11299 if (r < 0)
11300 return r;
11301 return _removexattr(in, name, perms);
11302 }
11303
11304 int Client::lremovexattr(const char *path, const char *name,
11305 const UserPerm& perms)
11306 {
11307 std::lock_guard lock(client_lock);
11308
11309 if (unmounting)
11310 return -ENOTCONN;
11311
11312 InodeRef in;
11313 int r = Client::path_walk(path, &in, perms, false);
11314 if (r < 0)
11315 return r;
11316 return _removexattr(in, name, perms);
11317 }
11318
11319 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11320 {
11321 std::lock_guard lock(client_lock);
11322
11323 if (unmounting)
11324 return -ENOTCONN;
11325
11326 Fh *f = get_filehandle(fd);
11327 if (!f)
11328 return -EBADF;
11329 return _removexattr(f->inode, name, perms);
11330 }
11331
11332 int Client::setxattr(const char *path, const char *name, const void *value,
11333 size_t size, int flags, const UserPerm& perms)
11334 {
11335 _setxattr_maybe_wait_for_osdmap(name, value, size);
11336
11337 std::lock_guard lock(client_lock);
11338
11339 if (unmounting)
11340 return -ENOTCONN;
11341
11342 InodeRef in;
11343 int r = Client::path_walk(path, &in, perms, true);
11344 if (r < 0)
11345 return r;
11346 return _setxattr(in, name, value, size, flags, perms);
11347 }
11348
11349 int Client::lsetxattr(const char *path, const char *name, const void *value,
11350 size_t size, int flags, const UserPerm& perms)
11351 {
11352 _setxattr_maybe_wait_for_osdmap(name, value, size);
11353
11354 std::lock_guard lock(client_lock);
11355
11356 if (unmounting)
11357 return -ENOTCONN;
11358
11359 InodeRef in;
11360 int r = Client::path_walk(path, &in, perms, false);
11361 if (r < 0)
11362 return r;
11363 return _setxattr(in, name, value, size, flags, perms);
11364 }
11365
11366 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11367 int flags, const UserPerm& perms)
11368 {
11369 _setxattr_maybe_wait_for_osdmap(name, value, size);
11370
11371 std::lock_guard lock(client_lock);
11372
11373 if (unmounting)
11374 return -ENOTCONN;
11375
11376 Fh *f = get_filehandle(fd);
11377 if (!f)
11378 return -EBADF;
11379 return _setxattr(f->inode, name, value, size, flags, perms);
11380 }
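// setxattr honours the standard XATTR_CREATE / XATTR_REPLACE flags, which
// _do_setxattr() below maps onto CEPH_XATTR_CREATE / CEPH_XATTR_REPLACE in
// the MDS request. Sketch (illustrative; these are standard setxattr(2)
// semantics, with the actual checks performed on the MDS):
//
//   const char v[] = "hello";
//   // expected to fail with -EEXIST if "user.comment" already exists:
//   client->setxattr("/some/file", "user.comment", v, sizeof(v) - 1,
//                    XATTR_CREATE, perms);
//   // expected to fail with -ENODATA if it does not exist:
//   client->setxattr("/some/file", "user.comment", v, sizeof(v) - 1,
//                    XATTR_REPLACE, perms);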
11381
11382 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11383 const UserPerm& perms)
11384 {
11385 int r;
11386
11387 const VXattr *vxattr = _match_vxattr(in, name);
11388 if (vxattr) {
11390
11391 // Do a forced getattr to get the latest quota before returning
11392 // a value to userspace.
11393 int flags = 0;
11394 if (vxattr->flags & VXATTR_RSTAT) {
11395 flags |= CEPH_STAT_RSTAT;
11396 }
11397 r = _getattr(in, flags, perms, true);
11398 if (r != 0) {
11399 // Error from getattr!
11400 return r;
11401 }
11402
11403 // invoke the getxattr callback (a pointer-to-member), unless exists_cb says the vxattr is absent
11404 char buf[256];
11405 if (!vxattr->exists_cb || (this->*(vxattr->exists_cb))(in)) {
11406 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11407 } else {
11408 r = -ENODATA;
11409 }
11410
11411 if (size != 0) {
11412 if (r > (int)size || r >= (int)sizeof(buf)) {
11413 r = -ERANGE; // caller's buffer, or our local buf, is too small for the value
11414 } else if (r > 0) {
11415 memcpy(value, buf, r);
11416 }
11417 }
11418 goto out;
11419 }
11420
11421 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11422 r = -EOPNOTSUPP;
11423 goto out;
11424 }
11425
11426 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11427 if (r == 0) {
11428 string n(name);
11429 r = -ENODATA;
11430 if (in->xattrs.count(n)) {
11431 r = in->xattrs[n].length();
11432 if (r > 0 && size != 0) {
11433 if (size >= (unsigned)r)
11434 memcpy(value, in->xattrs[n].c_str(), r);
11435 else
11436 r = -ERANGE;
11437 }
11438 }
11439 }
11440 out:
11441 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11442 return r;
11443 }
11444
11445 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11446 const UserPerm& perms)
11447 {
11448 if (cct->_conf->client_permissions) {
11449 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11450 if (r < 0)
11451 return r;
11452 }
11453 return _getxattr(in.get(), name, value, size, perms);
11454 }
11455
11456 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11457 size_t size, const UserPerm& perms)
11458 {
11459 std::lock_guard lock(client_lock);
11460
11461 if (unmounting)
11462 return -ENOTCONN;
11463
11464 vinodeno_t vino = _get_vino(in);
11465
11466 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11467 tout(cct) << __func__ << std::endl;
11468 tout(cct) << vino.ino.val << std::endl;
11469 tout(cct) << name << std::endl;
11470
11471 if (!fuse_default_permissions) {
11472 int r = xattr_permission(in, name, MAY_READ, perms);
11473 if (r < 0)
11474 return r;
11475 }
11476
11477 return _getxattr(in, name, value, size, perms);
11478 }
11479
11480 int Client::_listxattr(Inode *in, char *name, size_t size,
11481 const UserPerm& perms)
11482 {
11483 bool len_only = (size == 0);
11484 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11485 if (r != 0) {
11486 goto out;
11487 }
11488
11489 r = 0;
11490 for (const auto& p : in->xattrs) {
11491 size_t this_len = p.first.length() + 1;
11492 r += this_len;
11493 if (len_only)
11494 continue;
11495
11496 if (this_len > size) {
11497 r = -ERANGE;
11498 goto out;
11499 }
11500
11501 memcpy(name, p.first.c_str(), this_len);
11502 name += this_len;
11503 size -= this_len;
11504 }
11505 out:
11506 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
11507 return r;
11508 }
11509
11510 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11511 const UserPerm& perms)
11512 {
11513 std::lock_guard lock(client_lock);
11514
11515 if (unmounting)
11516 return -ENOTCONN;
11517
11518 vinodeno_t vino = _get_vino(in);
11519
11520 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11521 tout(cct) << __func__ << std::endl;
11522 tout(cct) << vino.ino.val << std::endl;
11523 tout(cct) << size << std::endl;
11524
11525 return _listxattr(in, names, size, perms);
11526 }
11527
11528 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11529 size_t size, int flags, const UserPerm& perms)
11530 {
11531
11532 int xattr_flags = 0;
11533 if (!value)
11534 xattr_flags |= CEPH_XATTR_REMOVE;
11535 if (flags & XATTR_CREATE)
11536 xattr_flags |= CEPH_XATTR_CREATE;
11537 if (flags & XATTR_REPLACE)
11538 xattr_flags |= CEPH_XATTR_REPLACE;
11539
11540 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11541 filepath path;
11542 in->make_nosnap_relative_path(path);
11543 req->set_filepath(path);
11544 req->set_string2(name);
11545 req->set_inode(in);
11546 req->head.args.setxattr.flags = xattr_flags;
11547
11548 bufferlist bl;
11549 ceph_assert(value || size == 0);
11550 if (size > 0) bl.append((const char*)value, size);
11551 req->set_data(bl);
11552
11553 int res = make_request(req, perms);
11554
11555 trim_cache();
11556 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11557 res << dendl;
11558 return res;
11559 }
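// Note that a NULL value maps to CEPH_XATTR_REMOVE above. _setxattr()
// exploits this for the POSIX ACL xattrs: when an access ACL is exactly
// representable by the file mode, posix_acl_equiv_mode() returns 0, the
// value is collapsed to NULL, and the stored xattr is removed instead of
// rewritten.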
11560
11561 int Client::_setxattr(Inode *in, const char *name, const void *value,
11562 size_t size, int flags, const UserPerm& perms)
11563 {
11564 if (in->snapid != CEPH_NOSNAP) {
11565 return -EROFS;
11566 }
11567
11568 bool posix_acl_xattr = false;
11569 if (acl_type == POSIX_ACL)
11570 posix_acl_xattr = !strncmp(name, "system.", 7);
11571
11572 if (strncmp(name, "user.", 5) &&
11573 strncmp(name, "security.", 9) &&
11574 strncmp(name, "trusted.", 8) &&
11575 strncmp(name, "ceph.", 5) &&
11576 !posix_acl_xattr)
11577 return -EOPNOTSUPP;
11578
11579 bool check_realm = false;
11580
11581 if (posix_acl_xattr) {
11582 if (!strcmp(name, ACL_EA_ACCESS)) {
11583 mode_t new_mode = in->mode;
11584 if (value) {
11585 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11586 if (ret < 0)
11587 return ret;
11588 if (ret == 0) {
11589 value = NULL;
11590 size = 0;
11591 }
11592 if (new_mode != in->mode) {
11593 struct ceph_statx stx;
11594 stx.stx_mode = new_mode;
11595 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11596 if (ret < 0)
11597 return ret;
11598 }
11599 }
11600 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11601 if (value) {
11602 if (!S_ISDIR(in->mode))
11603 return -EACCES;
11604 int ret = posix_acl_check(value, size);
11605 if (ret < 0)
11606 return -EINVAL;
11607 if (ret == 0) {
11608 value = NULL;
11609 size = 0;
11610 }
11611 }
11612 } else {
11613 return -EOPNOTSUPP;
11614 }
11615 } else {
11616 const VXattr *vxattr = _match_vxattr(in, name);
11617 if (vxattr) {
11618 if (vxattr->readonly)
11619 return -EOPNOTSUPP;
11620 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11621 check_realm = true;
11622 }
11623 }
11624
11625 int ret = _do_setxattr(in, name, value, size, flags, perms);
11626 if (ret >= 0 && check_realm) {
11627 // check if snaprealm was created for quota inode
11628 if (in->quota.is_enable() &&
11629 !(in->snaprealm && in->snaprealm->ino == in->ino))
11630 ret = -EOPNOTSUPP;
11631 }
11632
11633 return ret;
11634 }
11635
11636 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11637 size_t size, int flags, const UserPerm& perms)
11638 {
11639 if (cct->_conf->client_permissions) {
11640 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11641 if (r < 0)
11642 return r;
11643 }
11644 return _setxattr(in.get(), name, value, size, flags, perms);
11645 }
11646
11647 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11648 {
11649 string tmp;
11650 if (name == "layout") {
11651 string::iterator begin = value.begin();
11652 string::iterator end = value.end();
11653 keys_and_values<string::iterator> p; // create instance of parser
11654 std::map<string, string> m; // map to receive results
11655 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11656 return -EINVAL;
11657 }
11658 if (begin != end)
11659 return -EINVAL;
11660 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11661 if (q->first == "pool") {
11662 tmp = q->second;
11663 break;
11664 }
11665 }
11666 } else if (name == "layout.pool") {
11667 tmp = value;
11668 }
11669
11670 if (tmp.length()) {
11671 int64_t pool;
11672 try {
11673 pool = boost::lexical_cast<unsigned>(tmp);
11674 if (!osdmap->have_pg_pool(pool))
11675 return -ENOENT;
11676 } catch (boost::bad_lexical_cast const&) {
11677 pool = osdmap->lookup_pg_pool_name(tmp);
11678 if (pool < 0) {
11679 return -ENOENT;
11680 }
11681 }
11682 }
11683
11684 return 0;
11685 }
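// The two name forms accepted above, with illustrative values:
//
//   "layout"      -> "stripe_unit=4194304 stripe_count=1 object_size=4194304
//                     pool=cephfs_data"  (key=value pairs; only "pool" is
//                                         validated against the osdmap here)
//   "layout.pool" -> "cephfs_data", or a numeric pool id such as "3"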
11686
11687 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11688 {
11689 // Setting a layout's data pool requires an osdmap epoch in the MetaRequest. There is
11690 // a race in which a newly created data pool is not yet known to the client or the MDS.
11691 // Fetch the latest osdmap so the MDS can quickly judge whether it needs a newer one.
11692 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11693 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11694 string rest(strstr(name, "layout"));
11695 string v((const char*)value, size);
11696 int r = objecter->with_osdmap([&](const OSDMap& o) {
11697 return _setxattr_check_data_pool(rest, v, &o);
11698 });
11699
11700 if (r == -ENOENT) {
11701 C_SaferCond ctx;
11702 objecter->wait_for_latest_osdmap(&ctx);
11703 ctx.wait();
11704 }
11705 }
11706 }
11707
11708 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11709 size_t size, int flags, const UserPerm& perms)
11710 {
11711 _setxattr_maybe_wait_for_osdmap(name, value, size);
11712
11713 std::lock_guard lock(client_lock);
11714
11715 if (unmounting)
11716 return -ENOTCONN;
11717
11718 vinodeno_t vino = _get_vino(in);
11719
11720 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11721 tout(cct) << __func__ << std::endl;
11722 tout(cct) << vino.ino.val << std::endl;
11723 tout(cct) << name << std::endl;
11724
11725 if (!fuse_default_permissions) {
11726 int r = xattr_permission(in, name, MAY_WRITE, perms);
11727 if (r < 0)
11728 return r;
11729 }
11730 return _setxattr(in, name, value, size, flags, perms);
11731 }
11732
11733 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11734 {
11735 if (in->snapid != CEPH_NOSNAP) {
11736 return -EROFS;
11737 }
11738
11739 // same xattr namespaces as are supported by the kernel client
11740 if (strncmp(name, "user.", 5) &&
11741 strncmp(name, "system.", 7) &&
11742 strncmp(name, "security.", 9) &&
11743 strncmp(name, "trusted.", 8) &&
11744 strncmp(name, "ceph.", 5))
11745 return -EOPNOTSUPP;
11746
11747 const VXattr *vxattr = _match_vxattr(in, name);
11748 if (vxattr && vxattr->readonly)
11749 return -EOPNOTSUPP;
11750
11751 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11752 filepath path;
11753 in->make_nosnap_relative_path(path);
11754 req->set_filepath(path);
11755 req->set_filepath2(name);
11756 req->set_inode(in);
11757
11758 int res = make_request(req, perms);
11759
11760 trim_cache();
11761 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11762 return res;
11763 }
11764
11765 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11766 {
11767 if (cct->_conf->client_permissions) {
11768 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11769 if (r < 0)
11770 return r;
11771 }
11772 return _removexattr(in.get(), name, perms);
11773 }
11774
11775 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11776 {
11777 std::lock_guard lock(client_lock);
11778
11779 if (unmounting)
11780 return -ENOTCONN;
11781
11782 vinodeno_t vino = _get_vino(in);
11783
11784 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11785 tout(cct) << "ll_removexattr" << std::endl;
11786 tout(cct) << vino.ino.val << std::endl;
11787 tout(cct) << name << std::endl;
11788
11789 if (!fuse_default_permissions) {
11790 int r = xattr_permission(in, name, MAY_WRITE, perms);
11791 if (r < 0)
11792 return r;
11793 }
11794
11795 return _removexattr(in, name, perms);
11796 }
11797
11798 bool Client::_vxattrcb_quota_exists(Inode *in)
11799 {
11800 return in->quota.is_enable() &&
11801 in->snaprealm && in->snaprealm->ino == in->ino;
11802 }
11803 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11804 {
11805 return snprintf(val, size,
11806 "max_bytes=%lld max_files=%lld",
11807 (long long int)in->quota.max_bytes,
11808 (long long int)in->quota.max_files);
11809 }
11810 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11811 {
11812 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11813 }
11814 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11815 {
11816 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11817 }
11818
11819 bool Client::_vxattrcb_layout_exists(Inode *in)
11820 {
11821 return in->layout != file_layout_t();
11822 }
11823 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11824 {
11825 int r = snprintf(val, size,
11826 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11827 (unsigned long long)in->layout.stripe_unit,
11828 (unsigned long long)in->layout.stripe_count,
11829 (unsigned long long)in->layout.object_size);
11830 objecter->with_osdmap([&](const OSDMap& o) {
11831 if (o.have_pg_pool(in->layout.pool_id))
11832 r += snprintf(val + r, size - r, "%s",
11833 o.get_pool_name(in->layout.pool_id).c_str());
11834 else
11835 r += snprintf(val + r, size - r, "%" PRIu64,
11836 (uint64_t)in->layout.pool_id);
11837 });
11838 if (in->layout.pool_ns.length())
11839 r += snprintf(val + r, size - r, " pool_namespace=%s",
11840 in->layout.pool_ns.c_str());
11841 return r;
11842 }
11843 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11844 {
11845 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11846 }
11847 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11848 {
11849 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11850 }
11851 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11852 {
11853 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11854 }
11855 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11856 {
11857 size_t r;
11858 objecter->with_osdmap([&](const OSDMap& o) {
11859 if (o.have_pg_pool(in->layout.pool_id))
11860 r = snprintf(val, size, "%s", o.get_pool_name(
11861 in->layout.pool_id).c_str());
11862 else
11863 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11864 });
11865 return r;
11866 }
11867 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11868 {
11869 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11870 }
11871 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11872 {
11873 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11874 }
11875 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11876 {
11877 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11878 }
11879 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11880 {
11881 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11882 }
11883 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11884 {
11885 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11886 }
11887 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11888 {
11889 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11890 }
11891 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11892 {
11893 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11894 }
11895 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11896 {
11897 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11898 }
11899 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11900 {
11901 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11902 (long)in->rstat.rctime.nsec());
11903 }
11904 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11905 {
11906 return in->dir_pin != -ENODATA;
11907 }
11908 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11909 {
11910 return snprintf(val, size, "%ld", (long)in->dir_pin);
11911 }
11912
11913 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11914 {
11915 return !in->snap_btime.is_zero();
11916 }
11917
11918 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11919 {
11920 return snprintf(val, size, "%llu.%09lu",
11921 (long long unsigned)in->snap_btime.sec(),
11922 (long unsigned)in->snap_btime.nsec());
11923 }
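// All of the _vxattrcb_* value callbacks above follow snprintf semantics:
// they write at most `size` bytes (NUL included) into `val` and return the
// length the full rendering would have, which lets _getxattr() report the
// required length (size == 0) or -ERANGE without a second callback call.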
11924
11925 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11926 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11927
11928 #define XATTR_NAME_CEPH(_type, _name) \
11929 { \
11930 name: CEPH_XATTR_NAME(_type, _name), \
11931 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11932 readonly: true, \
11933 exists_cb: NULL, \
11934 flags: 0, \
11935 }
11936 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11937 { \
11938 name: CEPH_XATTR_NAME(_type, _name), \
11939 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11940 readonly: true, \
11941 exists_cb: NULL, \
11942 flags: _flags, \
11943 }
11944 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11945 { \
11946 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11947 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11948 readonly: false, \
11949 exists_cb: &Client::_vxattrcb_layout_exists, \
11950 flags: 0, \
11951 }
11952 #define XATTR_QUOTA_FIELD(_type, _name) \
11953 { \
11954 name: CEPH_XATTR_NAME(_type, _name), \
11955 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11956 readonly: false, \
11957 exists_cb: &Client::_vxattrcb_quota_exists, \
11958 flags: 0, \
11959 }
11960
11961 const Client::VXattr Client::_dir_vxattrs[] = {
11962 {
11963 name: "ceph.dir.layout",
11964 getxattr_cb: &Client::_vxattrcb_layout,
11965 readonly: false,
11966 exists_cb: &Client::_vxattrcb_layout_exists,
11967 flags: 0,
11968 },
11969 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11970 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11971 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11972 XATTR_LAYOUT_FIELD(dir, layout, pool),
11973 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11974 XATTR_NAME_CEPH(dir, entries),
11975 XATTR_NAME_CEPH(dir, files),
11976 XATTR_NAME_CEPH(dir, subdirs),
11977 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11978 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11979 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11980 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11981 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
11982 {
11983 name: "ceph.quota",
11984 getxattr_cb: &Client::_vxattrcb_quota,
11985 readonly: false,
11986 exists_cb: &Client::_vxattrcb_quota_exists,
11987 flags: 0,
11988 },
11989 XATTR_QUOTA_FIELD(quota, max_bytes),
11990 XATTR_QUOTA_FIELD(quota, max_files),
11991 {
11992 name: "ceph.dir.pin",
11993 getxattr_cb: &Client::_vxattrcb_dir_pin,
11994 readonly: false,
11995 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11996 flags: 0,
11997 },
11998 {
11999 name: "ceph.snap.btime",
12000 getxattr_cb: &Client::_vxattrcb_snap_btime,
12001 readonly: true,
12002 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12003 flags: 0,
12004 },
12005 { name: "" } /* Required table terminator */
12006 };
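// Directory quotas are managed through the vxattrs in this table; from a
// mounted filesystem, e.g. (illustrative values and path):
//
//   setfattr -n ceph.quota.max_bytes -v 10000000000 /mnt/cephfs/somedir
//   setfattr -n ceph.quota.max_files -v 10000 /mnt/cephfs/somedir
//
// After a successful "ceph.quota*" write, _setxattr() additionally checks
// that a snaprealm exists for the quota inode (the check_realm path above).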
12007
12008 const Client::VXattr Client::_file_vxattrs[] = {
12009 {
12010 name: "ceph.file.layout",
12011 getxattr_cb: &Client::_vxattrcb_layout,
12012 readonly: false,
12013 exists_cb: &Client::_vxattrcb_layout_exists,
12014 flags: 0,
12015 },
12016 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
12017 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
12018 XATTR_LAYOUT_FIELD(file, layout, object_size),
12019 XATTR_LAYOUT_FIELD(file, layout, pool),
12020 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
12021 {
12022 name: "ceph.snap.btime",
12023 getxattr_cb: &Client::_vxattrcb_snap_btime,
12024 readonly: true,
12025 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12026 flags: 0,
12027 },
12028 { name: "" } /* Required table terminator */
12029 };
12030
12031 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12032 {
12033 if (in->is_dir())
12034 return _dir_vxattrs;
12035 else if (in->is_file())
12036 return _file_vxattrs;
12037 return NULL;
12038 }
12039
12040 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12041 {
12042 if (strncmp(name, "ceph.", 5) == 0) {
12043 const VXattr *vxattr = _get_vxattrs(in);
12044 if (vxattr) {
12045 while (!vxattr->name.empty()) {
12046 if (vxattr->name == name)
12047 return vxattr;
12048 vxattr++;
12049 }
12050 }
12051 }
12052 return NULL;
12053 }
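// Example: on a directory inode, _match_vxattr(in, "ceph.dir.rbytes") walks
// _dir_vxattrs and returns the XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT)
// entry. Because that entry carries VXATTR_RSTAT, _getxattr() first forces a
// getattr with CEPH_STAT_RSTAT so the recursive byte count is fresh. From a
// mounted filesystem the same value is visible via, e.g. (illustrative path):
//
//   getfattr -n ceph.dir.rbytes /mnt/cephfs/somedir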
12054
12055 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12056 {
12057 std::lock_guard lock(client_lock);
12058
12059 if (unmounting)
12060 return -ENOTCONN;
12061
12062 vinodeno_t vino = _get_vino(in);
12063
12064 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12065 tout(cct) << "ll_readlink" << std::endl;
12066 tout(cct) << vino.ino.val << std::endl;
12067
12068 for (auto dn : in->dentries) {
12069 touch_dn(dn);
12070 }
12071
12072 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12073 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12074 return r;
12075 }
12076
12077 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12078 const UserPerm& perms, InodeRef *inp)
12079 {
12080 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
12081 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12082 << ", gid " << perms.gid() << ")" << dendl;
12083
12084 if (strlen(name) > NAME_MAX)
12085 return -ENAMETOOLONG;
12086
12087 if (dir->snapid != CEPH_NOSNAP) {
12088 return -EROFS;
12089 }
12090 if (is_quota_files_exceeded(dir, perms)) {
12091 return -EDQUOT;
12092 }
12093
12094 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12095
12096 filepath path;
12097 dir->make_nosnap_relative_path(path);
12098 path.push_dentry(name);
12099 req->set_filepath(path);
12100 req->set_inode(dir);
12101 req->head.args.mknod.rdev = rdev;
12102 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12103 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12104
12105 bufferlist xattrs_bl;
12106 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12107 if (res < 0)
12108 goto fail;
12109 req->head.args.mknod.mode = mode;
12110 if (xattrs_bl.length() > 0)
12111 req->set_data(xattrs_bl);
12112
12113 Dentry *de;
12114 res = get_or_create(dir, name, &de);
12115 if (res < 0)
12116 goto fail;
12117 req->set_dentry(de);
12118
12119 res = make_request(req, perms, inp);
12120
12121 trim_cache();
12122
12123 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12124 return res;
12125
12126 fail:
12127 put_request(req);
12128 return res;
12129 }
12130
12131 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12132 dev_t rdev, struct stat *attr, Inode **out,
12133 const UserPerm& perms)
12134 {
12135 std::lock_guard lock(client_lock);
12136
12137 if (unmounting)
12138 return -ENOTCONN;
12139
12140 vinodeno_t vparent = _get_vino(parent);
12141
12142 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12143 tout(cct) << "ll_mknod" << std::endl;
12144 tout(cct) << vparent.ino.val << std::endl;
12145 tout(cct) << name << std::endl;
12146 tout(cct) << mode << std::endl;
12147 tout(cct) << rdev << std::endl;
12148
12149 if (!fuse_default_permissions) {
12150 int r = may_create(parent, perms);
12151 if (r < 0)
12152 return r;
12153 }
12154
12155 InodeRef in;
12156 int r = _mknod(parent, name, mode, rdev, perms, &in);
12157 if (r == 0) {
12158 fill_stat(in, attr);
12159 _ll_get(in.get());
12160 }
12161 tout(cct) << attr->st_ino << std::endl;
12162 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12163 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12164 *out = in.get();
12165 return r;
12166 }
12167
12168 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12169 dev_t rdev, Inode **out,
12170 struct ceph_statx *stx, unsigned want, unsigned flags,
12171 const UserPerm& perms)
12172 {
12173 unsigned caps = statx_to_mask(flags, want);
12174 std::lock_guard lock(client_lock);
12175
12176 if (unmounting)
12177 return -ENOTCONN;
12178
12179 vinodeno_t vparent = _get_vino(parent);
12180
12181 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12182 tout(cct) << "ll_mknodx" << std::endl;
12183 tout(cct) << vparent.ino.val << std::endl;
12184 tout(cct) << name << std::endl;
12185 tout(cct) << mode << std::endl;
12186 tout(cct) << rdev << std::endl;
12187
12188 if (!fuse_default_permissions) {
12189 int r = may_create(parent, perms);
12190 if (r < 0)
12191 return r;
12192 }
12193
12194 InodeRef in;
12195 int r = _mknod(parent, name, mode, rdev, perms, &in);
12196 if (r == 0) {
12197 fill_statx(in, caps, stx);
12198 _ll_get(in.get());
12199 }
12200 tout(cct) << stx->stx_ino << std::endl;
12201 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12202 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12203 *out = in.get();
12204 return r;
12205 }
12206
12207 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12208 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12209 int object_size, const char *data_pool, bool *created,
12210 const UserPerm& perms)
12211 {
12212 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12213 mode << dec << ")" << dendl;
12214
12215 if (strlen(name) > NAME_MAX)
12216 return -ENAMETOOLONG;
12217 if (dir->snapid != CEPH_NOSNAP) {
12218 return -EROFS;
12219 }
12220 if (is_quota_files_exceeded(dir, perms)) {
12221 return -EDQUOT;
12222 }
12223
12224 // use normalized flags to generate cmode
12225 int cflags = ceph_flags_sys2wire(flags);
12226 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12227 cflags |= CEPH_O_LAZY;
12228
12229 int cmode = ceph_flags_to_mode(cflags);
12230
12231 int64_t pool_id = -1;
12232 if (data_pool && *data_pool) {
12233 pool_id = objecter->with_osdmap(
12234 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12235 if (pool_id < 0)
12236 return -EINVAL;
12237 if (pool_id > 0xffffffffll)
12238 return -ERANGE; // bummer!
12239 }
12240
12241 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12242
12243 filepath path;
12244 dir->make_nosnap_relative_path(path);
12245 path.push_dentry(name);
12246 req->set_filepath(path);
12247 req->set_inode(dir);
12248 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12249
12250 req->head.args.open.stripe_unit = stripe_unit;
12251 req->head.args.open.stripe_count = stripe_count;
12252 req->head.args.open.object_size = object_size;
12253 if (cct->_conf->client_debug_getattr_caps)
12254 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12255 else
12256 req->head.args.open.mask = 0;
12257 req->head.args.open.pool = pool_id;
12258 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12259 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12260
12261 mode |= S_IFREG;
12262 bufferlist xattrs_bl;
12263 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12264 if (res < 0)
12265 goto fail;
12266 req->head.args.open.mode = mode;
12267 if (xattrs_bl.length() > 0)
12268 req->set_data(xattrs_bl);
12269
12270 Dentry *de;
12271 res = get_or_create(dir, name, &de);
12272 if (res < 0)
12273 goto fail;
12274 req->set_dentry(de);
12275
12276 res = make_request(req, perms, inp, created);
12277 if (res < 0) {
12278 goto reply_error;
12279 }
12280
12281 /* If the caller passed a value in fhp, do the open */
12282 if (fhp) {
12283 (*inp)->get_open_ref(cmode);
12284 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12285 }
12286
12287 reply_error:
12288 trim_cache();
12289
12290 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12291 << " layout " << stripe_unit
12292 << ' ' << stripe_count
12293 << ' ' << object_size
12294 <<") = " << res << dendl;
12295 return res;
12296
12297 fail:
12298 put_request(req);
12299 return res;
12300 }
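// The stripe_unit/stripe_count/object_size/data_pool arguments let a caller
// create a file with a non-default layout in a single round trip; a data
// pool name is resolved to a pool id against the client's osdmap before the
// request is built, and zero/NULL values leave the corresponding layout
// fields to the MDS defaults (an assumption about MDS-side handling, not
// shown here).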
12301
12302
12303 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12304 InodeRef *inp)
12305 {
12306 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12307 << mode << dec << ", uid " << perm.uid()
12308 << ", gid " << perm.gid() << ")" << dendl;
12309
12310 if (strlen(name) > NAME_MAX)
12311 return -ENAMETOOLONG;
12312
12313 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12314 return -EROFS;
12315 }
12316 if (is_quota_files_exceeded(dir, perm)) {
12317 return -EDQUOT;
12318 }
12319 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12320 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12321
12322 filepath path;
12323 dir->make_nosnap_relative_path(path);
12324 path.push_dentry(name);
12325 req->set_filepath(path);
12326 req->set_inode(dir);
12327 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12328 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12329
12330 mode |= S_IFDIR;
12331 bufferlist xattrs_bl;
12332 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12333 if (res < 0)
12334 goto fail;
12335 req->head.args.mkdir.mode = mode;
12336 if (xattrs_bl.length() > 0)
12337 req->set_data(xattrs_bl);
12338
12339 Dentry *de;
12340 res = get_or_create(dir, name, &de);
12341 if (res < 0)
12342 goto fail;
12343 req->set_dentry(de);
12344
12345 ldout(cct, 10) << "_mkdir: making request" << dendl;
12346 res = make_request(req, perm, inp);
12347 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12348
12349 trim_cache();
12350
12351 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12352 return res;
12353
12354 fail:
12355 put_request(req);
12356 return res;
12357 }
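// _mkdir() doubles as snapshot creation: when `dir` is the snapdir
// (snapid == CEPH_SNAPDIR) the request becomes CEPH_MDS_OP_MKSNAP, which is
// what a mkdir inside the (by default) ".snap" directory of a mounted
// filesystem turns into, e.g. (illustrative path):
//
//   mkdir /mnt/cephfs/somedir/.snap/mysnap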
12358
12359 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12360 struct stat *attr, Inode **out, const UserPerm& perm)
12361 {
12362 std::lock_guard lock(client_lock);
12363
12364 if (unmounting)
12365 return -ENOTCONN;
12366
12367 vinodeno_t vparent = _get_vino(parent);
12368
12369 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12370 tout(cct) << "ll_mkdir" << std::endl;
12371 tout(cct) << vparent.ino.val << std::endl;
12372 tout(cct) << name << std::endl;
12373 tout(cct) << mode << std::endl;
12374
12375 if (!fuse_default_permissions) {
12376 int r = may_create(parent, perm);
12377 if (r < 0)
12378 return r;
12379 }
12380
12381 InodeRef in;
12382 int r = _mkdir(parent, name, mode, perm, &in);
12383 if (r == 0) {
12384 fill_stat(in, attr);
12385 _ll_get(in.get());
12386 }
12387 tout(cct) << attr->st_ino << std::endl;
12388 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12389 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12390 *out = in.get();
12391 return r;
12392 }
12393
12394 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12395 struct ceph_statx *stx, unsigned want, unsigned flags,
12396 const UserPerm& perms)
12397 {
12398 std::lock_guard lock(client_lock);
12399
12400 if (unmounting)
12401 return -ENOTCONN;
12402
12403 vinodeno_t vparent = _get_vino(parent);
12404
12405 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12406 tout(cct) << "ll_mkdirx" << std::endl;
12407 tout(cct) << vparent.ino.val << std::endl;
12408 tout(cct) << name << std::endl;
12409 tout(cct) << mode << std::endl;
12410
12411 if (!fuse_default_permissions) {
12412 int r = may_create(parent, perms);
12413 if (r < 0)
12414 return r;
12415 }
12416
12417 InodeRef in;
12418 int r = _mkdir(parent, name, mode, perms, &in);
12419 if (r == 0) {
12420 fill_statx(in, statx_to_mask(flags, want), stx);
12421 _ll_get(in.get());
12422 } else {
12423 stx->stx_ino = 0;
12424 stx->stx_mask = 0;
12425 }
12426 tout(cct) << stx->stx_ino << std::endl;
12427 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12428 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12429 *out = in.get();
12430 return r;
12431 }
12432
12433 int Client::_symlink(Inode *dir, const char *name, const char *target,
12434 const UserPerm& perms, InodeRef *inp)
12435 {
12436 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
12437 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12438 << dendl;
12439
12440 if (strlen(name) > NAME_MAX)
12441 return -ENAMETOOLONG;
12442
12443 if (dir->snapid != CEPH_NOSNAP) {
12444 return -EROFS;
12445 }
12446 if (is_quota_files_exceeded(dir, perms)) {
12447 return -EDQUOT;
12448 }
12449
12450 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12451
12452 filepath path;
12453 dir->make_nosnap_relative_path(path);
12454 path.push_dentry(name);
12455 req->set_filepath(path);
12456 req->set_inode(dir);
12457 req->set_string2(target);
12458 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12459 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12460
12461 Dentry *de;
12462 int res = get_or_create(dir, name, &de);
12463 if (res < 0)
12464 goto fail;
12465 req->set_dentry(de);
12466
12467 res = make_request(req, perms, inp);
12468
12469 trim_cache();
12470 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
12471 res << dendl;
12472 return res;
12473
12474 fail:
12475 put_request(req);
12476 return res;
12477 }
12478
12479 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12480 struct stat *attr, Inode **out, const UserPerm& perms)
12481 {
12482 std::lock_guard lock(client_lock);
12483
12484 if (unmounting)
12485 return -ENOTCONN;
12486
12487 vinodeno_t vparent = _get_vino(parent);
12488
12489 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12490 << dendl;
12491 tout(cct) << "ll_symlink" << std::endl;
12492 tout(cct) << vparent.ino.val << std::endl;
12493 tout(cct) << name << std::endl;
12494 tout(cct) << value << std::endl;
12495
12496 if (!fuse_default_permissions) {
12497 int r = may_create(parent, perms);
12498 if (r < 0)
12499 return r;
12500 }
12501
12502 InodeRef in;
12503 int r = _symlink(parent, name, value, perms, &in);
12504 if (r == 0) {
12505 fill_stat(in, attr);
12506 _ll_get(in.get());
12507 }
12508 tout(cct) << attr->st_ino << std::endl;
12509 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12510 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12511 *out = in.get();
12512 return r;
12513 }
12514
12515 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12516 Inode **out, struct ceph_statx *stx, unsigned want,
12517 unsigned flags, const UserPerm& perms)
12518 {
12519 std::lock_guard lock(client_lock);
12520
12521 if (unmounting)
12522 return -ENOTCONN;
12523
12524 vinodeno_t vparent = _get_vino(parent);
12525
12526 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12527 << dendl;
12528 tout(cct) << "ll_symlinkx" << std::endl;
12529 tout(cct) << vparent.ino.val << std::endl;
12530 tout(cct) << name << std::endl;
12531 tout(cct) << value << std::endl;
12532
12533 if (!fuse_default_permissions) {
12534 int r = may_create(parent, perms);
12535 if (r < 0)
12536 return r;
12537 }
12538
12539 InodeRef in;
12540 int r = _symlink(parent, name, value, perms, &in);
12541 if (r == 0) {
12542 fill_statx(in, statx_to_mask(flags, want), stx);
12543 _ll_get(in.get());
12544 }
12545 tout(cct) << stx->stx_ino << std::endl;
12546 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12547 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12548 *out = in.get();
12549 return r;
12550 }
12551
12552 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12553 {
12554 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
12555 << " uid " << perm.uid() << " gid " << perm.gid()
12556 << ")" << dendl;
12557
12558 if (dir->snapid != CEPH_NOSNAP) {
12559 return -EROFS;
12560 }
12561
12562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12563
12564 filepath path;
12565 dir->make_nosnap_relative_path(path);
12566 path.push_dentry(name);
12567 req->set_filepath(path);
12568
12569 InodeRef otherin;
12570 Inode *in;
12571 Dentry *de;
12572
12573 int res = get_or_create(dir, name, &de);
12574 if (res < 0)
12575 goto fail;
12576 req->set_dentry(de);
12577 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12578 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12579
12580 res = _lookup(dir, name, 0, &otherin, perm);
12581 if (res < 0)
12582 goto fail;
12583
12584 in = otherin.get();
12585 req->set_other_inode(in);
12586 in->break_all_delegs();
12587 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12588
12589 req->set_inode(dir);
12590
12591 res = make_request(req, perm);
12592
12593 trim_cache();
12594 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
12595 return res;
12596
12597 fail:
12598 put_request(req);
12599 return res;
12600 }
12601
12602 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12603 {
12604 std::lock_guard lock(client_lock);
12605
12606 if (unmounting)
12607 return -ENOTCONN;
12608
12609 vinodeno_t vino = _get_vino(in);
12610
12611 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12612 tout(cct) << "ll_unlink" << std::endl;
12613 tout(cct) << vino.ino.val << std::endl;
12614 tout(cct) << name << std::endl;
12615
12616 if (!fuse_default_permissions) {
12617 int r = may_delete(in, name, perm);
12618 if (r < 0)
12619 return r;
12620 }
12621 return _unlink(in, name, perm);
12622 }
12623
12624 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12625 {
12626 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12627 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12628
12629 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12630 return -EROFS;
12631 }
12632
12633 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12634 MetaRequest *req = new MetaRequest(op);
12635 filepath path;
12636 dir->make_nosnap_relative_path(path);
12637 path.push_dentry(name);
12638 req->set_filepath(path);
12639 req->set_inode(dir);
12640
12641 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12642 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12643 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12644
12645 InodeRef in;
12646
12647 Dentry *de;
12648 int res = get_or_create(dir, name, &de);
12649 if (res < 0)
12650 goto fail;
12651 if (op == CEPH_MDS_OP_RMDIR)
12652 req->set_dentry(de);
12653 else
12654 de->get();
12655
12656 res = _lookup(dir, name, 0, &in, perms);
12657 if (res < 0)
12658 goto fail;
12659
12660 if (op == CEPH_MDS_OP_RMSNAP) {
12661 unlink(de, true, true);
12662 de->put();
12663 }
12664 req->set_other_inode(in.get());
12665
12666 res = make_request(req, perms);
12667
12668 trim_cache();
12669 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12670 return res;
12671
12672 fail:
12673 put_request(req);
12674 return res;
12675 }
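// The inverse: when `dir` is the snapdir, _rmdir() issues CEPH_MDS_OP_RMSNAP
// and unlinks the dentry locally itself (see the unlink() call above),
// since, as with renamesnap, the reply carries no trace to do it for us.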
12676
12677 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12678 {
12679 std::lock_guard lock(client_lock);
12680
12681 if (unmounting)
12682 return -ENOTCONN;
12683
12684 vinodeno_t vino = _get_vino(in);
12685
12686 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12687 tout(cct) << "ll_rmdir" << std::endl;
12688 tout(cct) << vino.ino.val << std::endl;
12689 tout(cct) << name << std::endl;
12690
12691 if (!fuse_default_permissions) {
12692 int r = may_delete(in, name, perms);
12693 if (r < 0)
12694 return r;
12695 }
12696
12697 return _rmdir(in, name, perms);
12698 }
12699
12700 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12701 {
12702 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
12703 << todir->ino << " " << toname
12704 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12705 << dendl;
12706
12707 if (fromdir->snapid != todir->snapid)
12708 return -EXDEV;
12709
12710 int op = CEPH_MDS_OP_RENAME;
12711 if (fromdir->snapid != CEPH_NOSNAP) {
12712 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12713 op = CEPH_MDS_OP_RENAMESNAP;
12714 else
12715 return -EROFS;
12716 }
12717
12718 InodeRef target;
12719 MetaRequest *req = new MetaRequest(op);
12720
12721 filepath from;
12722 fromdir->make_nosnap_relative_path(from);
12723 from.push_dentry(fromname);
12724 filepath to;
12725 todir->make_nosnap_relative_path(to);
12726 to.push_dentry(toname);
12727 req->set_filepath(to);
12728 req->set_filepath2(from);
12729
12730 Dentry *oldde;
12731 int res = get_or_create(fromdir, fromname, &oldde);
12732 if (res < 0)
12733 goto fail;
12734 Dentry *de;
12735 res = get_or_create(todir, toname, &de);
12736 if (res < 0)
12737 goto fail;
12738
12739 if (op == CEPH_MDS_OP_RENAME) {
12740 req->set_old_dentry(oldde);
12741 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12742 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12743
12744 req->set_dentry(de);
12745 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12746 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12747
12748 InodeRef oldin, otherin;
12749 Inode *fromdir_root = nullptr;
12750 Inode *todir_root = nullptr;
12751 int mask = 0;
12752 bool quota_check = false;
12753 if (fromdir != todir) {
12754 fromdir_root =
12755 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12756 todir_root =
12757 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12758
12759 if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
12760 // use the CEPH_STAT_RSTAT mask to force a getattr or lookup request to
12761 // the auth MDS, so we get the latest rstat for todir_root and the source
12762 // dir even if their dentry caches and inode caps are satisfied.
12763 res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
12764 if (res < 0)
12765 goto fail;
12766
12767 quota_check = true;
12768 if (oldde->inode && oldde->inode->is_dir()) {
12769 mask |= CEPH_STAT_RSTAT;
12770 }
12771 }
12772 }
12773
12774 res = _lookup(fromdir, fromname, mask, &oldin, perm);
12775 if (res < 0)
12776 goto fail;
12777
12778 Inode *oldinode = oldin.get();
12779 oldinode->break_all_delegs();
12780 req->set_old_inode(oldinode);
12781 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12782
12783 if (quota_check) {
12784 int64_t old_bytes, old_files;
12785 if (oldinode->is_dir()) {
12786 old_bytes = oldinode->rstat.rbytes;
12787 old_files = oldinode->rstat.rsize();
12788 } else {
12789 old_bytes = oldinode->size;
12790 old_files = 1;
12791 }
12792
12793 bool quota_exceed = false;
12794 if (todir_root && todir_root->quota.max_bytes &&
12795 (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
12796 ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
12797 << old_bytes << ") to (" << todir->ino
12798 << ") will exceed quota on " << *todir_root << dendl;
12799 quota_exceed = true;
12800 }
12801
12802 if (todir_root && todir_root->quota.max_files &&
12803 (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
12804 ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
12805 << old_files << ") to (" << todir->ino
12806 << ") will exceed quota on " << *todir_root << dendl;
12807 quota_exceed = true;
12808 }
12809
12810 if (quota_exceed) {
12811 res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
12812 goto fail;
12813 }
12814 }
12815
12816 res = _lookup(todir, toname, 0, &otherin, perm);
12817 switch (res) {
12818 case 0:
12819 {
12820 Inode *in = otherin.get();
12821 req->set_other_inode(in);
12822 in->break_all_delegs();
12823 }
12824 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12825 break;
12826 case -ENOENT:
12827 break;
12828 default:
12829 goto fail;
12830 }
12831
12832 req->set_inode(todir);
12833 } else {
12834 // a renamesnap reply contains no tracedn, so we need to invalidate the
12835 // dentries manually
12836 unlink(oldde, true, true);
12837 unlink(de, true, true);
12838
12839 req->set_inode(todir);
12840 }
12841
12842 res = make_request(req, perm, &target);
12843 ldout(cct, 10) << "rename result is " << res << dendl;
12844
12845 // the reply trace, if any, has already updated the renamed item in our cache
12846
12847 trim_cache();
12848 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12849 return res;
12850
12851 fail:
12852 put_request(req);
12853 return res;
12854 }
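// Note the asymmetry above when a rename would exceed the target realm's
// quota: a regular file gets -EDQUOT, while a directory gets -EXDEV, which
// prompts tools such as mv(1) to fall back to a recursive copy + unlink,
// so quota enforcement applies incrementally as the data is copied.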
12855
12856 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12857 const char *newname, const UserPerm& perm)
12858 {
12859 std::lock_guard lock(client_lock);
12860
12861 if (unmounting)
12862 return -ENOTCONN;
12863
12864 vinodeno_t vparent = _get_vino(parent);
12865 vinodeno_t vnewparent = _get_vino(newparent);
12866
12867 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12868 << vnewparent << " " << newname << dendl;
12869 tout(cct) << "ll_rename" << std::endl;
12870 tout(cct) << vparent.ino.val << std::endl;
12871 tout(cct) << name << std::endl;
12872 tout(cct) << vnewparent.ino.val << std::endl;
12873 tout(cct) << newname << std::endl;
12874
12875 if (!fuse_default_permissions) {
12876 int r = may_delete(parent, name, perm);
12877 if (r < 0)
12878 return r;
12879 r = may_delete(newparent, newname, perm);
12880 if (r < 0 && r != -ENOENT)
12881 return r;
12882 }
12883
12884 return _rename(parent, name, newparent, newname, perm);
12885 }
12886
12887 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12888 {
12889 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12890 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12891
12892 if (strlen(newname) > NAME_MAX)
12893 return -ENAMETOOLONG;
12894
12895 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12896 return -EROFS;
12897 }
12898 if (is_quota_files_exceeded(dir, perm)) {
12899 return -EDQUOT;
12900 }
12901
12902 in->break_all_delegs();
12903 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12904
12905 filepath path(newname, dir->ino);
12906 req->set_filepath(path);
12907 filepath existing(in->ino);
12908 req->set_filepath2(existing);
12909
12910 req->set_inode(dir);
12911 req->inode_drop = CEPH_CAP_FILE_SHARED;
12912 req->inode_unless = CEPH_CAP_FILE_EXCL;
12913
12914 Dentry *de;
12915 int res = get_or_create(dir, newname, &de);
12916 if (res < 0)
12917 goto fail;
12918 req->set_dentry(de);
12919
12920 res = make_request(req, perm, inp);
12921 ldout(cct, 10) << "link result is " << res << dendl;
12922
12923 trim_cache();
12924 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
12925 return res;
12926
12927 fail:
12928 put_request(req);
12929 return res;
12930 }
12931
12932 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12933 const UserPerm& perm)
12934 {
12935 std::lock_guard lock(client_lock);
12936
12937 if (unmounting)
12938 return -ENOTCONN;
12939
12940 vinodeno_t vino = _get_vino(in);
12941 vinodeno_t vnewparent = _get_vino(newparent);
12942
12943 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12944 newname << dendl;
12945 tout(cct) << "ll_link" << std::endl;
12946 tout(cct) << vino.ino.val << std::endl;
12947 tout(cct) << vnewparent.ino.val << std::endl;
12948 tout(cct) << newname << std::endl;
12949
12950 InodeRef target;
12951
12952 if (!fuse_default_permissions) {
12953 if (S_ISDIR(in->mode))
12954 return -EPERM;
12955
12956 int r = may_hardlink(in, perm);
12957 if (r < 0)
12958 return r;
12959
12960 r = may_create(newparent, perm);
12961 if (r < 0)
12962 return r;
12963 }
12964
12965 return _link(in, newparent, newname, perm, &target);
12966 }
12967
12968 int Client::ll_num_osds(void)
12969 {
12970 std::lock_guard lock(client_lock);
12971 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12972 }
12973
12974 int Client::ll_osdaddr(int osd, uint32_t *addr)
12975 {
12976 std::lock_guard lock(client_lock);
12977
12978 entity_addr_t g;
12979 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12980 if (!o.exists(osd))
12981 return false;
12982 g = o.get_addrs(osd).front();
12983 return true;
12984 });
12985 if (!exists)
12986 return -1;
12987 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12988 *addr = ntohl(nb_addr);
12989 return 0;
12990 }
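// Note: ll_osdaddr() only reports a (host-byte-order) IPv4 address, since it
// reads sin_addr from the OSD's first listed address.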
12991
12992 uint32_t Client::ll_stripe_unit(Inode *in)
12993 {
12994 std::lock_guard lock(client_lock);
12995 return in->layout.stripe_unit;
12996 }
12997
12998 uint64_t Client::ll_snap_seq(Inode *in)
12999 {
13000 std::lock_guard lock(client_lock);
13001 return in->snaprealm->seq;
13002 }
13003
13004 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13005 {
13006 std::lock_guard lock(client_lock);
13007 *layout = in->layout;
13008 return 0;
13009 }
13010
13011 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13012 {
13013 return ll_file_layout(fh->inode.get(), layout);
13014 }
13015
13016 /* Currently we cannot take advantage of redundancy in reads, since we
13017 would have to go through all possible placement groups (a
13018 potentially quite large number determined by a hash), and use CRUSH
13019 to calculate the appropriate set of OSDs for each placement group,
13020 then index into that. An array with one entry per OSD is much more
13021 tractable and works for demonstration purposes. */
13022
13023 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13024 file_layout_t* layout)
13025 {
13026 std::lock_guard lock(client_lock);
13027
13028 inodeno_t ino = in->ino;
13029 uint32_t object_size = layout->object_size;
13030 uint32_t su = layout->stripe_unit;
13031 uint32_t stripe_count = layout->stripe_count;
13032 uint64_t stripes_per_object = object_size / su;
13033 uint64_t stripeno = 0, stripepos = 0;
13034
13035 if (stripe_count) {
13036 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13037 stripepos = blockno % stripe_count; // which object in the object set (X)
13038 }
13039 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13040 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13041
13042 object_t oid = file_object_t(ino, objectno);
13043 return objecter->with_osdmap([&](const OSDMap& o) {
13044 ceph_object_layout olayout =
13045 o.file_to_object_layout(oid, *layout);
13046 pg_t pg = (pg_t)olayout.ol_pgid;
13047 vector<int> osds;
13048 int primary;
13049 o.pg_to_acting_osds(pg, &osds, &primary);
13050 return primary;
13051 });
13052 }
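// Worked example of the striping math above (illustrative values): with
// object_size = 4 MiB, stripe_unit = 1 MiB, stripe_count = 3 and
// blockno = 7, we get stripeno = 7 / 3 = 2, stripepos = 7 % 3 = 1 and
// stripes_per_object = 4, so objectsetno = 2 / 4 = 0 and
// objectno = 0 * 3 + 1 = 1: the block lives in object 1 of object set 0.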
13053
13054 /* Return the offset of the block, internal to the object */
13055
13056 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13057 {
13058 std::lock_guard lock(client_lock);
13059 file_layout_t *layout=&(in->layout);
13060 uint32_t object_size = layout->object_size;
13061 uint32_t su = layout->stripe_unit;
13062 uint64_t stripes_per_object = object_size / su;
13063
13064 return (blockno % stripes_per_object) * su;
13065 }
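// Continuing the example above: blockno = 7 with stripes_per_object = 4 and
// stripe_unit = 1 MiB lands (7 % 4) * 1 MiB = 3 MiB into that object.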
13066
13067 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13068 const UserPerm& perms)
13069 {
13070 std::lock_guard lock(client_lock);
13071
13072 if (unmounting)
13073 return -ENOTCONN;
13074
13075 vinodeno_t vino = _get_vino(in);
13076
13077 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13078 tout(cct) << "ll_opendir" << std::endl;
13079 tout(cct) << vino.ino.val << std::endl;
13080
13081 if (!fuse_default_permissions) {
13082 int r = may_open(in, flags, perms);
13083 if (r < 0)
13084 return r;
13085 }
13086
13087 int r = _opendir(in, dirpp, perms);
13088 tout(cct) << (unsigned long)*dirpp << std::endl;
13089
13090 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13091 << dendl;
13092 return r;
13093 }
13094
13095 int Client::ll_releasedir(dir_result_t *dirp)
13096 {
13097 std::lock_guard lock(client_lock);
13098 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13099 tout(cct) << "ll_releasedir" << std::endl;
13100 tout(cct) << (unsigned long)dirp << std::endl;
13101
13102 if (unmounting)
13103 return -ENOTCONN;
13104
13105 _closedir(dirp);
13106 return 0;
13107 }
13108
13109 int Client::ll_fsyncdir(dir_result_t *dirp)
13110 {
13111 std::lock_guard lock(client_lock);
13112 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13113 tout(cct) << "ll_fsyncdir" << std::endl;
13114 tout(cct) << (unsigned long)dirp << std::endl;
13115
13116 if (unmounting)
13117 return -ENOTCONN;
13118
13119 return _fsync(dirp->inode.get(), false);
13120 }
13121
13122 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13123 {
13124 ceph_assert(!(flags & O_CREAT));
13125
13126 std::lock_guard lock(client_lock);
13127
13128 if (unmounting)
13129 return -ENOTCONN;
13130
13131 vinodeno_t vino = _get_vino(in);
13132
13133 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13134 tout(cct) << "ll_open" << std::endl;
13135 tout(cct) << vino.ino.val << std::endl;
13136 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13137
13138 int r;
13139 if (!fuse_default_permissions) {
13140 r = may_open(in, flags, perms);
13141 if (r < 0)
13142 goto out;
13143 }
13144
13145 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13146
13147 out:
13148 Fh *fhptr = fhp ? *fhp : NULL;
13149 if (fhptr) {
13150 ll_unclosed_fh_set.insert(fhptr);
13151 }
13152 tout(cct) << (unsigned long)fhptr << std::endl;
13153 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13154 " = " << r << " (" << fhptr << ")" << dendl;
13155 return r;
13156 }
13157
13158 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13159 int flags, InodeRef *in, int caps, Fh **fhp,
13160 const UserPerm& perms)
13161 {
13162 *fhp = NULL;
13163
13164 vinodeno_t vparent = _get_vino(parent);
13165
13166 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13167 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13168 << ", gid " << perms.gid() << dendl;
13169 tout(cct) << "ll_create" << std::endl;
13170 tout(cct) << vparent.ino.val << std::endl;
13171 tout(cct) << name << std::endl;
13172 tout(cct) << mode << std::endl;
13173 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13174
13175 bool created = false;
13176 int r = _lookup(parent, name, caps, in, perms);
13177
13178 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13179 return -EEXIST;
13180
13181 if (r == -ENOENT && (flags & O_CREAT)) {
13182 if (!fuse_default_permissions) {
13183 r = may_create(parent, perms);
13184 if (r < 0)
13185 goto out;
13186 }
13187 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13188 perms);
13189 if (r < 0)
13190 goto out;
13191 }
13192
13193 if (r < 0)
13194 goto out;
13195
13196 ceph_assert(*in);
13197
13198 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13199 if (!created) {
13200 if (!fuse_default_permissions) {
13201 r = may_open(in->get(), flags, perms);
13202 if (r < 0) {
13203 if (*fhp) {
13204 int release_r = _release_fh(*fhp);
13205 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13206 }
13207 goto out;
13208 }
13209 }
13210 if (*fhp == NULL) {
13211 r = _open(in->get(), flags, mode, fhp, perms);
13212 if (r < 0)
13213 goto out;
13214 }
13215 }
13216
13217 out:
13218 if (*fhp) {
13219 ll_unclosed_fh_set.insert(*fhp);
13220 }
13221
13222 ino_t ino = 0;
13223 if (r >= 0) {
13224 Inode *inode = in->get();
13225 if (use_faked_inos())
13226 ino = inode->faked_ino;
13227 else
13228 ino = inode->ino;
13229 }
13230
13231 tout(cct) << (unsigned long)*fhp << std::endl;
13232 tout(cct) << ino << std::endl;
13233 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13234 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13235 *fhp << " " << hex << ino << dec << ")" << dendl;
13236
13237 return r;
13238 }
13239
13240 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13241 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13242 const UserPerm& perms)
13243 {
13244 std::lock_guard lock(client_lock);
13245 InodeRef in;
13246
13247 if (unmounting)
13248 return -ENOTCONN;
13249
13250 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13251 fhp, perms);
13252 if (r >= 0) {
13253 ceph_assert(in);
13254
13255 // passing an Inode in outp requires an additional ref
13256 if (outp) {
13257 _ll_get(in.get());
13258 *outp = in.get();
13259 }
13260 fill_stat(in, attr);
13261 } else {
13262 attr->st_ino = 0;
13263 }
13264
13265 return r;
13266 }
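/*
 * Usage sketch (illustrative only). Note that a non-NULL outp hands the
 * caller an extra inode reference, which must be given back via ll_forget():
 *
 *   struct stat attr;
 *   Inode *out = nullptr;
 *   Fh *fh = nullptr;
 *   int r = client->ll_create(parent, "newfile", 0644,
 *                             O_CREAT | O_EXCL | O_WRONLY,
 *                             &attr, &out, &fh, perms);
 *   if (r == 0) {
 *     client->ll_release(fh);
 *     client->ll_forget(out, 1);
 *   }
 */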
13267
13268 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13269 int oflags, Inode **outp, Fh **fhp,
13270 struct ceph_statx *stx, unsigned want, unsigned lflags,
13271 const UserPerm& perms)
13272 {
13273 unsigned caps = statx_to_mask(lflags, want);
13274 std::lock_guard lock(client_lock);
13275 InodeRef in;
13276
13277 if (unmounting)
13278 return -ENOTCONN;
13279
13280 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13281 if (r >= 0) {
13282 ceph_assert(in);
13283
13284 // passing an Inode in outp requires an additional ref
13285 if (outp) {
13286 _ll_get(in.get());
13287 *outp = in.get();
13288 }
13289 fill_statx(in, caps, stx);
13290 } else {
13291 stx->stx_ino = 0;
13292 stx->stx_mask = 0;
13293 }
13294
13295 return r;
13296 }
13297
13298 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13299 {
13300 std::lock_guard lock(client_lock);
13301 tout(cct) << "ll_lseek" << std::endl;
13302 tout(cct) << offset << std::endl;
13303 tout(cct) << whence << std::endl;
13304
13305 if (unmounting)
13306 return -ENOTCONN;
13307
13308 return _lseek(fh, offset, whence);
13309 }
13310
13311 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13312 {
13313 std::lock_guard lock(client_lock);
13314 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
13315 tout(cct) << "ll_read" << std::endl;
13316 tout(cct) << (unsigned long)fh << std::endl;
13317 tout(cct) << off << std::endl;
13318 tout(cct) << len << std::endl;
13319
13320 if (unmounting)
13321 return -ENOTCONN;
13322
13323 /* We can't return more than INT_MAX bytes read; clamp len to that */
13324 len = std::min(len, (loff_t)INT_MAX);
13325 return _read(fh, off, len, bl);
13326 }
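/*
 * Because ll_read() clamps each call to INT_MAX bytes, a caller that wants
 * more than that must loop (illustrative sketch; "want" is a hypothetical
 * total, and short reads near EOF are also possible):
 *
 *   loff_t pos = 0;
 *   while (pos < want) {
 *     bufferlist bl;
 *     int r = client->ll_read(fh, pos, want - pos, &bl);
 *     if (r <= 0)
 *       break;            // 0 == EOF, <0 == error
 *     pos += r;
 *     // ... consume bl ...
 *   }
 */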
13327
13328 int Client::ll_read_block(Inode *in, uint64_t blockid,
13329 char *buf,
13330 uint64_t offset,
13331 uint64_t length,
13332 file_layout_t* layout)
13333 {
13334 std::lock_guard lock(client_lock);
13335
13336 if (unmounting)
13337 return -ENOTCONN;
13338
13339 vinodeno_t vino = _get_vino(in);
13340 object_t oid = file_object_t(vino.ino, blockid);
13341 C_SaferCond onfinish;
13342 bufferlist bl;
13343
13344 objecter->read(oid,
13345 object_locator_t(layout->pool_id),
13346 offset,
13347 length,
13348 vino.snapid,
13349 &bl,
13350 CEPH_OSD_FLAG_READ,
13351 &onfinish);
13352
13353 client_lock.unlock();
13354 int r = onfinish.wait();
13355 client_lock.lock();
13356
13357 if (r >= 0) {
13358 bl.begin().copy(bl.length(), buf);
13359 r = bl.length();
13360 }
13361
13362 return r;
13363 }
13364
13365 /* It appears that the OSD doesn't return success unless the entire
13366 buffer was written; return the write length on success. */
13367
13368 int Client::ll_write_block(Inode *in, uint64_t blockid,
13369 char* buf, uint64_t offset,
13370 uint64_t length, file_layout_t* layout,
13371 uint64_t snapseq, uint32_t sync)
13372 {
13373 vinodeno_t vino = ll_get_vino(in);
13374 int r = 0;
13375 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13376
13377 if (length == 0) {
13378 return -EINVAL;
13379 }
13380 if (true || sync) {
13381 /* writes are currently always made stable here (the "true ||" forces
13382 * the synchronous path), so the epilogue below waits on this condition */
13383 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13384 }
13385 object_t oid = file_object_t(vino.ino, blockid);
13386 SnapContext fakesnap;
13387 ceph::bufferlist bl;
13388 if (length > 0) {
13389 bl.push_back(buffer::copy(buf, length));
13390 }
13391
13392 ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
13393 << dendl;
13394
13395 fakesnap.seq = snapseq;
13396
13397 /* lock just in time */
13398 client_lock.lock();
13399 if (unmounting) {
13400 client_lock.unlock();
13401 return -ENOTCONN;
13402 }
13403
13404 objecter->write(oid,
13405 object_locator_t(layout->pool_id),
13406 offset,
13407 length,
13408 fakesnap,
13409 bl,
13410 ceph::real_clock::now(),
13411 0,
13412 onsafe.get());
13413
13414 client_lock.unlock();
13415 if (nullptr != onsafe) {
13416 r = onsafe->wait();
13417 }
13418
13419 if (r < 0) {
13420 return r;
13421 } else {
13422 return length;
13423 }
13424 }
13425
13426 int Client::ll_commit_blocks(Inode *in,
13427 uint64_t offset,
13428 uint64_t length)
13429 {
13430 std::lock_guard lock(client_lock);
13431 /*
13432 BarrierContext *bctx;
13433 vinodeno_t vino = _get_vino(in);
13434 uint64_t ino = vino.ino;
13435
13436 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13437 << offset << " to " << length << dendl;
13438
13439 if (length == 0) {
13440 return -EINVAL;
13441 }
13442
13443 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13444 if (p != barriers.end()) {
13445 barrier_interval civ(offset, offset + length);
13446 p->second->commit_barrier(civ);
13447 }
13448 */
13449 return 0;
13450 }
13451
13452 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13453 {
13454 std::lock_guard lock(client_lock);
13455 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13456 "~" << len << dendl;
13457 tout(cct) << "ll_write" << std::endl;
13458 tout(cct) << (unsigned long)fh << std::endl;
13459 tout(cct) << off << std::endl;
13460 tout(cct) << len << std::endl;
13461
13462 if (unmounting)
13463 return -ENOTCONN;
13464
13465 /* We can't return more than INT_MAX bytes written; clamp len to that */
13466 len = std::min(len, (loff_t)INT_MAX);
13467 int r = _write(fh, off, len, data, NULL, 0);
13468 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13469 << dendl;
13470 return r;
13471 }
13472
13473 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13474 {
13475 std::lock_guard lock(client_lock);
13476 if (unmounting)
13477 return -ENOTCONN;
13478 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13479 }
13480
13481 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13482 {
13483 std::lock_guard lock(client_lock);
13484 if (unmounting)
13485 return -ENOTCONN;
13486 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13487 }
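/*
 * Scatter/gather sketch (illustrative only): ll_writev/ll_readv take a
 * plain POSIX iovec array and return bytes transferred or a negative error:
 *
 *   char hdr[16], body[4096];
 *   struct iovec iov[2] = {
 *     { hdr,  sizeof(hdr)  },
 *     { body, sizeof(body) },
 *   };
 *   int64_t n = client->ll_readv(fh, iov, 2, 0 /* offset */);
 */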
13488
13489 int Client::ll_flush(Fh *fh)
13490 {
13491 std::lock_guard lock(client_lock);
13492 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13493 tout(cct) << "ll_flush" << std::endl;
13494 tout(cct) << (unsigned long)fh << std::endl;
13495
13496 if (unmounting)
13497 return -ENOTCONN;
13498
13499 return _flush(fh);
13500 }
13501
13502 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13503 {
13504 std::lock_guard lock(client_lock);
13505 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13506 tout(cct) << "ll_fsync" << std::endl;
13507 tout(cct) << (unsigned long)fh << std::endl;
13508
13509 if (unmounting)
13510 return -ENOTCONN;
13511
13512 int r = _fsync(fh, syncdataonly);
13513 if (r) {
13514 // If we're returning an error, clear it from the FH
13515 fh->take_async_err();
13516 }
13517 return r;
13518 }
13519
13520 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13521 {
13522 std::lock_guard lock(client_lock);
13523 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13524 tout(cct) << "ll_sync_inode" << std::endl;
13525 tout(cct) << (unsigned long)in << std::endl;
13526
13527 if (unmounting)
13528 return -ENOTCONN;
13529
13530 return _fsync(in, syncdataonly);
13531 }
13532
13533 #ifdef FALLOC_FL_PUNCH_HOLE
13534
13535 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13536 {
13537 if (offset < 0 || length <= 0)
13538 return -EINVAL;
13539
13540 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13541 return -EOPNOTSUPP;
13542
13543 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13544 return -EOPNOTSUPP;
13545
13546 Inode *in = fh->inode.get();
13547
13548 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13549 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13550 return -ENOSPC;
13551 }
13552
13553 if (in->snapid != CEPH_NOSNAP)
13554 return -EROFS;
13555
13556 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13557 return -EBADF;
13558
13559 uint64_t size = offset + length;
13560 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13561 size > in->size &&
13562 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
13563 return -EDQUOT;
13564 }
13565
13566 int have;
13567 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13568 if (r < 0)
13569 return r;
13570
13571 std::unique_ptr<C_SaferCond> onuninline = nullptr;
13572 if (mode & FALLOC_FL_PUNCH_HOLE) {
13573 if (in->inline_version < CEPH_INLINE_NONE &&
13574 (have & CEPH_CAP_FILE_BUFFER)) {
13575 bufferlist bl;
13576 auto inline_iter = in->inline_data.cbegin();
13577 int len = in->inline_data.length();
13578 if (offset < len) {
13579 if (offset > 0)
13580 inline_iter.copy(offset, bl);
13581 int size = length;
13582 if (offset + size > len)
13583 size = len - offset;
13584 if (size > 0)
13585 bl.append_zero(size);
13586 if (offset + size < len) {
13587 inline_iter += size;
13588 inline_iter.copy(len - offset - size, bl);
13589 }
13590 in->inline_data = bl;
13591 in->inline_version++;
13592 }
13593 in->mtime = in->ctime = ceph_clock_now();
13594 in->change_attr++;
13595 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13596 } else {
13597 if (in->inline_version < CEPH_INLINE_NONE) {
13598 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13599 uninline_data(in, onuninline.get());
13600 }
13601
13602 C_SaferCond onfinish("Client::_punch_hole flock");
13603
13604 unsafe_sync_write++;
13605 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13606
13607 _invalidate_inode_cache(in, offset, length);
13608 filer->zero(in->ino, &in->layout,
13609 in->snaprealm->get_snap_context(),
13610 offset, length,
13611 ceph::real_clock::now(),
13612 0, true, &onfinish);
13613 in->mtime = in->ctime = ceph_clock_now();
13614 in->change_attr++;
13615 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13616
13617 client_lock.unlock();
13618 onfinish.wait();
13619 client_lock.lock();
13620 _sync_write_commit(in);
13621 }
13622 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13623 uint64_t size = offset + length;
13624 if (size > in->size) {
13625 in->size = size;
13626 in->mtime = in->ctime = ceph_clock_now();
13627 in->change_attr++;
13628 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13629
13630 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
13631 check_caps(in, CHECK_CAPS_NODELAY);
13632 } else if (is_max_size_approaching(in)) {
13633 check_caps(in, 0);
13634 }
13635 }
13636 }
13637
13638 if (nullptr != onuninline) {
13639 client_lock.unlock();
13640 int ret = onuninline->wait();
13641 client_lock.lock();
13642
13643 if (ret >= 0 || ret == -ECANCELED) {
13644 in->inline_data.clear();
13645 in->inline_version = CEPH_INLINE_NONE;
13646 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13647 check_caps(in, 0);
13648 } else
13649 r = ret;
13650 }
13651
13652 put_cap_ref(in, CEPH_CAP_FILE_WR);
13653 return r;
13654 }
13655 #else
13656
13657 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13658 {
13659 return -EOPNOTSUPP;
13660 }
13661
13662 #endif
13663
13664
13665 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13666 {
13667 std::lock_guard lock(client_lock);
13668 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13669 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13670 tout(cct) << (unsigned long)fh << std::endl;
13671
13672 if (unmounting)
13673 return -ENOTCONN;
13674
13675 return _fallocate(fh, mode, offset, length);
13676 }
13677
13678 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13679 {
13680 std::lock_guard lock(client_lock);
13681 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
13682
13683 if (unmounting)
13684 return -ENOTCONN;
13685
13686 Fh *fh = get_filehandle(fd);
13687 if (!fh)
13688 return -EBADF;
13689 #if defined(__linux__) && defined(O_PATH)
13690 if (fh->flags & O_PATH)
13691 return -EBADF;
13692 #endif
13693 return _fallocate(fh, mode, offset, length);
13694 }
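/*
 * Usage sketch (illustrative only). Per the checks in _fallocate(),
 * FALLOC_FL_PUNCH_HOLE is only accepted together with FALLOC_FL_KEEP_SIZE,
 * while mode == 0 extends the file size like posix_fallocate():
 *
 *   int r = client->fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                             off, len);
 */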
13695
13696 int Client::ll_release(Fh *fh)
13697 {
13698 std::lock_guard lock(client_lock);
13699
13700 if (unmounting)
13701 return -ENOTCONN;
13702
13703 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13704 dendl;
13705 tout(cct) << __func__ << " (fh)" << std::endl;
13706 tout(cct) << (unsigned long)fh << std::endl;
13707
13708 if (ll_unclosed_fh_set.count(fh))
13709 ll_unclosed_fh_set.erase(fh);
13710 return _release_fh(fh);
13711 }
13712
13713 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13714 {
13715 std::lock_guard lock(client_lock);
13716
13717 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13718 tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;
13719
13720 if (unmounting)
13721 return -ENOTCONN;
13722
13723 return _getlk(fh, fl, owner);
13724 }
13725
13726 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13727 {
13728 std::lock_guard lock(client_lock);
13729
13730 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13731 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13732
13733 if (unmounting)
13734 return -ENOTCONN;
13735
13736 return _setlk(fh, fl, owner, sleep);
13737 }
13738
13739 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13740 {
13741 std::lock_guard lock(client_lock);
13742
13743 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13744 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13745
13746 if (unmounting)
13747 return -ENOTCONN;
13748
13749 return _flock(fh, cmd, owner);
13750 }
13751
13752 int Client::set_deleg_timeout(uint32_t timeout)
13753 {
13754 std::lock_guard lock(client_lock);
13755
13756 /*
13757 * The whole point is to prevent blacklisting, so we must time out the
13758 * delegation before the session autoclose timeout kicks in.
13759 */
13760 if (timeout >= mdsmap->get_session_autoclose())
13761 return -EINVAL;
13762
13763 deleg_timeout = timeout;
13764 return 0;
13765 }
13766
13767 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13768 {
13769 int ret = -EINVAL;
13770
13771 std::lock_guard lock(client_lock);
13772
13773 if (!mounted)
13774 return -ENOTCONN;
13775
13776 Inode *inode = fh->inode.get();
13777
13778 switch(cmd) {
13779 case CEPH_DELEGATION_NONE:
13780 inode->unset_deleg(fh);
13781 ret = 0;
13782 break;
13783 default:
13784 try {
13785 ret = inode->set_deleg(fh, cmd, cb, priv);
13786 } catch (std::bad_alloc&) {
13787 ret = -ENOMEM;
13788 }
13789 break;
13790 }
13791 return ret;
13792 }
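/*
 * Delegation sketch (illustrative only; "my_recall_cb" and "priv" are
 * hypothetical, with my_recall_cb matching ceph_deleg_cb_t). The timeout
 * must be shorter than the MDS session autoclose interval, or
 * set_deleg_timeout() rejects it with -EINVAL:
 *
 *   client->set_deleg_timeout(30);
 *   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, my_recall_cb, priv);
 *   // ... later, before closing:
 *   client->ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr);
 */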
13793
13794 class C_Client_RequestInterrupt : public Context {
13795 private:
13796 Client *client;
13797 MetaRequest *req;
13798 public:
13799 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13800 req->get();
13801 }
13802 void finish(int r) override {
13803 std::lock_guard l(client->client_lock);
13804 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13805 client->_interrupt_filelock(req);
13806 client->put_request(req);
13807 }
13808 };
13809
13810 void Client::ll_interrupt(void *d)
13811 {
13812 MetaRequest *req = static_cast<MetaRequest*>(d);
13813 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13814 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
13815 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13816 }
13817
13818 // =========================================
13819 // layout
13820
13821 // expose file layouts
13822
13823 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13824 const UserPerm& perms)
13825 {
13826 std::lock_guard lock(client_lock);
13827
13828 if (unmounting)
13829 return -ENOTCONN;
13830
13831 filepath path(relpath);
13832 InodeRef in;
13833 int r = path_walk(path, &in, perms);
13834 if (r < 0)
13835 return r;
13836
13837 *lp = in->layout;
13838
13839 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13840 return 0;
13841 }
13842
13843 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13844 {
13845 std::lock_guard lock(client_lock);
13846
13847 if (unmounting)
13848 return -ENOTCONN;
13849
13850 Fh *f = get_filehandle(fd);
13851 if (!f)
13852 return -EBADF;
13853 Inode *in = f->inode.get();
13854
13855 *lp = in->layout;
13856
13857 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13858 return 0;
13859 }
13860
13861 int64_t Client::get_default_pool_id()
13862 {
13863 std::lock_guard lock(client_lock);
13864
13865 if (unmounting)
13866 return -ENOTCONN;
13867
13868 /* first data pool is the default */
13869 return mdsmap->get_first_data_pool();
13870 }
13871
13872 // expose osdmap
13873
13874 int64_t Client::get_pool_id(const char *pool_name)
13875 {
13876 std::lock_guard lock(client_lock);
13877
13878 if (unmounting)
13879 return -ENOTCONN;
13880
13881 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13882 pool_name);
13883 }
13884
13885 string Client::get_pool_name(int64_t pool)
13886 {
13887 std::lock_guard lock(client_lock);
13888
13889 if (unmounting)
13890 return string();
13891
13892 return objecter->with_osdmap([pool](const OSDMap& o) {
13893 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13894 });
13895 }
13896
13897 int Client::get_pool_replication(int64_t pool)
13898 {
13899 std::lock_guard lock(client_lock);
13900
13901 if (unmounting)
13902 return -ENOTCONN;
13903
13904 return objecter->with_osdmap([pool](const OSDMap& o) {
13905 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13906 });
13907 }
13908
13909 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13910 {
13911 std::lock_guard lock(client_lock);
13912
13913 if (unmounting)
13914 return -ENOTCONN;
13915
13916 Fh *f = get_filehandle(fd);
13917 if (!f)
13918 return -EBADF;
13919 Inode *in = f->inode.get();
13920
13921 vector<ObjectExtent> extents;
13922 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13923 ceph_assert(extents.size() == 1);
13924
13925 objecter->with_osdmap([&](const OSDMap& o) {
13926 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13927 o.pg_to_acting_osds(pg, osds);
13928 });
13929
13930 if (osds.empty())
13931 return -EINVAL;
13932
13933 /*
13934 * Return the remainder of the extent (stripe unit)
13935 *
13936 * If length = 1 is passed to Striper::file_to_extents we get a single
13937 * extent back, but its length is one so we still need to compute the length
13938 * to the end of the stripe unit.
13939 *
13940 * If length = su then we may get 1 or 2 objects back in the extents vector
13941 * which would have to be examined. Even then, the offsets are local to the
13942 * object, so matching up to the file offset is extra work.
13943 *
13944 * It seems simpler to stick with length = 1 and manually compute the
13945 * remainder.
13946 */
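/*
 * e.g. (illustrative): with stripe_unit = 4 MiB and off = 5 MiB, the
 * extent containing off ends at 8 MiB, so *len = 4M - (5M % 4M) = 3 MiB.
 */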
13947 if (len) {
13948 uint64_t su = in->layout.stripe_unit;
13949 *len = su - (off % su);
13950 }
13951
13952 return 0;
13953 }
13954
13955 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13956 {
13957 std::lock_guard lock(client_lock);
13958
13959 if (unmounting)
13960 return -ENOTCONN;
13961
13962 if (id < 0)
13963 return -EINVAL;
13964 return objecter->with_osdmap([&](const OSDMap& o) {
13965 return o.crush->get_full_location_ordered(id, path);
13966 });
13967 }
13968
13969 int Client::get_file_stripe_address(int fd, loff_t offset,
13970 vector<entity_addr_t>& address)
13971 {
13972 std::lock_guard lock(client_lock);
13973
13974 if (unmounting)
13975 return -ENOTCONN;
13976
13977 Fh *f = get_filehandle(fd);
13978 if (!f)
13979 return -EBADF;
13980 Inode *in = f->inode.get();
13981
13982 // which object?
13983 vector<ObjectExtent> extents;
13984 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13985 in->truncate_size, extents);
13986 ceph_assert(extents.size() == 1);
13987
13988 // now we have the object and its 'layout'
13989 return objecter->with_osdmap([&](const OSDMap& o) {
13990 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13991 vector<int> osds;
13992 o.pg_to_acting_osds(pg, osds);
13993 if (osds.empty())
13994 return -EINVAL;
13995 for (unsigned i = 0; i < osds.size(); i++) {
13996 entity_addr_t addr = o.get_addrs(osds[i]).front();
13997 address.push_back(addr);
13998 }
13999 return 0;
14000 });
14001 }
14002
14003 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14004 {
14005 std::lock_guard lock(client_lock);
14006
14007 if (unmounting)
14008 return -ENOTCONN;
14009
14010 return objecter->with_osdmap([&](const OSDMap& o) {
14011 if (!o.exists(osd))
14012 return -ENOENT;
14013
14014 addr = o.get_addrs(osd).front();
14015 return 0;
14016 });
14017 }
14018
14019 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14020 loff_t length, loff_t offset)
14021 {
14022 std::lock_guard lock(client_lock);
14023
14024 if (unmounting)
14025 return -ENOTCONN;
14026
14027 Fh *f = get_filehandle(fd);
14028 if (!f)
14029 return -EBADF;
14030 Inode *in = f->inode.get();
14031
14032 // map to a list of extents
14033 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14034
14035 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14036 return 0;
14037 }
14038
14039
14040 /* find an osd with the same ip. -ENXIO if none. */
14041 int Client::get_local_osd()
14042 {
14043 std::lock_guard lock(client_lock);
14044
14045 if (unmounting)
14046 return -ENOTCONN;
14047
14048 objecter->with_osdmap([this](const OSDMap& o) {
14049 if (o.get_epoch() != local_osd_epoch) {
14050 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
14051 local_osd_epoch = o.get_epoch();
14052 }
14053 });
14054 return local_osd;
14055 }
14056
14057
14058
14059
14060
14061
14062 // ===============================
14063
14064 void Client::ms_handle_connect(Connection *con)
14065 {
14066 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14067 }
14068
14069 bool Client::ms_handle_reset(Connection *con)
14070 {
14071 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14072 return false;
14073 }
14074
14075 void Client::ms_handle_remote_reset(Connection *con)
14076 {
14077 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14078 std::lock_guard l(client_lock);
14079 switch (con->get_peer_type()) {
14080 case CEPH_ENTITY_TYPE_MDS:
14081 {
14082 // kludge to figure out which mds this is; fixme with a Connection* state
14083 mds_rank_t mds = MDS_RANK_NONE;
14084 MetaSession *s = NULL;
14085 for (auto &p : mds_sessions) {
14086 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14087 mds = p.first;
14088 s = &p.second;
14089 }
14090 }
14091 if (mds >= 0) {
14092 ceph_assert(s != NULL);
14093 switch (s->state) {
14094 case MetaSession::STATE_CLOSING:
14095 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14096 _closed_mds_session(s);
14097 break;
14098
14099 case MetaSession::STATE_OPENING:
14100 {
14101 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14102 list<Context*> waiters;
14103 waiters.swap(s->waiting_for_open);
14104 _closed_mds_session(s);
14105 MetaSession *news = _get_or_open_mds_session(mds);
14106 news->waiting_for_open.swap(waiters);
14107 }
14108 break;
14109
14110 case MetaSession::STATE_OPEN:
14111 {
14112 objecter->maybe_request_map(); /* to check if we are blacklisted */
14113 const auto& conf = cct->_conf;
14114 if (conf->client_reconnect_stale) {
14115 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14116 _closed_mds_session(s);
14117 } else {
14118 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14119 s->state = MetaSession::STATE_STALE;
14120 }
14121 }
14122 break;
14123
14124 case MetaSession::STATE_NEW:
14125 case MetaSession::STATE_CLOSED:
14126 default:
14127 break;
14128 }
14129 }
14130 }
14131 break;
14132 }
14133 }
14134
14135 bool Client::ms_handle_refused(Connection *con)
14136 {
14137 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14138 return false;
14139 }
14140
14141 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14142 {
14143 Inode *quota_in = root_ancestor;
14144 SnapRealm *realm = in->snaprealm;
14145 while (realm) {
14146 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14147 if (realm->ino != in->ino) {
14148 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14149 if (p == inode_map.end())
14150 break;
14151
14152 if (p->second->quota.is_enable()) {
14153 quota_in = p->second;
14154 break;
14155 }
14156 }
14157 realm = realm->pparent;
14158 }
14159 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14160 return quota_in;
14161 }
14162
14163 /**
14164 * Traverse the quota ancestors of the Inode; return true
14165 * if any of them satisfies the given predicate
14166 */
14167 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14168 std::function<bool (const Inode &in)> test)
14169 {
14170 while (true) {
14171 ceph_assert(in != NULL);
14172 if (test(*in)) {
14173 return true;
14174 }
14175
14176 if (in == root_ancestor) {
14177 // We're done traversing, drop out
14178 return false;
14179 } else {
14180 // Continue up the tree
14181 in = get_quota_root(in, perms);
14182 }
14183 }
14184
14185 return false;
14186 }
14187
14188 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14189 {
14190 return check_quota_condition(in, perms,
14191 [](const Inode &in) {
14192 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14193 });
14194 }
14195
14196 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14197 const UserPerm& perms)
14198 {
14199 return check_quota_condition(in, perms,
14200 [&new_bytes](const Inode &in) {
14201 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14202 > in.quota.max_bytes;
14203 });
14204 }
14205
14206 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14207 {
14208 ceph_assert(in->size >= in->reported_size);
14209 const uint64_t size = in->size - in->reported_size;
14210 return check_quota_condition(in, perms,
14211 [&size](const Inode &in) {
14212 if (in.quota.max_bytes) {
14213 if (in.rstat.rbytes >= in.quota.max_bytes) {
14214 return true;
14215 }
14216
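// Heuristic (e.g.): report "approaching" once the unreported growth
// exceeds 1/16 of the remaining headroom; with 16 MiB left under
// quota, that means anything over 1 MiB of new data.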
14217 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14218 return (space >> 4) < size;
14219 } else {
14220 return false;
14221 }
14222 });
14223 }
14224
14225 enum {
14226 POOL_CHECKED = 1,
14227 POOL_CHECKING = 2,
14228 POOL_READ = 4,
14229 POOL_WRITE = 8,
14230 };
14231
14232 int Client::check_pool_perm(Inode *in, int need)
14233 {
14234 if (!cct->_conf->client_check_pool_perm)
14235 return 0;
14236
14237 int64_t pool_id = in->layout.pool_id;
14238 std::string pool_ns = in->layout.pool_ns;
14239 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14240 int have = 0;
14241 while (true) {
14242 auto it = pool_perms.find(perm_key);
14243 if (it == pool_perms.end())
14244 break;
14245 if (it->second == POOL_CHECKING) {
14246 // avoid concurrent checks
14247 wait_on_list(waiting_for_pool_perm);
14248 } else {
14249 have = it->second;
14250 ceph_assert(have & POOL_CHECKED);
14251 break;
14252 }
14253 }
14254
14255 if (!have) {
14256 if (in->snapid != CEPH_NOSNAP) {
14257 // The pool permission check needs to write to the first object. But for a
14258 // snapshot, the head of the first object may have already been deleted. To
14259 // avoid creating an orphan object, skip the check for now.
14260 return 0;
14261 }
14262
14263 pool_perms[perm_key] = POOL_CHECKING;
14264
14265 char oid_buf[32];
14266 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14267 object_t oid = oid_buf;
14268
14269 SnapContext nullsnapc;
14270
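// Probe read permission with a stat and write permission with an
// exclusive create; -ENOENT / -EEXIST replies still prove the op
// was permitted by the OSD.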
14271 C_SaferCond rd_cond;
14272 ObjectOperation rd_op;
14273 rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
14274
14275 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14276 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14277
14278 C_SaferCond wr_cond;
14279 ObjectOperation wr_op;
14280 wr_op.create(true);
14281
14282 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14283 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14284
14285 client_lock.unlock();
14286 int rd_ret = rd_cond.wait();
14287 int wr_ret = wr_cond.wait();
14288 client_lock.lock();
14289
14290 bool errored = false;
14291
14292 if (rd_ret == 0 || rd_ret == -ENOENT)
14293 have |= POOL_READ;
14294 else if (rd_ret != -EPERM) {
14295 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14296 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14297 errored = true;
14298 }
14299
14300 if (wr_ret == 0 || wr_ret == -EEXIST)
14301 have |= POOL_WRITE;
14302 else if (wr_ret != -EPERM) {
14303 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14304 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14305 errored = true;
14306 }
14307
14308 if (errored) {
14309 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14310 // Raise EIO because actual error code might be misleading for
14311 // userspace filesystem user.
14312 pool_perms.erase(perm_key);
14313 signal_cond_list(waiting_for_pool_perm);
14314 return -EIO;
14315 }
14316
14317 pool_perms[perm_key] = have | POOL_CHECKED;
14318 signal_cond_list(waiting_for_pool_perm);
14319 }
14320
14321 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
14322 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14323 << " need " << ccap_string(need) << ", but no read perm" << dendl;
14324 return -EPERM;
14325 }
14326 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
14327 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14328 << " need " << ccap_string(need) << ", but no write perm" << dendl;
14329 return -EPERM;
14330 }
14331
14332 return 0;
14333 }
14334
14335 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14336 {
14337 if (acl_type == POSIX_ACL) {
14338 if (in->xattrs.count(ACL_EA_ACCESS)) {
14339 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14340
14341 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14342 }
14343 }
14344 return -EAGAIN;
14345 }
14346
14347 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14348 {
14349 if (acl_type == NO_ACL)
14350 return 0;
14351
14352 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14353 if (r < 0)
14354 goto out;
14355
14356 if (acl_type == POSIX_ACL) {
14357 if (in->xattrs.count(ACL_EA_ACCESS)) {
14358 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14359 bufferptr acl(access_acl.c_str(), access_acl.length());
14360 r = posix_acl_access_chmod(acl, mode);
14361 if (r < 0)
14362 goto out;
14363 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14364 } else {
14365 r = 0;
14366 }
14367 }
14368 out:
14369 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14370 return r;
14371 }
14372
14373 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
14374 const UserPerm& perms)
14375 {
14376 if (acl_type == NO_ACL)
14377 return 0;
14378
14379 if (S_ISLNK(*mode))
14380 return 0;
14381
14382 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
14383 if (r < 0)
14384 goto out;
14385
14386 if (acl_type == POSIX_ACL) {
14387 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
14388 map<string, bufferptr> xattrs;
14389
14390 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
14391 bufferptr acl(default_acl.c_str(), default_acl.length());
14392 r = posix_acl_inherit_mode(acl, mode);
14393 if (r < 0)
14394 goto out;
14395
14396 if (r > 0) {
14397 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
14398 if (r < 0)
14399 goto out;
14400 if (r > 0)
14401 xattrs[ACL_EA_ACCESS] = acl;
14402 }
14403
14404 if (S_ISDIR(*mode))
14405 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
14406
14407 r = xattrs.size();
14408 if (r > 0)
14409 encode(xattrs, xattrs_bl);
14410 } else {
14411 if (umask_cb)
14412 *mode &= ~umask_cb(callback_handle);
14413 r = 0;
14414 }
14415 }
14416 out:
14417 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
14418 return r;
14419 }
14420
14421 void Client::set_filer_flags(int flags)
14422 {
14423 std::lock_guard l(client_lock);
14424 ceph_assert(flags == 0 ||
14425 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14426 objecter->add_global_op_flags(flags);
14427 }
14428
14429 void Client::clear_filer_flags(int flags)
14430 {
14431 std::lock_guard l(client_lock);
14432 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14433 objecter->clear_global_op_flag(flags);
14434 }
14435
14436 // called before mount
14437 void Client::set_uuid(const std::string& uuid)
14438 {
14439 std::lock_guard l(client_lock);
14440 assert(initialized);
14441 assert(!uuid.empty());
14442
14443 metadata["uuid"] = uuid;
14444 _close_sessions();
14445 }
14446
14447 // called before mount. 0 means infinite
14448 void Client::set_session_timeout(unsigned timeout)
14449 {
14450 std::lock_guard l(client_lock);
14451 assert(initialized);
14452
14453 metadata["timeout"] = stringify(timeout);
14454 }
14455
14456 // called before mount
14457 int Client::start_reclaim(const std::string& uuid, unsigned flags,
14458 const std::string& fs_name)
14459 {
14460 std::lock_guard l(client_lock);
14461 if (!initialized)
14462 return -ENOTCONN;
14463
14464 if (uuid.empty())
14465 return -EINVAL;
14466
14467 {
14468 auto it = metadata.find("uuid");
14469 if (it != metadata.end() && it->second == uuid)
14470 return -EINVAL;
14471 }
14472
14473 int r = subscribe_mdsmap(fs_name);
14474 if (r < 0) {
14475 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
14476 return r;
14477 }
14478
14479 if (metadata.empty())
14480 populate_metadata("");
14481
14482 while (mdsmap->get_epoch() == 0)
14483 wait_on_list(waiting_for_mdsmap);
14484
14485 reclaim_errno = 0;
14486 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
14487 if (!mdsmap->is_up(mds)) {
14488 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
14489 wait_on_list(waiting_for_mdsmap);
14490 continue;
14491 }
14492
14493 MetaSession *session;
14494 if (!have_open_session(mds)) {
14495 session = _get_or_open_mds_session(mds);
14496 if (session->state != MetaSession::STATE_OPENING) {
14497 // umounting?
14498 return -EINVAL;
14499 }
14500 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
14501 wait_on_context_list(session->waiting_for_open);
14502 if (rejected_by_mds.count(mds))
14503 return -EPERM;
14504 continue;
14505 }
14506
14507 session = &mds_sessions.at(mds);
14508 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
14509 return -EOPNOTSUPP;
14510
14511 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
14512 session->reclaim_state == MetaSession::RECLAIMING) {
14513 session->reclaim_state = MetaSession::RECLAIMING;
14514 auto m = make_message<MClientReclaim>(uuid, flags);
14515 session->con->send_message2(std::move(m));
14516 wait_on_list(waiting_for_reclaim);
14517 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
14518 return reclaim_errno ? : -ENOTRECOVERABLE;
14519 } else {
14520 mds++;
14521 }
14522 }
14523
14524 // didn't find target session in any mds
14525 if (reclaim_target_addrs.empty()) {
14526 if (flags & CEPH_RECLAIM_RESET)
14527 return -ENOENT;
14528 return -ENOTRECOVERABLE;
14529 }
14530
14531 if (flags & CEPH_RECLAIM_RESET)
14532 return 0;
14533
14534 // use blacklist to check if target session was killed
14535 // (config option mds_session_blacklist_on_evict needs to be true)
14536 C_SaferCond cond;
14537 if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
14538 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
14539 client_lock.unlock();
14540 cond.wait();
14541 client_lock.lock();
14542 }
14543
14544 bool blacklisted = objecter->with_osdmap(
14545 [this](const OSDMap &osd_map) -> bool {
14546 return osd_map.is_blacklisted(reclaim_target_addrs);
14547 });
14548 if (blacklisted)
14549 return -ENOTRECOVERABLE;
14550
14551 metadata["reclaiming_uuid"] = uuid;
14552 return 0;
14553 }
14554
14555 void Client::finish_reclaim()
14556 {
14557 auto it = metadata.find("reclaiming_uuid");
14558 if (it == metadata.end()) {
14559 for (auto &p : mds_sessions)
14560 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14561 return;
14562 }
14563
14564 for (auto &p : mds_sessions) {
14565 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14566 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
14567 p.second.con->send_message2(std::move(m));
14568 }
14569
14570 metadata["uuid"] = it->second;
14571 metadata.erase(it);
14572 }
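/*
 * Takeover sketch (illustrative only; the ordering follows the "called
 * before mount" comments above, and error handling is elided). A
 * replacement instance reclaims the MDS session state of a dead client
 * that had registered "old-uuid":
 *
 *   client->set_uuid("new-uuid");               // identity of this instance
 *   int r = client->start_reclaim("old-uuid", CEPH_RECLAIM_RESET, fs_name);
 *   if (r == 0)
 *     client->finish_reclaim();
 *   client->mount("/", perms, true);
 */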
14573
14574 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14575 {
14576 mds_rank_t from = mds_rank_t(reply->get_source().num());
14577 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14578
14579 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14580 if (!session) {
14581 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14582 return;
14583 }
14584
14585 if (reply->get_result() >= 0) {
14586 session->reclaim_state = MetaSession::RECLAIM_OK;
14587 if (reply->get_epoch() > reclaim_osd_epoch)
14588 reclaim_osd_epoch = reply->get_epoch();
14589 if (!reply->get_addrs().empty())
14590 reclaim_target_addrs = reply->get_addrs();
14591 } else {
14592 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14593 reclaim_errno = reply->get_result();
14594 }
14595
14596 signal_cond_list(waiting_for_reclaim);
14597 }
14598
14599 /**
14600 * This is included in cap release messages, to cause
14601 * the MDS to wait until this OSD map epoch. It is necessary
14602 * in corner cases where we cancel RADOS ops, so that
14603 * nobody else tries to do IO to the same objects in
14604 * the same epoch as the cancelled ops.
14605 */
14606 void Client::set_cap_epoch_barrier(epoch_t e)
14607 {
14608 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14609 cap_epoch_barrier = e;
14610 }
14611
14612 const char** Client::get_tracked_conf_keys() const
14613 {
14614 static const char* keys[] = {
14615 "client_cache_size",
14616 "client_cache_mid",
14617 "client_acl_type",
14618 "client_deleg_timeout",
14619 "client_deleg_break_on_open",
14620 NULL
14621 };
14622 return keys;
14623 }
14624
14625 void Client::handle_conf_change(const ConfigProxy& conf,
14626 const std::set <std::string> &changed)
14627 {
14628 std::lock_guard lock(client_lock);
14629
14630 if (changed.count("client_cache_mid")) {
14631 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14632 }
14633 if (changed.count("client_acl_type")) {
14634 acl_type = NO_ACL;
14635 if (cct->_conf->client_acl_type == "posix_acl")
14636 acl_type = POSIX_ACL;
14637 }
14638 }
14639
14640 void intrusive_ptr_add_ref(Inode *in)
14641 {
14642 in->get();
14643 }
14644
14645 void intrusive_ptr_release(Inode *in)
14646 {
14647 in->client->put_inode(in);
14648 }
14649
14650 mds_rank_t Client::_get_random_up_mds() const
14651 {
14652 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14653
14654 std::set<mds_rank_t> up;
14655 mdsmap->get_up_mds_set(up);
14656
14657 if (up.empty())
14658 return MDS_RANK_NONE;
14659 std::set<mds_rank_t>::const_iterator p = up.begin();
14660 for (int n = rand() % up.size(); n; n--)
14661 ++p;
14662 return *p;
14663 }
14664
14665
14666 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
14667 : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
14668 {
14669 monclient->set_messenger(m);
14670 objecter->set_client_incarnation(0);
14671 }
14672
14673 StandaloneClient::~StandaloneClient()
14674 {
14675 delete objecter;
14676 objecter = nullptr;
14677 }
14678
14679 int StandaloneClient::init()
14680 {
14681 _pre_init();
14682 objecter->init();
14683
14684 client_lock.lock();
14685 ceph_assert(!is_initialized());
14686
14687 messenger->add_dispatcher_tail(objecter);
14688 messenger->add_dispatcher_tail(this);
14689
14690 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
14691 int r = monclient->init();
14692 if (r < 0) {
14693 // need to do cleanup because we're in an intermediate init state
14694 timer.shutdown();
14695 client_lock.unlock();
14696 objecter->shutdown();
14697 objectcacher->stop();
14698 monclient->shutdown();
14699 return r;
14700 }
14701 objecter->start();
14702
14703 client_lock.unlock();
14704 _finish_init();
14705
14706 return 0;
14707 }
14708
14709 void StandaloneClient::shutdown()
14710 {
14711 Client::shutdown();
14712 objecter->shutdown();
14713 monclient->shutdown();
14714 }